514 files changed, 20306 insertions, 6614 deletions
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c
index 7008d53e455c..4fa2fdda174d 100644
--- a/net/6lowpan/nhc.c
+++ b/net/6lowpan/nhc.c
@@ -27,8 +27,8 @@ static int lowpan_nhc_insert(struct lowpan_nhc *nhc)
 
 	/* Figure out where to put new node */
 	while (*new) {
-		struct lowpan_nhc *this = container_of(*new, struct lowpan_nhc,
-						       node);
+		struct lowpan_nhc *this = rb_entry(*new, struct lowpan_nhc,
+						   node);
 		int result, len_dif, len;
 
 		len_dif = nhc->idlen - this->idlen;
@@ -69,8 +69,8 @@ static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb)
 	const u8 *nhcid_skb_ptr = skb->data;
 
 	while (node) {
-		struct lowpan_nhc *nhc = container_of(node, struct lowpan_nhc,
-						      node);
+		struct lowpan_nhc *nhc = rb_entry(node, struct lowpan_nhc,
+						  node);
 		u8 nhcid_skb_ptr_masked[LOWPAN_NHC_MAX_ID_LEN];
 		int result, i;
 
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 10da6c588bf8..e97ab824e368 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -671,7 +671,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev,
 	return 0;
 }
 
-static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+static void vlan_dev_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct vlan_pcpu_stats *p;
 	u32 rx_errors = 0, tx_dropped = 0;
@@ -702,8 +703,6 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st
 	}
 	stats->rx_errors  = rx_errors;
 	stats->tx_dropped = tx_dropped;
-
-	return stats;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -792,8 +791,6 @@ static const struct net_device_ops vlan_netdev_ops = {
 	.ndo_netpoll_cleanup	= vlan_dev_netpoll_cleanup,
 #endif
 	.ndo_fix_features	= vlan_dev_fix_features,
-	.ndo_neigh_construct	= netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	= netdev_default_l2upper_neigh_destroy,
 	.ndo_fdb_add		= switchdev_port_fdb_add,
 	.ndo_fdb_del		= switchdev_port_fdb_del,
 	.ndo_fdb_dump		= switchdev_port_fdb_dump,
diff --git a/net/Kconfig b/net/Kconfig
index a29bb4b41c50..102f781a0131 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -57,6 +57,7 @@ source "net/packet/Kconfig"
 source "net/unix/Kconfig"
 source "net/xfrm/Kconfig"
 source "net/iucv/Kconfig"
+source "net/smc/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
@@ -296,7 +297,8 @@ config BPF_JIT
 
 	  Note, admin should enable this feature changing:
 	  /proc/sys/net/core/bpf_jit_enable
-	  /proc/sys/net/core/bpf_jit_harden (optional)
+	  /proc/sys/net/core/bpf_jit_harden   (optional)
+	  /proc/sys/net/core/bpf_jit_kallsyms (optional)
 
 config NET_FLOW_LIMIT
 	bool
@@ -389,6 +391,8 @@ source "net/9p/Kconfig"
 source "net/caif/Kconfig"
 source "net/ceph/Kconfig"
 source "net/nfc/Kconfig"
+source "net/psample/Kconfig"
+source "net/ife/Kconfig"
 
 config LWTUNNEL
 	bool "Network light weight tunnels"
@@ -410,6 +414,10 @@ config DST_CACHE
 	bool
 	default n
 
+config GRO_CELLS
+	bool
+	default n
+
 config NET_DEVLINK
 	tristate "Network physical/parent device Netlink interface"
 	help
diff --git a/net/Makefile b/net/Makefile
index 4cafaa2b4667..9b681550e3a3 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_MAC80211)		+= mac80211/
 obj-$(CONFIG_TIPC)		+= tipc/
 obj-$(CONFIG_NETLABEL)		+= netlabel/
 obj-$(CONFIG_IUCV)		+= iucv/
+obj-$(CONFIG_SMC)		+= smc/
 obj-$(CONFIG_RFKILL)		+= rfkill/
 obj-$(CONFIG_NET_9P)		+= 9p/
 obj-$(CONFIG_CAIF)		+= caif/
@@ -69,6 +70,8 @@ obj-$(CONFIG_DNS_RESOLVER)	+= dns_resolver/
 obj-$(CONFIG_CEPH_LIB)		+= ceph/
 obj-$(CONFIG_BATMAN_ADV)	+= batman-adv/
 obj-$(CONFIG_NFC)		+= nfc/
+obj-$(CONFIG_PSAMPLE)		+= psample/
+obj-$(CONFIG_NET_IFE)		+= ife/
 obj-$(CONFIG_OPENVSWITCH)	+= openvswitch/
 obj-$(CONFIG_VSOCKETS)	+= vmw_vsock/
 obj-$(CONFIG_MPLS)		+= mpls/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 10d2bdce686e..465cc24b41e5 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1656,7 +1656,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	ddp->deh_dport = usat->sat_port;
 	ddp->deh_sport = at->src_port;
 
-	SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len);
+	SOCK_DEBUG(sk, "SK %p: Copy user data (%zd bytes).\n", sk, len);
 
 	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 	if (err) {
@@ -1720,7 +1720,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 		 */
 		aarp_send_ddp(dev, skb, &usat->sat_addr, NULL);
 	}
-	SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len);
+	SOCK_DEBUG(sk, "SK %p: Done write (%zd).\n", sk, len);
 
 out:
 	release_sock(sk);
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 3b3b1a292ec8..a190800572bd 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -451,7 +451,7 @@ static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr,
 			return;
 	}
 	if (end_of_tlvs - tlvs != 0)
-		pr_info("(%s) ignoring %Zd bytes of trailing TLV garbage\n",
+		pr_info("(%s) ignoring %zd bytes of trailing TLV garbage\n",
 			dev->name, end_of_tlvs - tlvs);
 }
 
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index f724d3c98a81..915987bc6d29 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
 #
 # Marek Lindner, Simon Wunderlich
 #
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 623d04302aa2..44fd073b7546 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 3b5b69cdd12b..29f6312f9bf1 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
  *
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index f00f666e2ccd..7c3d994e90d8 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index b9f3550faaf7..ae2ab526bdb1 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 2ac612d7bab4..0acd081dd286 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index 83b77639729e..dd7c4b647e6b 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
  *
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index f2fb2f05b6bf..b90c9903e246 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index be17c0b1369e..376ead280ab9 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 38b9aab83fc0..03a35c9f456d 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index 4c4d45caa422..2068770b542d 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 032271421a20..2b070c7e31da 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 0e6e9d09078c..cc262c9d97e0 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index e7f690b571ea..ba8420d8a992 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
  *
@@ -449,7 +449,6 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
 	batadv_inc_counter(bat_priv, BATADV_CNT_RX);
 	batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
 			   skb->len + ETH_HLEN);
-	soft_iface->last_rx = jiffies;
 
 	netif_rx(skb);
 out:
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 1ae93e46fb98..e157986bd01c 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
  *
@@ -20,6 +20,8 @@
 
 #include "main.h"
 
+#include <linux/compiler.h>
+#include <linux/stddef.h>
 #include <linux/types.h>
 
 struct net_device;
@@ -27,6 +29,22 @@ struct netlink_callback;
 struct seq_file;
 struct sk_buff;
 
+/**
+ * batadv_bla_is_loopdetect_mac - check if the mac address is from a loop detect
+ *  frame sent by bridge loop avoidance
+ * @mac: mac address to check
+ *
+ * Return: true if the it looks like a loop detect frame
+ * (mac starts with BA:BE), false otherwise
+ */
+static inline bool batadv_bla_is_loopdetect_mac(const uint8_t *mac)
+{
+	if (mac[0] == 0xba && mac[1] == 0xbe)
+		return true;
+
+	return false;
+}
+
 #ifdef CONFIG_BATMAN_ADV_BLA
 bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
 		   unsigned short vid, bool is_bcast);
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 77925504379d..e32ad47c6efd 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
@@ -19,7 +19,7 @@
 #include "main.h"
 
 #include <linux/debugfs.h>
-#include <linux/device.h>
+#include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/fs.h>
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index e49121ee55f6..9c5d4a65b98c 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 49576c5a3fe3..1bfd1dbc2feb 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
@@ -1050,7 +1050,6 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
 						   bat_priv->soft_iface);
 		bat_priv->stats.rx_packets++;
 		bat_priv->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size;
-		bat_priv->soft_iface->last_rx = jiffies;
 
 		netif_rx(skb_new);
 		batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 813ecea96cf9..ec364a3c1c66 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 0854ebd8613e..ead18ca836de 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
  *
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index b95f619606af..1a2d6c308745 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
  *
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 52b8bd6ec431..de9955d5224d 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 859166d03561..3baa3d466e5e 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 21184810d89f..5db2e43e3775 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 8a5e1ddf1175..0a6a97d201f2 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 61a431a9772b..e348f76ea8c1 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index d6309a423629..9f9890ff7a22 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index a0a0fdb85805..b5f7e13918ac 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 557a7044cfbc..0c905e91c5e2 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index b310f381ae02..6308c9f0fd96 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index e44a7da51431..f3fec40aae86 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index c73c31769aba..4ef4bde2cc2d 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 3284a7b0325d..7a2b9f4da078 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index d46415edd3be..5000c540614d 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index a6cc8040a21d..57a8103dbce7 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
@@ -24,7 +24,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2016.5"
+#define BATADV_SOURCE_VERSION "2017.0"
 #endif
 
 /* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 090a69fc342e..952ba81a565b 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
  *
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 2cddaf52a21d..2a78cddab0e9 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
  *
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 062738163bdc..ab13b4d58733 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
  *
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index 52eb16281aba..f1cd8c5da966 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
  *
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index ab5a3bf0765f..e1f6fc72fe3e 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
  *
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index d6d7fb4ec5d5..c66efb81d2f4 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
  *
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 8f3b2969cc4e..8e2a4b205257 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index ebc56183f358..d94220a6d21a 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 7a36bcfa0ba0..8e8a5db197cb 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 6713bdf414cd..7fd740b6e36d 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
@@ -719,20 +719,19 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
 
 	len = skb->len;
 	res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
-	if (res == NET_XMIT_SUCCESS)
-		ret = NET_RX_SUCCESS;
-
-	/* skb was consumed */
-	skb = NULL;
 
 	/* translate transmit result into receive result */
 	if (res == NET_XMIT_SUCCESS) {
+		ret = NET_RX_SUCCESS;
 		/* skb was transmitted and consumed */
 		batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
 		batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
 				   len + ETH_HLEN);
 	}
 
+	/* skb was consumed */
+	skb = NULL;
+
 put_orig_node:
 	batadv_orig_node_put(orig_node);
 free_skb:
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index 05c3ff42e181..5ede16c32f15 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 49021b7124f3..1489ec27daff 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
@@ -77,6 +77,7 @@ int batadv_send_skb_packet(struct sk_buff *skb,
 {
 	struct batadv_priv *bat_priv;
 	struct ethhdr *ethhdr;
+	int ret;
 
 	bat_priv = netdev_priv(hard_iface->soft_iface);
 
@@ -115,7 +116,8 @@ int batadv_send_skb_packet(struct sk_buff *skb,
 	 * congestion and traffic shaping, it drops and returns NET_XMIT_DROP
 	 * (which is > 0). This will not be treated as an error.
 	 */
-	return dev_queue_xmit(skb);
+	ret = dev_queue_xmit(skb);
+	return net_xmit_eval(ret);
 send_skb_err:
 	kfree_skb(skb);
 	return NET_XMIT_DROP;
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index a94e1e8639ca..f21166d10323 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 7b3494ae6ad9..5d099b2e6cfc 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
@@ -258,7 +258,8 @@ static int batadv_interface_tx(struct sk_buff *skb,
 	ethhdr = eth_hdr(skb);
 
 	/* Register the client MAC in the transtable */
-	if (!is_multicast_ether_addr(ethhdr->h_source)) {
+	if (!is_multicast_ether_addr(ethhdr->h_source) &&
+	    !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) {
 		client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source,
 						   vid, skb->skb_iif,
 						   skb->mark);
@@ -481,8 +482,6 @@ void batadv_interface_rx(struct net_device *soft_iface,
 	batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
 			   skb->len + ETH_HLEN);
 
-	soft_iface->last_rx = jiffies;
-
 	/* Let the bridge loop avoidance check the packet. If will
 	 * not handle it, we can safely push it up.
 	 */
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index ec303ddbf647..639c3abb214a 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 17c844196eb2..0ae8b30e4eaa 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index c76021b4e198..e487412e256b 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 981e8c5b07e9..c94ebdecdc3d 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
  *
@@ -23,7 +23,7 @@
 #include <linux/byteorder/generic.h>
 #include <linux/cache.h>
 #include <linux/compiler.h>
-#include <linux/device.h>
+#include <linux/err.h>
 #include <linux/etherdevice.h>
 #include <linux/fs.h>
 #include <linux/if_ether.h>
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index ba922c425e56..a8ada5c123bd 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
  *
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 30ecbfb40adf..6077a87d46f0 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
  *
@@ -3714,7 +3714,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
 {
 	struct batadv_hashtable *hash = bat_priv->tt.local_hash;
 	struct batadv_tt_common_entry *tt_common_entry;
-	u16 changed_num = 0;
 	struct hlist_head *head;
 	u32 i;
 
@@ -3736,7 +3735,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
 					continue;
 				tt_common_entry->flags &= ~flags;
 			}
-			changed_num++;
 
 			if (!count)
 				continue;
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 783fdba84db2..411d586191da 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
  *
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index a783420356ae..1d9e267caec9 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index e4369b547b43..4d01400ada30 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index e913aee28c98..8f64a5c01345 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 1904a93f47d5..d491529332f4 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -920,7 +920,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
 			BT_DBG("dev %p removing %speer %p", dev,
 			       last ? "last " : "1 ", peer);
 			BT_DBG("chan %p orig refcnt %d", chan,
-			       atomic_read(&chan->kref.refcount));
+			       kref_read(&chan->kref));
 
 			l2cap_chan_put(chan);
 			break;
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 5f123c3320a7..f0095fd79818 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -810,7 +810,7 @@ static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked)
 /* AMP Manager functions */
 struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr)
 {
-	BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+	BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
 
 	kref_get(&mgr->kref);
 
@@ -833,7 +833,7 @@ static void amp_mgr_destroy(struct kref *kref)
 
 int amp_mgr_put(struct amp_mgr *mgr)
 {
-	BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+	BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
 
 	return kref_put(&mgr->kref, &amp_mgr_destroy);
 }
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 1aff2da9bc74..cfb2faba46de 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -245,7 +245,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	if (err == 0) {
 		sock_recv_ts_and_drops(msg, sk, skb);
 
-		if (bt_sk(sk)->skb_msg_name)
+		if (msg->msg_name && bt_sk(sk)->skb_msg_name)
 			bt_sk(sk)->skb_msg_name(skb, msg->msg_name,
 						&msg->msg_namelen);
 	}
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
index e32f34189007..02a4ccc04e1e 100644
--- a/net/bluetooth/amp.c
+++ b/net/bluetooth/amp.c
@@ -24,7 +24,7 @@
 void amp_ctrl_get(struct amp_ctrl *ctrl)
 {
 	BT_DBG("ctrl %p orig refcnt %d", ctrl,
-	       atomic_read(&ctrl->kref.refcount));
+	       kref_read(&ctrl->kref));
 
 	kref_get(&ctrl->kref);
 }
@@ -42,7 +42,7 @@ static void amp_ctrl_destroy(struct kref *kref)
 int amp_ctrl_put(struct amp_ctrl *ctrl)
 {
 	BT_DBG("ctrl %p orig refcnt %d", ctrl,
-	       atomic_read(&ctrl->kref.refcount));
+	       kref_read(&ctrl->kref));
 
 	return kref_put(&ctrl->kref, &amp_ctrl_destroy);
 }
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index e17aacbc5630..0b4dba08a14e 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4749,7 +4749,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 	case LE_ADV_SCAN_RSP:
 		break;
 	default:
-		BT_ERR_RATELIMITED("Unknown advetising packet type: 0x%02x",
+		BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
 				   type);
 		return;
 	}
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 48f9471e7c85..f64d6566021f 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -851,7 +851,7 @@ static int hci_sock_release(struct socket *sock)
 
 	if (hdev) {
 		if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
-			/* When releasing an user channel exclusive access,
+			/* When releasing a user channel exclusive access,
 			 * call hci_dev_do_close directly instead of calling
 			 * hci_dev_close to ensure the exclusive access will
 			 * be released and the controller brought back down.
@@ -1172,7 +1172,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
 				/* In case the transport is already up and
 				 * running, clear the error here.
 				 *
-				 * This can happen when opening an user
+				 * This can happen when opening a user
 				 * channel and HCI_AUTO_OFF grace period
 				 * is still active.
 				 */
@@ -1190,7 +1190,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
 		if (!hci_sock_gen_cookie(sk)) {
 			/* In the case when a cookie has already been assigned,
 			 * this socket will transition from a raw socket into
-			 * an user channel socket. For a clean transition, send
+			 * a user channel socket. For a clean transition, send
 			 * the close notification first.
 			 */
 			skb = create_monitor_ctrl_close(sk);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index ce0b5dd01953..fc7f321a3823 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -481,14 +481,14 @@ static void l2cap_chan_destroy(struct kref *kref)
 
 void l2cap_chan_hold(struct l2cap_chan *c)
 {
-	BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+	BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
 
 	kref_get(&c->kref);
 }
 
 void l2cap_chan_put(struct l2cap_chan *c)
 {
-	BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+	BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
 
 	kref_put(&c->kref, l2cap_chan_destroy);
 }
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index 0aefc011b668..40b1ede527ca 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_BRIDGE) += bridge.o
 
 bridge-y	:= br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
 			br_ioctl.o br_stp.o br_stp_bpdu.o \
-			br_stp_if.o br_stp_timer.o br_netlink.o
+			br_stp_if.o br_stp_timer.o br_netlink.o \
+			br_netlink_tunnel.o
 
 bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
 
@@ -18,7 +19,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
 
 bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
 
-bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o
+bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o
 
 bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
 
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index ed3b3192fb00..ea71513fca21 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -79,7 +79,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 			br_multicast_flood(mdst, skb, false, true);
 		else
 			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
-	} else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) {
+	} else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
 		br_forward(dst->dst, skb, false, true);
 	} else {
 		br_flood(br, skb, BR_PKT_UNICAST, false, true);
@@ -153,8 +153,8 @@ static int br_dev_stop(struct net_device *dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
-						struct rtnl_link_stats64 *stats)
+static void br_get_stats64(struct net_device *dev,
+			   struct rtnl_link_stats64 *stats)
 {
 	struct net_bridge *br = netdev_priv(dev);
 	struct pcpu_sw_netstats tmp, sum = { 0 };
@@ -178,8 +178,6 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
 	stats->tx_packets = sum.tx_packets;
 	stats->rx_bytes   = sum.rx_bytes;
 	stats->rx_packets = sum.rx_packets;
-
-	return stats;
 }
 
 static int br_change_mtu(struct net_device *dev, int new_mtu)
@@ -349,8 +347,6 @@ static const struct net_device_ops br_netdev_ops = {
 	.ndo_add_slave		 = br_add_slave,
 	.ndo_del_slave		 = br_del_slave,
 	.ndo_fix_features        = br_fix_features,
-	.ndo_neigh_construct	 = netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	 = netdev_default_l2upper_neigh_destroy,
 	.ndo_fdb_add		 = br_fdb_add,
 	.ndo_fdb_del		 = br_fdb_delete,
 	.ndo_fdb_dump		 = br_fdb_dump,
@@ -415,4 +411,5 @@ void br_dev_setup(struct net_device *dev)
 	br_netfilter_rtable_init(br);
 	br_stp_timer_init(br);
 	br_multicast_init(br);
+	INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup);
 }
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e4a4176171c9..4f598dc2d916 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -28,9 +28,6 @@
 #include "br_private.h"
 
 static struct kmem_cache *br_fdb_cache __read_mostly;
-static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
-					     const unsigned char *addr,
-					     __u16 vid);
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		      const unsigned char *addr, u16 vid);
 static void fdb_notify(struct net_bridge *br,
@@ -68,7 +65,7 @@ static inline unsigned long hold_time(const struct net_bridge *br)
 static inline int has_expired(const struct net_bridge *br,
 				  const struct net_bridge_fdb_entry *fdb)
 {
-	return !fdb->is_static &&
+	return !fdb->is_static && !fdb->added_by_external_learn &&
 		time_before_eq(fdb->updated + hold_time(br), jiffies);
 }
 
@@ -86,6 +83,47 @@ static void fdb_rcu_free(struct rcu_head *head)
 	kmem_cache_free(br_fdb_cache, ent);
 }
 
+static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
+						 const unsigned char *addr,
+						 __u16 vid)
+{
+	struct net_bridge_fdb_entry *f;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	hlist_for_each_entry_rcu(f, head, hlist)
+		if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid)
+			break;
+
+	return f;
+}
+
+/* requires bridge hash_lock */
+static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
+						const unsigned char *addr,
+						__u16 vid)
+{
+	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
+	struct net_bridge_fdb_entry *fdb;
+
+	WARN_ON_ONCE(!br_hash_lock_held(br));
+
+	rcu_read_lock();
+	fdb = fdb_find_rcu(head, addr, vid);
+	rcu_read_unlock();
+
+	return fdb;
+}
+
+struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
+					     const unsigned char *addr,
+					     __u16 vid)
+{
+	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
+
+	return fdb_find_rcu(head, addr, vid);
+}
+
 /* When a static FDB entry is added, the mac address from the entry is
  * added to the bridge private HW address list and all required ports
  * are then updated with the new information.
@@ -154,7 +192,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
 	if (f->added_by_external_learn)
 		fdb_del_external_learn(f);
 
-	hlist_del_rcu(&f->hlist);
+	hlist_del_init_rcu(&f->hlist);
 	fdb_notify(br, f, RTM_DELNEIGH);
 	call_rcu(&f->rcu, fdb_rcu_free);
 }
@@ -198,11 +236,10 @@ void br_fdb_find_delete_local(struct net_bridge *br,
 			      const struct net_bridge_port *p,
 			      const unsigned char *addr, u16 vid)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *f;
 
 	spin_lock_bh(&br->hash_lock);
-	f = fdb_find(head, addr, vid);
+	f = br_fdb_find(br, addr, vid);
 	if (f && f->is_local && !f->added_by_user && f->dst == p)
 		fdb_delete_local(br, p, f);
 	spin_unlock_bh(&br->hash_lock);
@@ -266,7 +303,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
 	spin_lock_bh(&br->hash_lock);
 
 	/* If old entry was unassociated with any port, then delete it. */
-	f = __br_fdb_get(br, br->dev->dev_addr, 0);
+	f = br_fdb_find(br, br->dev->dev_addr, 0);
 	if (f && f->is_local && !f->dst && !f->added_by_user)
 		fdb_delete_local(br, NULL, f);
 
@@ -281,7 +318,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
 	list_for_each_entry(v, &vg->vlan_list, vlist) {
 		if (!br_vlan_should_use(v))
 			continue;
-		f = __br_fdb_get(br, br->dev->dev_addr, v->vid);
+		f = br_fdb_find(br, br->dev->dev_addr, v->vid);
 		if (f && f->is_local && !f->dst && !f->added_by_user)
 			fdb_delete_local(br, NULL, f);
 		fdb_insert(br, NULL, newaddr, v->vid);
@@ -290,34 +327,43 @@ out:
 	spin_unlock_bh(&br->hash_lock);
 }
 
-void br_fdb_cleanup(unsigned long _data)
+void br_fdb_cleanup(struct work_struct *work)
 {
-	struct net_bridge *br = (struct net_bridge *)_data;
+	struct net_bridge *br = container_of(work, struct net_bridge,
+					     gc_work.work);
 	unsigned long delay = hold_time(br);
-	unsigned long next_timer = jiffies + br->ageing_time;
+	unsigned long work_delay = delay;
+	unsigned long now = jiffies;
 	int i;
 
-	spin_lock(&br->hash_lock);
 	for (i = 0; i < BR_HASH_SIZE; i++) {
 		struct net_bridge_fdb_entry *f;
 		struct hlist_node *n;
 
+		if (!br->hash[i].first)
+			continue;
+
+		spin_lock_bh(&br->hash_lock);
 		hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
 			unsigned long this_timer;
+
 			if (f->is_static)
 				continue;
 			if (f->added_by_external_learn)
 				continue;
 			this_timer = f->updated + delay;
-			if (time_before_eq(this_timer, jiffies))
+			if (time_after(this_timer, now))
+				work_delay = min(work_delay, this_timer - now);
+			else
 				fdb_delete(br, f);
-			else if (time_before(this_timer, next_timer))
-				next_timer = this_timer;
 		}
+		spin_unlock_bh(&br->hash_lock);
+		cond_resched();
 	}
-	spin_unlock(&br->hash_lock);
 
-	mod_timer(&br->gc_timer, round_jiffies_up(next_timer));
+	/* Cleanup minimum 10 milliseconds apart */
+	work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10));
+	mod_delayed_work(system_long_wq, &br->gc_work, work_delay);
 }
 
 /* Completely flush all dynamic entries in forwarding database.*/
@@ -371,26 +417,6 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 	spin_unlock_bh(&br->hash_lock);
 }
 
-/* No locking or refcounting, assumes caller has rcu_read_lock */
-struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
-					  const unsigned char *addr,
-					  __u16 vid)
-{
-	struct net_bridge_fdb_entry *fdb;
-
-	hlist_for_each_entry_rcu(fdb,
-				&br->hash[br_mac_hash(addr, vid)], hlist) {
-		if (ether_addr_equal(fdb->addr.addr, addr) &&
-		    fdb->vlan_id == vid) {
-			if (unlikely(has_expired(br, fdb)))
-				break;
-			return fdb;
-		}
-	}
-
-	return NULL;
-}
-
 #if IS_ENABLED(CONFIG_ATM_LANE)
 /* Interface used by ATM LANE hook to test
  * if an addr is on some other bridge port */
@@ -405,7 +431,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
 	if (!port)
 		ret = 0;
 	else {
-		fdb = __br_fdb_get(port->br, addr, 0);
+		fdb = br_fdb_find_rcu(port->br, addr, 0);
 		ret = fdb && fdb->dst && fdb->dst->dev != dev &&
 			fdb->dst->state == BR_STATE_FORWARDING;
 	}
@@ -467,34 +493,6 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 	return num;
 }
 
-static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
-					     const unsigned char *addr,
-					     __u16 vid)
-{
-	struct net_bridge_fdb_entry *fdb;
-
-	hlist_for_each_entry(fdb, head, hlist) {
-		if (ether_addr_equal(fdb->addr.addr, addr) &&
-		    fdb->vlan_id == vid)
-			return fdb;
-	}
-	return NULL;
-}
-
-static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
-						 const unsigned char *addr,
-						 __u16 vid)
-{
-	struct net_bridge_fdb_entry *fdb;
-
-	hlist_for_each_entry_rcu(fdb, head, hlist) {
-		if (ether_addr_equal(fdb->addr.addr, addr) &&
-		    fdb->vlan_id == vid)
-			return fdb;
-	}
-	return NULL;
-}
-
 static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 					       struct net_bridge_port *source,
 					       const unsigned char *addr,
@@ -528,7 +526,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 	if (!is_valid_ether_addr(addr))
 		return -EINVAL;
 
-	fdb = fdb_find(head, addr, vid);
+	fdb = br_fdb_find(br, addr, vid);
 	if (fdb) {
 		/* it is okay to have multiple ports with same
 		 * address, just use the first one.
@@ -585,12 +583,15 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 				br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n",
 					source->dev->name, addr, vid);
 		} else {
+			unsigned long now = jiffies;
+
 			/* fastpath: update of existing entry */
 			if (unlikely(source != fdb->dst)) {
 				fdb->dst = source;
 				fdb_modified = true;
 			}
-			fdb->updated = jiffies;
+			if (now != fdb->updated)
+				fdb->updated = now;
 			if (unlikely(added_by_user))
 				fdb->added_by_user = 1;
 			if (unlikely(fdb_modified))
@@ -598,7 +599,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 		}
 	} else {
 		spin_lock(&br->hash_lock);
-		if (likely(!fdb_find(head, addr, vid))) {
+		if (likely(!fdb_find_rcu(head, addr, vid))) {
 			fdb = fdb_create(head, source, addr, vid, 0, 0);
 			if (fdb) {
 				if (unlikely(added_by_user))
@@ -782,7 +783,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 		return -EINVAL;
 	}
 
-	fdb = fdb_find(head, addr, vid);
+	fdb = br_fdb_find(br, addr, vid);
 	if (fdb == NULL) {
 		if (!(flags & NLM_F_CREATE))
 			return -ENOENT;
@@ -929,55 +930,30 @@ out:
 	return err;
 }
 
-static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr,
-			      u16 vid)
-{
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
-	struct net_bridge_fdb_entry *fdb;
-
-	fdb = fdb_find(head, addr, vid);
-	if (!fdb)
-		return -ENOENT;
-
-	fdb_delete(br, fdb);
-	return 0;
-}
-
-static int __br_fdb_delete_by_addr(struct net_bridge *br,
-				   const unsigned char *addr, u16 vid)
-{
-	int err;
-
-	spin_lock_bh(&br->hash_lock);
-	err = fdb_delete_by_addr(br, addr, vid);
-	spin_unlock_bh(&br->hash_lock);
-
-	return err;
-}
-
-static int fdb_delete_by_addr_and_port(struct net_bridge_port *p,
+static int fdb_delete_by_addr_and_port(struct net_bridge *br,
+				       const struct net_bridge_port *p,
 				       const u8 *addr, u16 vlan)
 {
-	struct net_bridge *br = p->br;
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)];
 	struct net_bridge_fdb_entry *fdb;
 
-	fdb = fdb_find(head, addr, vlan);
+	fdb = br_fdb_find(br, addr, vlan);
 	if (!fdb || fdb->dst != p)
 		return -ENOENT;
 
 	fdb_delete(br, fdb);
+
 	return 0;
 }
 
-static int __br_fdb_delete(struct net_bridge_port *p,
+static int __br_fdb_delete(struct net_bridge *br,
+			   const struct net_bridge_port *p,
 			   const unsigned char *addr, u16 vid)
 {
 	int err;
 
-	spin_lock_bh(&p->br->hash_lock);
-	err = fdb_delete_by_addr_and_port(p, addr, vid);
-	spin_unlock_bh(&p->br->hash_lock);
+	spin_lock_bh(&br->hash_lock);
+	err = fdb_delete_by_addr_and_port(br, p, addr, vid);
+	spin_unlock_bh(&br->hash_lock);
 
 	return err;
 }
@@ -990,7 +966,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	struct net_bridge_vlan_group *vg;
 	struct net_bridge_port *p = NULL;
 	struct net_bridge_vlan *v;
-	struct net_bridge *br = NULL;
+	struct net_bridge *br;
 	int err;
 
 	if (dev->priv_flags & IFF_EBRIDGE) {
@@ -1004,6 +980,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 			return -EINVAL;
 		}
 		vg = nbp_vlan_group(p);
+		br = p->br;
 	}
 
 	if (vid) {
@@ -1013,30 +990,20 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 			return -EINVAL;
 		}
 
-		if (dev->priv_flags & IFF_EBRIDGE)
-			err = __br_fdb_delete_by_addr(br, addr, vid);
-		else
-			err = __br_fdb_delete(p, addr, vid);
+		err = __br_fdb_delete(br, p, addr, vid);
 	} else {
 		err = -ENOENT;
-		if (dev->priv_flags & IFF_EBRIDGE)
-			err = __br_fdb_delete_by_addr(br, addr, 0);
-		else
-			err &= __br_fdb_delete(p, addr, 0);
-
+		err &= __br_fdb_delete(br, p, addr, 0);
 		if (!vg || !vg->num_vlans)
-			goto out;
+			return err;
 
 		list_for_each_entry(v, &vg->vlan_list, vlist) {
 			if (!br_vlan_should_use(v))
 				continue;
-			if (dev->priv_flags & IFF_EBRIDGE)
-				err = __br_fdb_delete_by_addr(br, addr, v->vid);
-			else
-				err &= __br_fdb_delete(p, addr, v->vid);
+			err &= __br_fdb_delete(br, p, addr, v->vid);
 		}
 	}
-out:
+
 	return err;
 }
 
@@ -1107,7 +1074,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 	spin_lock_bh(&br->hash_lock);
 
 	head = &br->hash[br_mac_hash(addr, vid)];
-	fdb = fdb_find(head, addr, vid);
+	fdb = br_fdb_find(br, addr, vid);
 	if (!fdb) {
 		fdb = fdb_create(head, p, addr, vid, 0, 0);
 		if (!fdb) {
@@ -1135,15 +1102,13 @@ err_unlock:
 int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
 			      const unsigned char *addr, u16 vid)
 {
-	struct hlist_head *head;
 	struct net_bridge_fdb_entry *fdb;
 	int err = 0;
 
 	ASSERT_RTNL();
 	spin_lock_bh(&br->hash_lock);
 
-	head = &br->hash[br_mac_hash(addr, vid)];
-	fdb = fdb_find(head, addr, vid);
+	fdb = br_fdb_find(br, addr, vid);
 	if (fdb && fdb->added_by_external_learn)
 		fdb_delete(br, fdb);
 	else
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 7cb41aee4c82..6bfac29318f2 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -80,7 +80,7 @@ static void __br_forward(const struct net_bridge_port *to,
 	int br_hook;
 
 	vg = nbp_vlan_group_rcu(to);
-	skb = br_handle_vlan(to->br, vg, skb);
+	skb = br_handle_vlan(to->br, to, vg, skb);
 	if (!skb)
 		return;
 
@@ -220,6 +220,31 @@ out:
 }
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
+			       const unsigned char *addr, bool local_orig)
+{
+	struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+	const unsigned char *src = eth_hdr(skb)->h_source;
+
+	if (!should_deliver(p, skb))
+		return;
+
+	/* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
+	if (skb->dev == p->dev && ether_addr_equal(src, addr))
+		return;
+
+	skb = skb_copy(skb, GFP_ATOMIC);
+	if (!skb) {
+		dev->stats.tx_dropped++;
+		return;
+	}
+
+	if (!is_broadcast_ether_addr(addr))
+		memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
+
+	__br_forward(p, skb, local_orig);
+}
+
 /* called with rcu_read_lock */
 void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
 			struct sk_buff *skb,
@@ -241,10 +266,20 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
 		rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
 			     NULL;
 
-		port = (unsigned long)lport > (unsigned long)rport ?
-		       lport : rport;
+		if ((unsigned long)lport > (unsigned long)rport) {
+			port = lport;
+
+			if (port->flags & BR_MULTICAST_TO_UNICAST) {
+				maybe_deliver_addr(lport, skb, p->eth_addr,
+						   local_orig);
+				goto delivered;
+			}
+		} else {
+			port = rport;
+		}
 
 		prev = maybe_deliver(prev, port, skb, local_orig);
+delivered:
 		if (IS_ERR(prev))
 			goto out;
 		if (prev == port)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index ed0dd3340084..8ac1770aa222 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -313,7 +313,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
 
 	br_vlan_flush(br);
 	br_multicast_dev_del(br);
-	del_timer_sync(&br->gc_timer);
+	cancel_delayed_work_sync(&br->gc_work);
 
 	br_sysfs_delbr(br->dev);
 	unregister_netdevice_queue(br->dev, head);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 855b72fbe1da..236f34244dbe 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -21,6 +21,7 @@
 #include <linux/export.h>
 #include <linux/rculist.h>
 #include "br_private.h"
+#include "br_private_tunnel.h"
 
 /* Hook for brouter */
 br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
@@ -57,7 +58,7 @@ static int br_pass_frame_up(struct sk_buff *skb)
 
 	indev = skb->dev;
 	skb->dev = brdev;
-	skb = br_handle_vlan(br, vg, skb);
+	skb = br_handle_vlan(br, NULL, vg, skb);
 	if (!skb)
 		return NET_RX_DROP;
 	/* update the multicast stats if the packet is IGMP/MLD */
@@ -113,7 +114,7 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
 			return;
 		}
 
-		f = __br_fdb_get(br, n->ha, vid);
+		f = br_fdb_find_rcu(br, n->ha, vid);
 		if (f && ((p->flags & BR_PROXYARP) ||
 			  (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) {
 			arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip,
@@ -188,16 +189,19 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 		}
 		break;
 	case BR_PKT_UNICAST:
-		dst = __br_fdb_get(br, dest, vid);
+		dst = br_fdb_find_rcu(br, dest, vid);
 	default:
 		break;
 	}
 
 	if (dst) {
+		unsigned long now = jiffies;
+
 		if (dst->is_local)
 			return br_pass_frame_up(skb);
 
-		dst->used = jiffies;
+		if (now != dst->used)
+			dst->used = now;
 		br_forward(dst->dst, skb, local_rcv, false);
 	} else {
 		if (!mcast_hit)
@@ -261,6 +265,11 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
 		return RX_HANDLER_CONSUMED;
 
 	p = br_port_get_rcu(skb->dev);
+	if (p->flags & BR_VLAN_TUNNEL) {
+		if (br_handle_ingress_vlan_tunnel(skb, p,
+						  nbp_vlan_group_rcu(p)))
+			goto drop;
+	}
 
 	if (unlikely(is_link_local_ether_addr(dest))) {
 		u16 fwd_mask = p->br->group_fwd_mask_required;
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index da8157c57eb1..7970f8540cbb 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -149,7 +149,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		b.hello_timer_value = br_timer_value(&br->hello_timer);
 		b.tcn_timer_value = br_timer_value(&br->tcn_timer);
 		b.topology_change_timer_value = br_timer_value(&br->topology_change_timer);
-		b.gc_timer_value = br_timer_value(&br->gc_timer);
+		b.gc_timer_value = br_timer_value(&br->gc_work.timer);
 		rcu_read_unlock();
 
 		if (copy_to_user((void __user *)args[1], &b, sizeof(b)))
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 7dbc80d01eb0..056e6ac49d8f 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -531,7 +531,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
 			break;
 	}
 
-	p = br_multicast_new_port_group(port, group, *pp, state);
+	p = br_multicast_new_port_group(port, group, *pp, state, NULL);
 	if (unlikely(!p))
 		return -ENOMEM;
 	rcu_assign_pointer(*pp, p);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index b30e77e8427c..b760f2620abf 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -27,6 +27,7 @@
 #include <linux/inetdevice.h>
 #include <linux/mroute.h>
 #include <net/ip.h>
+#include <net/switchdev.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
 #include <net/mld.h>
@@ -43,12 +44,15 @@ static void br_multicast_add_router(struct net_bridge *br,
 static void br_ip4_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 __be32 group,
-					 __u16 vid);
+					 __u16 vid,
+					 const unsigned char *src);
+
+static void __del_port_router(struct net_bridge_port *p);
 #if IS_ENABLED(CONFIG_IPV6)
 static void br_ip6_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 const struct in6_addr *group,
-					 __u16 vid);
+					 __u16 vid, const unsigned char *src);
 #endif
 unsigned int br_mdb_rehash_seq;
 
@@ -540,7 +544,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
 		break;
 	case 2:
 		mld2q = (struct mld2_query *)icmp6_hdr(skb);
-		mld2q->mld2q_mrc = ntohs((u16)jiffies_to_msecs(interval));
+		mld2q->mld2q_mrc = htons((u16)jiffies_to_msecs(interval));
 		mld2q->mld2q_type = ICMPV6_MGM_QUERY;
 		mld2q->mld2q_code = 0;
 		mld2q->mld2q_cksum = 0;
@@ -711,7 +715,8 @@ struct net_bridge_port_group *br_multicast_new_port_group(
 			struct net_bridge_port *port,
 			struct br_ip *group,
 			struct net_bridge_port_group __rcu *next,
-			unsigned char flags)
+			unsigned char flags,
+			const unsigned char *src)
 {
 	struct net_bridge_port_group *p;
 
@@ -726,12 +731,32 @@ struct net_bridge_port_group *br_multicast_new_port_group(
 	hlist_add_head(&p->mglist, &port->mglist);
 	setup_timer(&p->timer, br_multicast_port_group_expired,
 		    (unsigned long)p);
+
+	if (src)
+		memcpy(p->eth_addr, src, ETH_ALEN);
+	else
+		memset(p->eth_addr, 0xff, ETH_ALEN);
+
 	return p;
 }
 
+static bool br_port_group_equal(struct net_bridge_port_group *p,
+				struct net_bridge_port *port,
+				const unsigned char *src)
+{
+	if (p->port != port)
+		return false;
+
+	if (!(port->flags & BR_MULTICAST_TO_UNICAST))
+		return true;
+
+	return ether_addr_equal(src, p->eth_addr);
+}
+
 static int br_multicast_add_group(struct net_bridge *br,
 				  struct net_bridge_port *port,
-				  struct br_ip *group)
+				  struct br_ip *group,
+				  const unsigned char *src)
 {
 	struct net_bridge_port_group __rcu **pp;
 	struct net_bridge_port_group *p;
@@ -758,13 +783,13 @@ static int br_multicast_add_group(struct net_bridge *br,
 	for (pp = &mp->ports;
 	     (p = mlock_dereference(*pp, br)) != NULL;
 	     pp = &p->next) {
-		if (p->port == port)
+		if (br_port_group_equal(p, port, src))
 			goto found;
 		if ((unsigned long)p->port < (unsigned long)port)
 			break;
 	}
 
-	p = br_multicast_new_port_group(port, group, *pp, 0);
+	p = br_multicast_new_port_group(port, group, *pp, 0, src);
 	if (unlikely(!p))
 		goto err;
 	rcu_assign_pointer(*pp, p);
@@ -783,7 +808,8 @@ err:
 static int br_ip4_multicast_add_group(struct net_bridge *br,
 				      struct net_bridge_port *port,
 				      __be32 group,
-				      __u16 vid)
+				      __u16 vid,
+				      const unsigned char *src)
 {
 	struct br_ip br_group;
 
@@ -794,14 +820,15 @@ static int br_ip4_multicast_add_group(struct net_bridge *br,
 	br_group.proto = htons(ETH_P_IP);
 	br_group.vid = vid;
 
-	return br_multicast_add_group(br, port, &br_group);
+	return br_multicast_add_group(br, port, &br_group, src);
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
 static int br_ip6_multicast_add_group(struct net_bridge *br,
 				      struct net_bridge_port *port,
 				      const struct in6_addr *group,
-				      __u16 vid)
+				      __u16 vid,
+				      const unsigned char *src)
 {
 	struct br_ip br_group;
 
@@ -812,7 +839,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
 	br_group.proto = htons(ETH_P_IPV6);
 	br_group.vid = vid;
 
-	return br_multicast_add_group(br, port, &br_group);
+	return br_multicast_add_group(br, port, &br_group, src);
 }
 #endif
 
@@ -824,16 +851,10 @@ static void br_multicast_router_expired(unsigned long data)
 	spin_lock(&br->multicast_lock);
 	if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
 	    port->multicast_router == MDB_RTR_TYPE_PERM ||
-	    timer_pending(&port->multicast_router_timer) ||
-	    hlist_unhashed(&port->rlist))
+	    timer_pending(&port->multicast_router_timer))
 		goto out;
 
-	hlist_del_init_rcu(&port->rlist);
-	br_rtr_notify(br->dev, port, RTM_DELMDB);
-	/* Don't allow timer refresh if the router expired */
-	if (port->multicast_router == MDB_RTR_TYPE_TEMP)
-		port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
-
+	__del_port_router(port);
 out:
 	spin_unlock(&br->multicast_lock);
 }
@@ -982,6 +1003,18 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
 }
 #endif
 
+static void br_mc_disabled_update(struct net_device *dev, bool value)
+{
+	struct switchdev_attr attr = {
+		.orig_dev = dev,
+		.id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
+		.flags = SWITCHDEV_F_DEFER,
+		.u.mc_disabled = value,
+	};
+
+	switchdev_port_attr_set(dev, &attr);
+}
+
 int br_multicast_add_port(struct net_bridge_port *port)
 {
 	port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
@@ -994,6 +1027,8 @@ int br_multicast_add_port(struct net_bridge_port *port)
 	setup_timer(&port->ip6_own_query.timer,
 		    br_ip6_multicast_port_query_expired, (unsigned long)port);
 #endif
+	br_mc_disabled_update(port->dev, port->br->multicast_disabled);
+
 	port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
 	if (!port->mcast_stats)
 		return -ENOMEM;
@@ -1061,13 +1096,8 @@ void br_multicast_disable_port(struct net_bridge_port *port)
 		if (!(pg->flags & MDB_PG_FLAGS_PERMANENT))
 			br_multicast_del_pg(br, pg);
 
-	if (!hlist_unhashed(&port->rlist)) {
-		hlist_del_init_rcu(&port->rlist);
-		br_rtr_notify(br->dev, port, RTM_DELMDB);
-		/* Don't allow timer refresh if disabling */
-		if (port->multicast_router == MDB_RTR_TYPE_TEMP)
-			port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
-	}
+	__del_port_router(port);
+
 	del_timer(&port->multicast_router_timer);
 	del_timer(&port->ip4_own_query.timer);
 #if IS_ENABLED(CONFIG_IPV6)
@@ -1081,6 +1111,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
 					 struct sk_buff *skb,
 					 u16 vid)
 {
+	const unsigned char *src;
 	struct igmpv3_report *ih;
 	struct igmpv3_grec *grec;
 	int i;
@@ -1121,12 +1152,14 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
 			continue;
 		}
 
+		src = eth_hdr(skb)->h_source;
 		if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
 		     type == IGMPV3_MODE_IS_INCLUDE) &&
 		    ntohs(grec->grec_nsrcs) == 0) {
-			br_ip4_multicast_leave_group(br, port, group, vid);
+			br_ip4_multicast_leave_group(br, port, group, vid, src);
 		} else {
-			err = br_ip4_multicast_add_group(br, port, group, vid);
+			err = br_ip4_multicast_add_group(br, port, group, vid,
+							 src);
 			if (err)
 				break;
 		}
@@ -1141,6 +1174,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
 					struct sk_buff *skb,
 					u16 vid)
 {
+	const unsigned char *src;
 	struct icmp6hdr *icmp6h;
 	struct mld2_grec *grec;
 	int i;
@@ -1188,14 +1222,16 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
 			continue;
 		}
 
+		src = eth_hdr(skb)->h_source;
 		if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
 		     grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
 		    ntohs(*nsrcs) == 0) {
 			br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
-						     vid);
+						     vid, src);
 		} else {
 			err = br_ip6_multicast_add_group(br, port,
-							 &grec->grec_mca, vid);
+							 &grec->grec_mca, vid,
+							 src);
 			if (err)
 				break;
 		}
@@ -1281,6 +1317,19 @@ br_multicast_update_query_timer(struct net_bridge *br,
 	mod_timer(&query->timer, jiffies + br->multicast_querier_interval);
 }
 
+static void br_port_mc_router_state_change(struct net_bridge_port *p,
+					   bool is_mc_router)
+{
+	struct switchdev_attr attr = {
+		.orig_dev = p->dev,
+		.id = SWITCHDEV_ATTR_ID_PORT_MROUTER,
+		.flags = SWITCHDEV_F_DEFER,
+		.u.mrouter = is_mc_router,
+	};
+
+	switchdev_port_attr_set(p->dev, &attr);
+}
+
 /*
  * Add port to router_list
  *  list is maintained ordered by pointer value
@@ -1306,6 +1355,7 @@ static void br_multicast_add_router(struct net_bridge *br,
 	else
 		hlist_add_head_rcu(&port->rlist, &br->router_list);
 	br_rtr_notify(br->dev, port, RTM_NEWMDB);
+	br_port_mc_router_state_change(port, true);
 }
 
 static void br_multicast_mark_router(struct net_bridge *br,
@@ -1511,7 +1561,8 @@ br_multicast_leave_group(struct net_bridge *br,
 			 struct net_bridge_port *port,
 			 struct br_ip *group,
 			 struct bridge_mcast_other_query *other_query,
-			 struct bridge_mcast_own_query *own_query)
+			 struct bridge_mcast_own_query *own_query,
+			 const unsigned char *src)
 {
 	struct net_bridge_mdb_htable *mdb;
 	struct net_bridge_mdb_entry *mp;
@@ -1535,7 +1586,7 @@ br_multicast_leave_group(struct net_bridge *br,
 		for (pp = &mp->ports;
 		     (p = mlock_dereference(*pp, br)) != NULL;
 		     pp = &p->next) {
-			if (p->port != port)
+			if (!br_port_group_equal(p, port, src))
 				continue;
 
 			rcu_assign_pointer(*pp, p->next);
@@ -1566,7 +1617,7 @@ br_multicast_leave_group(struct net_bridge *br,
 		for (p = mlock_dereference(mp->ports, br);
 		     p != NULL;
 		     p = mlock_dereference(p->next, br)) {
-			if (p->port != port)
+			if (!br_port_group_equal(p, port, src))
 				continue;
 
 			if (!hlist_unhashed(&p->mglist) &&
@@ -1617,7 +1668,8 @@ out:
 static void br_ip4_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 __be32 group,
-					 __u16 vid)
+					 __u16 vid,
+					 const unsigned char *src)
 {
 	struct br_ip br_group;
 	struct bridge_mcast_own_query *own_query;
@@ -1632,14 +1684,15 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
 	br_group.vid = vid;
 
 	br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query,
-				 own_query);
+				 own_query, src);
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
 static void br_ip6_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 const struct in6_addr *group,
-					 __u16 vid)
+					 __u16 vid,
+					 const unsigned char *src)
 {
 	struct br_ip br_group;
 	struct bridge_mcast_own_query *own_query;
@@ -1654,7 +1707,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
 	br_group.vid = vid;
 
 	br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query,
-				 own_query);
+				 own_query, src);
 }
 #endif
 
@@ -1712,6 +1765,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 				 u16 vid)
 {
 	struct sk_buff *skb_trimmed = NULL;
+	const unsigned char *src;
 	struct igmphdr *ih;
 	int err;
 
@@ -1731,13 +1785,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 	}
 
 	ih = igmp_hdr(skb);
+	src = eth_hdr(skb)->h_source;
 	BR_INPUT_SKB_CB(skb)->igmp = ih->type;
 
 	switch (ih->type) {
 	case IGMP_HOST_MEMBERSHIP_REPORT:
 	case IGMPV2_HOST_MEMBERSHIP_REPORT:
 		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
-		err = br_ip4_multicast_add_group(br, port, ih->group, vid);
+		err = br_ip4_multicast_add_group(br, port, ih->group, vid, src);
 		break;
 	case IGMPV3_HOST_MEMBERSHIP_REPORT:
 		err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
@@ -1746,7 +1801,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 		err = br_ip4_multicast_query(br, port, skb_trimmed, vid);
 		break;
 	case IGMP_HOST_LEAVE_MESSAGE:
-		br_ip4_multicast_leave_group(br, port, ih->group, vid);
+		br_ip4_multicast_leave_group(br, port, ih->group, vid, src);
 		break;
 	}
 
@@ -1766,6 +1821,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 				 u16 vid)
 {
 	struct sk_buff *skb_trimmed = NULL;
+	const unsigned char *src;
 	struct mld_msg *mld;
 	int err;
 
@@ -1785,8 +1841,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 
 	switch (mld->mld_type) {
 	case ICMPV6_MGM_REPORT:
+		src = eth_hdr(skb)->h_source;
 		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
-		err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid);
+		err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid,
+						 src);
 		break;
 	case ICMPV6_MLD2_REPORT:
 		err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid);
@@ -1795,7 +1853,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 		err = br_ip6_multicast_query(br, port, skb_trimmed, vid);
 		break;
 	case ICMPV6_MGM_REDUCTION:
-		br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid);
+		src = eth_hdr(skb)->h_source;
+		br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src);
 		break;
 	}
 
@@ -2004,6 +2063,11 @@ static void __del_port_router(struct net_bridge_port *p)
 		return;
 	hlist_del_init_rcu(&p->rlist);
 	br_rtr_notify(p->br->dev, p, RTM_DELMDB);
+	br_port_mc_router_state_change(p, false);
+
+	/* don't allow timer refresh */
+	if (p->multicast_router == MDB_RTR_TYPE_TEMP)
+		p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 }
 
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
@@ -2081,6 +2145,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
 	if (br->multicast_disabled == !val)
 		goto unlock;
 
+	br_mc_disabled_update(br->dev, !val);
 	br->multicast_disabled = !val;
 	if (br->multicast_disabled)
 		goto unlock;
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 7109b389ea58..a8f6acd23e30 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -20,6 +20,7 @@
 
 #include "br_private.h"
 #include "br_private_stp.h"
+#include "br_private_tunnel.h"
 
 static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
 				u32 filter_mask)
@@ -95,9 +96,10 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
 					   u32 filter_mask)
 {
 	struct net_bridge_vlan_group *vg = NULL;
-	struct net_bridge_port *p;
+	struct net_bridge_port *p = NULL;
 	struct net_bridge *br;
 	int num_vlan_infos;
+	size_t vinfo_sz = 0;
 
 	rcu_read_lock();
 	if (br_port_exists(dev)) {
@@ -110,8 +112,13 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
 	num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask);
 	rcu_read_unlock();
 
+	if (p && (p->flags & BR_VLAN_TUNNEL))
+		vinfo_sz += br_get_vlan_tunnel_info_size(vg);
+
 	/* Each VLAN is returned in bridge_vlan_info along with flags */
-	return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
+	vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
+
+	return vinfo_sz;
 }
 
 static inline size_t br_port_info_size(void)
@@ -123,10 +130,12 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_GUARD */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROTECT */
 		+ nla_total_size(1)	/* IFLA_BRPORT_FAST_LEAVE */
+		+ nla_total_size(1)	/* IFLA_BRPORT_MCAST_TO_UCAST */
 		+ nla_total_size(1)	/* IFLA_BRPORT_LEARNING */
 		+ nla_total_size(1)	/* IFLA_BRPORT_UNICAST_FLOOD */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP_WIFI */
+		+ nla_total_size(1)	/* IFLA_BRPORT_VLAN_TUNNEL */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_ROOT_ID */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_BRIDGE_ID */
 		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_PORT */
@@ -173,6 +182,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 		       !!(p->flags & BR_ROOT_BLOCK)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE,
 		       !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST,
+		       !!(p->flags & BR_MULTICAST_TO_UNICAST)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD,
 		       !!(p->flags & BR_FLOOD)) ||
@@ -191,7 +202,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 	    nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) ||
 	    nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK,
 		       p->topology_change_ack) ||
-	    nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending))
+	    nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
+	    nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags &
+							BR_VLAN_TUNNEL)))
 		return -EMSGSIZE;
 
 	timerval = br_timer_value(&p->message_age_timer);
@@ -414,6 +427,9 @@ static int br_fill_ifinfo(struct sk_buff *skb,
 			err = br_fill_ifvlaninfo_compressed(skb, vg);
 		else
 			err = br_fill_ifvlaninfo(skb, vg);
+
+		if (port && (port->flags & BR_VLAN_TUNNEL))
+			err = br_fill_vlan_tunnel_info(skb, vg);
 		rcu_read_unlock();
 		if (err)
 			goto nla_put_failure;
@@ -514,60 +530,88 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
 	return err;
 }
 
+static int br_process_vlan_info(struct net_bridge *br,
+				struct net_bridge_port *p, int cmd,
+				struct bridge_vlan_info *vinfo_curr,
+				struct bridge_vlan_info **vinfo_last)
+{
+	if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK)
+		return -EINVAL;
+
+	if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+		/* check if we are already processing a range */
+		if (*vinfo_last)
+			return -EINVAL;
+		*vinfo_last = vinfo_curr;
+		/* don't allow range of pvids */
+		if ((*vinfo_last)->flags & BRIDGE_VLAN_INFO_PVID)
+			return -EINVAL;
+		return 0;
+	}
+
+	if (*vinfo_last) {
+		struct bridge_vlan_info tmp_vinfo;
+		int v, err;
+
+		if (!(vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END))
+			return -EINVAL;
+
+		if (vinfo_curr->vid <= (*vinfo_last)->vid)
+			return -EINVAL;
+
+		memcpy(&tmp_vinfo, *vinfo_last,
+		       sizeof(struct bridge_vlan_info));
+		for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) {
+			tmp_vinfo.vid = v;
+			err = br_vlan_info(br, p, cmd, &tmp_vinfo);
+			if (err)
+				break;
+		}
+		*vinfo_last = NULL;
+
+		return 0;
+	}
+
+	return br_vlan_info(br, p, cmd, vinfo_curr);
+}
+
 static int br_afspec(struct net_bridge *br,
 		     struct net_bridge_port *p,
 		     struct nlattr *af_spec,
 		     int cmd)
 {
-	struct bridge_vlan_info *vinfo_start = NULL;
-	struct bridge_vlan_info *vinfo = NULL;
+	struct bridge_vlan_info *vinfo_curr = NULL;
+	struct bridge_vlan_info *vinfo_last = NULL;
 	struct nlattr *attr;
-	int err = 0;
-	int rem;
+	struct vtunnel_info tinfo_last = {};
+	struct vtunnel_info tinfo_curr = {};
+	int err = 0, rem;
 
 	nla_for_each_nested(attr, af_spec, rem) {
-		if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO)
-			continue;
-		if (nla_len(attr) != sizeof(struct bridge_vlan_info))
-			return -EINVAL;
-		vinfo = nla_data(attr);
-		if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK)
-			return -EINVAL;
-		if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
-			if (vinfo_start)
-				return -EINVAL;
-			vinfo_start = vinfo;
-			/* don't allow range of pvids */
-			if (vinfo_start->flags & BRIDGE_VLAN_INFO_PVID)
-				return -EINVAL;
-			continue;
-		}
-
-		if (vinfo_start) {
-			struct bridge_vlan_info tmp_vinfo;
-			int v;
-
-			if (!(vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END))
+		err = 0;
+		switch (nla_type(attr)) {
+		case IFLA_BRIDGE_VLAN_TUNNEL_INFO:
+			if (!(p->flags & BR_VLAN_TUNNEL))
 				return -EINVAL;
-
-			if (vinfo->vid <= vinfo_start->vid)
+			err = br_parse_vlan_tunnel_info(attr, &tinfo_curr);
+			if (err)
+				return err;
+			err = br_process_vlan_tunnel_info(br, p, cmd,
+							  &tinfo_curr,
+							  &tinfo_last);
+			if (err)
+				return err;
+			break;
+		case IFLA_BRIDGE_VLAN_INFO:
+			if (nla_len(attr) != sizeof(struct bridge_vlan_info))
 				return -EINVAL;
-
-			memcpy(&tmp_vinfo, vinfo_start,
-			       sizeof(struct bridge_vlan_info));
-
-			for (v = vinfo_start->vid; v <= vinfo->vid; v++) {
-				tmp_vinfo.vid = v;
-				err = br_vlan_info(br, p, cmd, &tmp_vinfo);
-				if (err)
-					break;
-			}
-			vinfo_start = NULL;
-		} else {
-			err = br_vlan_info(br, p, cmd, vinfo);
-		}
-		if (err)
+			vinfo_curr = nla_data(attr);
+			err = br_process_vlan_info(br, p, cmd, vinfo_curr,
+						   &vinfo_last);
+			if (err)
+				return err;
 			break;
+		}
 	}
 
 	return err;
@@ -586,6 +630,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_PROXYARP]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 },
 	[IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 },
+	[IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -626,8 +671,9 @@ static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[],
 /* Process bridge protocol info on port */
 static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 {
-	int err;
 	unsigned long old_flags = p->flags;
+	bool br_vlan_tunnel_old = false;
+	int err;
 
 	br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
 	br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
@@ -636,9 +682,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 	br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
 	br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
 	br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
+	br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST);
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
 
+	br_vlan_tunnel_old = (p->flags & BR_VLAN_TUNNEL) ? true : false;
+	br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL);
+	if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL))
+		nbp_vlan_tunnel_info_flush(p);
+
 	if (tb[IFLA_BRPORT_COST]) {
 		err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
 		if (err)
@@ -1195,7 +1247,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval,
 			      IFLA_BR_PAD))
 		return -EMSGSIZE;
-	clockval = br_timer_value(&br->gc_timer);
+	clockval = br_timer_value(&br->gc_work.timer);
 	if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
 		return -EMSGSIZE;
 
diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
new file mode 100644
index 000000000000..c913491495ab
--- /dev/null
+++ b/net/bridge/br_netlink_tunnel.c
@@ -0,0 +1,294 @@
+/*
+ *	Bridge per vlan tunnel port dst_metadata netlink control interface
+ *
+ *	Authors:
+ *	Roopa Prabhu		<roopa@cumulusnetworks.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/etherdevice.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <uapi/linux/if_bridge.h>
+#include <net/dst_metadata.h>
+
+#include "br_private.h"
+#include "br_private_tunnel.h"
+
+static size_t __get_vlan_tinfo_size(void)
+{
+	return nla_total_size(0) + /* nest IFLA_BRIDGE_VLAN_TUNNEL_INFO */
+		  nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_VLAN_TUNNEL_ID */
+		  nla_total_size(sizeof(u16)) + /* IFLA_BRIDGE_VLAN_TUNNEL_VID */
+		  nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */
+}
+
+static bool vlan_tunid_inrange(struct net_bridge_vlan *v_curr,
+			       struct net_bridge_vlan *v_last)
+{
+	__be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id);
+	__be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id);
+
+	return (be32_to_cpu(tunid_curr) - be32_to_cpu(tunid_last)) == 1;
+}
+
+static int __get_num_vlan_tunnel_infos(struct net_bridge_vlan_group *vg)
+{
+	struct net_bridge_vlan *v, *vtbegin = NULL, *vtend = NULL;
+	int num_tinfos = 0;
+
+	/* Count number of vlan infos */
+	list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+		/* only a context, bridge vlan not activated */
+		if (!br_vlan_should_use(v) || !v->tinfo.tunnel_id)
+			continue;
+
+		if (!vtbegin) {
+			goto initvars;
+		} else if ((v->vid - vtend->vid) == 1 &&
+			   vlan_tunid_inrange(v, vtend)) {
+			vtend = v;
+			continue;
+		} else {
+			if ((vtend->vid - vtbegin->vid) > 0)
+				num_tinfos += 2;
+			else
+				num_tinfos += 1;
+		}
+initvars:
+		vtbegin = v;
+		vtend = v;
+	}
+
+	if (vtbegin && vtend) {
+		if ((vtend->vid - vtbegin->vid) > 0)
+			num_tinfos += 2;
+		else
+			num_tinfos += 1;
+	}
+
+	return num_tinfos;
+}
+
+int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg)
+{
+	int num_tinfos;
+
+	if (!vg)
+		return 0;
+
+	rcu_read_lock();
+	num_tinfos = __get_num_vlan_tunnel_infos(vg);
+	rcu_read_unlock();
+
+	return num_tinfos * __get_vlan_tinfo_size();
+}
+
+static int br_fill_vlan_tinfo(struct sk_buff *skb, u16 vid,
+			      __be64 tunnel_id, u16 flags)
+{
+	__be32 tid = tunnel_id_to_key32(tunnel_id);
+	struct nlattr *tmap;
+
+	tmap = nla_nest_start(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO);
+	if (!tmap)
+		return -EMSGSIZE;
+	if (nla_put_u32(skb, IFLA_BRIDGE_VLAN_TUNNEL_ID,
+			be32_to_cpu(tid)))
+		goto nla_put_failure;
+	if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_VID,
+			vid))
+		goto nla_put_failure;
+	if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_FLAGS,
+			flags))
+		goto nla_put_failure;
+	nla_nest_end(skb, tmap);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, tmap);
+
+	return -EMSGSIZE;
+}
+
+static int br_fill_vlan_tinfo_range(struct sk_buff *skb,
+				    struct net_bridge_vlan *vtbegin,
+				    struct net_bridge_vlan *vtend)
+{
+	int err;
+
+	if (vtend && (vtend->vid - vtbegin->vid) > 0) {
+		/* add range to skb */
+		err = br_fill_vlan_tinfo(skb, vtbegin->vid,
+					 vtbegin->tinfo.tunnel_id,
+					 BRIDGE_VLAN_INFO_RANGE_BEGIN);
+		if (err)
+			return err;
+
+		err = br_fill_vlan_tinfo(skb, vtend->vid,
+					 vtend->tinfo.tunnel_id,
+					 BRIDGE_VLAN_INFO_RANGE_END);
+		if (err)
+			return err;
+	} else {
+		err = br_fill_vlan_tinfo(skb, vtbegin->vid,
+					 vtbegin->tinfo.tunnel_id,
+					 0);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+int br_fill_vlan_tunnel_info(struct sk_buff *skb,
+			     struct net_bridge_vlan_group *vg)
+{
+	struct net_bridge_vlan *vtbegin = NULL;
+	struct net_bridge_vlan *vtend = NULL;
+	struct net_bridge_vlan *v;
+	int err;
+
+	/* Count number of vlan infos */
+	list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+		/* only a context, bridge vlan not activated */
+		if (!br_vlan_should_use(v))
+			continue;
+
+		if (!v->tinfo.tunnel_dst)
+			continue;
+
+		if (!vtbegin) {
+			goto initvars;
+		} else if ((v->vid - vtend->vid) == 1 &&
+			    vlan_tunid_inrange(v, vtend)) {
+			vtend = v;
+			continue;
+		} else {
+			err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
+			if (err)
+				return err;
+		}
+initvars:
+		vtbegin = v;
+		vtend = v;
+	}
+
+	if (vtbegin) {
+		err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = {
+	[IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 },
+	[IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 },
+	[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 },
+};
+
+static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd,
+			       u16 vid, u32 tun_id)
+{
+	int err = 0;
+
+	if (!p)
+		return -EINVAL;
+
+	switch (cmd) {
+	case RTM_SETLINK:
+		err = nbp_vlan_tunnel_info_add(p, vid, tun_id);
+		break;
+	case RTM_DELLINK:
+		nbp_vlan_tunnel_info_delete(p, vid);
+		break;
+	}
+
+	return err;
+}
+
+int br_parse_vlan_tunnel_info(struct nlattr *attr,
+			      struct vtunnel_info *tinfo)
+{
+	struct nlattr *tb[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1];
+	u32 tun_id;
+	u16 vid, flags = 0;
+	int err;
+
+	memset(tinfo, 0, sizeof(*tinfo));
+
+	err = nla_parse_nested(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX,
+			       attr, vlan_tunnel_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[IFLA_BRIDGE_VLAN_TUNNEL_ID] ||
+	    !tb[IFLA_BRIDGE_VLAN_TUNNEL_VID])
+		return -EINVAL;
+
+	tun_id = nla_get_u32(tb[IFLA_BRIDGE_VLAN_TUNNEL_ID]);
+	vid = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]);
+	if (vid >= VLAN_VID_MASK)
+		return -ERANGE;
+
+	if (tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS])
+		flags = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]);
+
+	tinfo->tunid = tun_id;
+	tinfo->vid = vid;
+	tinfo->flags = flags;
+
+	return 0;
+}
+
+int br_process_vlan_tunnel_info(struct net_bridge *br,
+				struct net_bridge_port *p, int cmd,
+				struct vtunnel_info *tinfo_curr,
+				struct vtunnel_info *tinfo_last)
+{
+	int err;
+
+	if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+		if (tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN)
+			return -EINVAL;
+		memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info));
+	} else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) {
+		int t, v;
+
+		if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN))
+			return -EINVAL;
+		if ((tinfo_curr->vid - tinfo_last->vid) !=
+		    (tinfo_curr->tunid - tinfo_last->tunid))
+			return -EINVAL;
+		t = tinfo_last->tunid;
+		for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) {
+			err = br_vlan_tunnel_info(p, cmd, v, t);
+			if (err)
+				return err;
+			t++;
+		}
+		memset(tinfo_last, 0, sizeof(struct vtunnel_info));
+		memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
+	} else {
+		if (tinfo_last->flags)
+			return -EINVAL;
+		err = br_vlan_tunnel_info(p, cmd, tinfo_curr->vid,
+					  tinfo_curr->tunid);
+		if (err)
+			return err;
+		memset(tinfo_last, 0, sizeof(struct vtunnel_info));
+		memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
+	}
+
+	return 0;
+}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 8ce621e8345c..2288fca7756c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -91,6 +91,11 @@ struct br_vlan_stats {
 	struct u64_stats_sync syncp;
 };
 
+struct br_tunnel_info {
+	__be64			tunnel_id;
+	struct metadata_dst	*tunnel_dst;
+};
+
 /**
  * struct net_bridge_vlan - per-vlan entry
  *
@@ -113,6 +118,7 @@ struct br_vlan_stats {
  */
 struct net_bridge_vlan {
 	struct rhash_head		vnode;
+	struct rhash_head		tnode;
 	u16				vid;
 	u16				flags;
 	struct br_vlan_stats __percpu	*stats;
@@ -124,6 +130,9 @@ struct net_bridge_vlan {
 		atomic_t		refcnt;
 		struct net_bridge_vlan	*brvlan;
 	};
+
+	struct br_tunnel_info		tinfo;
+
 	struct list_head		vlist;
 
 	struct rcu_head			rcu;
@@ -145,24 +154,27 @@ struct net_bridge_vlan {
  */
 struct net_bridge_vlan_group {
 	struct rhashtable		vlan_hash;
+	struct rhashtable		tunnel_hash;
 	struct list_head		vlan_list;
 	u16				num_vlans;
 	u16				pvid;
 };
 
-struct net_bridge_fdb_entry
-{
+struct net_bridge_fdb_entry {
 	struct hlist_node		hlist;
 	struct net_bridge_port		*dst;
 
-	unsigned long			updated;
-	unsigned long			used;
 	mac_addr			addr;
 	__u16				vlan_id;
 	unsigned char			is_local:1,
 					is_static:1,
 					added_by_user:1,
 					added_by_external_learn:1;
+
+	/* write-heavy members should not affect lookups */
+	unsigned long			updated ____cacheline_aligned_in_smp;
+	unsigned long			used;
+
 	struct rcu_head			rcu;
 };
 
@@ -177,6 +189,7 @@ struct net_bridge_port_group {
 	struct timer_list		timer;
 	struct br_ip			addr;
 	unsigned char			flags;
+	unsigned char			eth_addr[ETH_ALEN];
 };
 
 struct net_bridge_mdb_entry
@@ -201,12 +214,16 @@ struct net_bridge_mdb_htable
 	u32				ver;
 };
 
-struct net_bridge_port
-{
+struct net_bridge_port {
 	struct net_bridge		*br;
 	struct net_device		*dev;
 	struct list_head		list;
 
+	unsigned long			flags;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	struct net_bridge_vlan_group	__rcu *vlgrp;
+#endif
+
 	/* STP */
 	u8				priority;
 	u8				state;
@@ -227,8 +244,6 @@ struct net_bridge_port
 	struct kobject			kobj;
 	struct rcu_head			rcu;
 
-	unsigned long 			flags;
-
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	struct bridge_mcast_own_query	ip4_own_query;
 #if IS_ENABLED(CONFIG_IPV6)
@@ -248,9 +263,6 @@ struct net_bridge_port
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	struct netpoll			*np;
 #endif
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-	struct net_bridge_vlan_group	__rcu *vlgrp;
-#endif
 #ifdef CONFIG_NET_SWITCHDEV
 	int				offload_fwd_mark;
 #endif
@@ -272,14 +284,21 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *
 		rtnl_dereference(dev->rx_handler_data) : NULL;
 }
 
-struct net_bridge
-{
+struct net_bridge {
 	spinlock_t			lock;
+	spinlock_t			hash_lock;
 	struct list_head		port_list;
 	struct net_device		*dev;
-
 	struct pcpu_sw_netstats		__percpu *stats;
-	spinlock_t			hash_lock;
+	/* These fields are accessed on each packet */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	u8				vlan_enabled;
+	u8				vlan_stats_enabled;
+	__be16				vlan_proto;
+	u16				default_pvid;
+	struct net_bridge_vlan_group	__rcu *vlgrp;
+#endif
+
 	struct hlist_head		hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	union {
@@ -297,6 +316,9 @@ struct net_bridge
 	bridge_id			designated_root;
 	bridge_id			bridge_id;
 	u32				root_path_cost;
+	unsigned char			topology_change;
+	unsigned char			topology_change_detected;
+	u16				root_port;
 	unsigned long			max_age;
 	unsigned long			hello_time;
 	unsigned long			forward_delay;
@@ -308,7 +330,6 @@ struct net_bridge
 
 	u8				group_addr[ETH_ALEN];
 	bool				group_addr_set;
-	u16				root_port;
 
 	enum {
 		BR_NO_STP, 		/* no spanning tree */
@@ -316,9 +337,6 @@ struct net_bridge
 		BR_USER_STP,		/* new RSTP in userspace */
 	} stp_enabled;
 
-	unsigned char			topology_change;
-	unsigned char			topology_change_detected;
-
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	unsigned char			multicast_router;
 
@@ -363,21 +381,13 @@ struct net_bridge
 	struct timer_list		hello_timer;
 	struct timer_list		tcn_timer;
 	struct timer_list		topology_change_timer;
-	struct timer_list		gc_timer;
+	struct delayed_work		gc_work;
 	struct kobject			*ifobj;
 	u32				auto_cnt;
 
 #ifdef CONFIG_NET_SWITCHDEV
 	int offload_fwd_mark;
 #endif
-
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-	struct net_bridge_vlan_group	__rcu *vlgrp;
-	u8				vlan_enabled;
-	u8				vlan_stats_enabled;
-	__be16				vlan_proto;
-	u16				default_pvid;
-#endif
 };
 
 struct br_input_skb_cb {
@@ -494,11 +504,12 @@ void br_fdb_find_delete_local(struct net_bridge *br,
 			      const unsigned char *addr, u16 vid);
 void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
 void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
-void br_fdb_cleanup(unsigned long arg);
+void br_fdb_cleanup(struct work_struct *work);
 void br_fdb_delete_by_port(struct net_bridge *br,
 			   const struct net_bridge_port *p, u16 vid, int do_all);
-struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
-					  const unsigned char *addr, __u16 vid);
+struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
+					     const unsigned char *addr,
+					     __u16 vid);
 int br_fdb_test_addr(struct net_device *dev, unsigned char *addr);
 int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count,
 		   unsigned long off);
@@ -520,6 +531,15 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
 			      const unsigned char *addr, u16 vid);
 
+static inline bool br_hash_lock_held(struct net_bridge *br)
+{
+#ifdef CONFIG_LOCKDEP
+	return lockdep_is_held(&br->hash_lock);
+#else
+	return true;
+#endif
+}
+
 /* br_forward.c */
 enum br_pkt_type {
 	BR_PKT_UNICAST,
@@ -599,7 +619,7 @@ void br_multicast_free_pg(struct rcu_head *head);
 struct net_bridge_port_group *
 br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
 			    struct net_bridge_port_group __rcu *next,
-			    unsigned char flags);
+			    unsigned char flags, const unsigned char *src);
 void br_mdb_init(void);
 void br_mdb_uninit(void);
 void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
@@ -764,6 +784,7 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg,
 		       const struct sk_buff *skb);
 bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
 struct sk_buff *br_handle_vlan(struct net_bridge *br,
+			       const struct net_bridge_port *port,
 			       struct net_bridge_vlan_group *vg,
 			       struct sk_buff *skb);
 int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags);
@@ -863,6 +884,7 @@ static inline bool br_should_learn(struct net_bridge_port *p,
 }
 
 static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
+					     const struct net_bridge_port *port,
 					     struct net_bridge_vlan_group *vg,
 					     struct sk_buff *skb)
 {
diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h
new file mode 100644
index 000000000000..4a447a378ab3
--- /dev/null
+++ b/net/bridge/br_private_tunnel.h
@@ -0,0 +1,83 @@
+/*
+ *	Bridge per vlan tunnels
+ *
+ *	Authors:
+ *	Roopa Prabhu		<roopa@cumulusnetworks.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _BR_PRIVATE_TUNNEL_H
+#define _BR_PRIVATE_TUNNEL_H
+
+struct vtunnel_info {
+	u32	tunid;
+	u16	vid;
+	u16	flags;
+};
+
+/* br_netlink_tunnel.c */
+int br_parse_vlan_tunnel_info(struct nlattr *attr,
+			      struct vtunnel_info *tinfo);
+int br_process_vlan_tunnel_info(struct net_bridge *br,
+				struct net_bridge_port *p,
+				int cmd,
+				struct vtunnel_info *tinfo_curr,
+				struct vtunnel_info *tinfo_last);
+int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg);
+int br_fill_vlan_tunnel_info(struct sk_buff *skb,
+			     struct net_bridge_vlan_group *vg);
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+/* br_vlan_tunnel.c */
+int vlan_tunnel_init(struct net_bridge_vlan_group *vg);
+void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg);
+int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid);
+int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id);
+void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port);
+void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+			  struct net_bridge_vlan *vlan);
+int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+				  struct net_bridge_port *p,
+				  struct net_bridge_vlan_group *vg);
+int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
+				 struct net_bridge_vlan *vlan);
+#else
+static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
+{
+	return 0;
+}
+
+static inline int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port,
+					      u16 vid)
+{
+	return 0;
+}
+
+static inline int nbp_vlan_tunnel_info_add(struct net_bridge_port *port,
+					   u16 vid, u32 tun_id)
+{
+	return 0;
+}
+
+static inline void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
+{
+}
+
+static inline void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+					struct net_bridge_vlan *vlan)
+{
+}
+
+static inline int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+						struct net_bridge_port *p,
+						struct net_bridge_vlan_group *vg)
+{
+	return 0;
+}
+#endif
+
+#endif
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 71fd1a4e63cc..8f56c2d1f1a7 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -602,7 +602,7 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
 	br->ageing_time = t;
 	spin_unlock_bh(&br->lock);
 
-	mod_timer(&br->gc_timer, jiffies);
+	mod_delayed_work(system_long_wq, &br->gc_work, 0);
 
 	return 0;
 }
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 6c1e21411125..08341d2aa9c9 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -57,7 +57,7 @@ void br_stp_enable_bridge(struct net_bridge *br)
 	spin_lock_bh(&br->lock);
 	if (br->stp_enabled == BR_KERNEL_STP)
 		mod_timer(&br->hello_timer, jiffies + br->hello_time);
-	mod_timer(&br->gc_timer, jiffies + HZ/10);
+	mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10);
 
 	br_config_bpdu_generation(br);
 
@@ -88,7 +88,7 @@ void br_stp_disable_bridge(struct net_bridge *br)
 	del_timer_sync(&br->hello_timer);
 	del_timer_sync(&br->topology_change_timer);
 	del_timer_sync(&br->tcn_timer);
-	del_timer_sync(&br->gc_timer);
+	cancel_delayed_work_sync(&br->gc_work);
 }
 
 /* called under bridge lock */
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index 7ddb38e0a06e..c98b3e5c140a 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -153,8 +153,6 @@ void br_stp_timer_init(struct net_bridge *br)
 	setup_timer(&br->topology_change_timer,
 		      br_topology_change_timer_expired,
 		      (unsigned long) br);
-
-	setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
 }
 
 void br_stp_port_timer_init(struct net_bridge_port *p)
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index a18148213b08..0f4034934d56 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -263,7 +263,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr,
 			     char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer));
+	return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer));
 }
 static DEVICE_ATTR_RO(gc_timer);
 
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 8bd569695e76..05e8946ccc03 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -188,6 +188,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
 		   store_multicast_router);
 
 BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE);
+BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST);
 #endif
 
 static const struct brport_attribute *brport_attrs[] = {
@@ -214,6 +215,7 @@ static const struct brport_attribute *brport_attrs[] = {
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	&brport_attr_multicast_router,
 	&brport_attr_multicast_fast_leave,
+	&brport_attr_multicast_to_unicast,
 #endif
 	&brport_attr_proxyarp,
 	&brport_attr_proxyarp_wifi,
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index b6de4f457161..62e68c0dc687 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -5,6 +5,7 @@
 #include <net/switchdev.h>
 
 #include "br_private.h"
+#include "br_private_tunnel.h"
 
 static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
 			      const void *ptr)
@@ -310,6 +311,7 @@ static int __vlan_del(struct net_bridge_vlan *v)
 	}
 
 	if (masterv != v) {
+		vlan_tunnel_info_del(vg, v);
 		rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
 				       br_vlan_rht_params);
 		__vlan_del_list(v);
@@ -325,6 +327,7 @@ static void __vlan_group_free(struct net_bridge_vlan_group *vg)
 {
 	WARN_ON(!list_empty(&vg->vlan_list));
 	rhashtable_destroy(&vg->vlan_hash);
+	vlan_tunnel_deinit(vg);
 	kfree(vg);
 }
 
@@ -338,6 +341,7 @@ static void __vlan_flush(struct net_bridge_vlan_group *vg)
 }
 
 struct sk_buff *br_handle_vlan(struct net_bridge *br,
+			       const struct net_bridge_port *p,
 			       struct net_bridge_vlan_group *vg,
 			       struct sk_buff *skb)
 {
@@ -378,6 +382,12 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
 
 	if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
 		skb->vlan_tci = 0;
+
+	if (p && (p->flags & BR_VLAN_TUNNEL) &&
+	    br_handle_egress_vlan_tunnel(skb, v)) {
+		kfree_skb(skb);
+		return NULL;
+	}
 out:
 	return skb;
 }
@@ -613,6 +623,8 @@ int br_vlan_delete(struct net_bridge *br, u16 vid)
 	br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid);
 	br_fdb_delete_by_port(br, NULL, vid, 0);
 
+	vlan_tunnel_info_del(vg, v);
+
 	return __vlan_del(v);
 }
 
@@ -918,6 +930,9 @@ int br_vlan_init(struct net_bridge *br)
 	ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
 	if (ret)
 		goto err_rhtbl;
+	ret = vlan_tunnel_init(vg);
+	if (ret)
+		goto err_tunnel_init;
 	INIT_LIST_HEAD(&vg->vlan_list);
 	br->vlan_proto = htons(ETH_P_8021Q);
 	br->default_pvid = 1;
@@ -932,6 +947,8 @@ out:
 	return ret;
 
 err_vlan_add:
+	vlan_tunnel_deinit(vg);
+err_tunnel_init:
 	rhashtable_destroy(&vg->vlan_hash);
 err_rhtbl:
 	kfree(vg);
@@ -961,6 +978,9 @@ int nbp_vlan_init(struct net_bridge_port *p)
 	ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
 	if (ret)
 		goto err_rhtbl;
+	ret = vlan_tunnel_init(vg);
+	if (ret)
+		goto err_tunnel_init;
 	INIT_LIST_HEAD(&vg->vlan_list);
 	rcu_assign_pointer(p->vlgrp, vg);
 	if (p->br->default_pvid) {
@@ -976,8 +996,10 @@ out:
 err_vlan_add:
 	RCU_INIT_POINTER(p->vlgrp, NULL);
 	synchronize_rcu();
-	rhashtable_destroy(&vg->vlan_hash);
+	vlan_tunnel_deinit(vg);
 err_vlan_enabled:
+err_tunnel_init:
+	rhashtable_destroy(&vg->vlan_hash);
 err_rhtbl:
 	kfree(vg);
 
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
new file mode 100644
index 000000000000..6d2c4eed2dc8
--- /dev/null
+++ b/net/bridge/br_vlan_tunnel.c
@@ -0,0 +1,205 @@
+/*
+ *	Bridge per vlan tunnel port dst_metadata handling code
+ *
+ *	Authors:
+ *	Roopa Prabhu		<roopa@cumulusnetworks.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/switchdev.h>
+#include <net/dst_metadata.h>
+
+#include "br_private.h"
+#include "br_private_tunnel.h"
+
+static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg,
+				    const void *ptr)
+{
+	const struct net_bridge_vlan *vle = ptr;
+	__be64 tunid = *(__be64 *)arg->key;
+
+	return vle->tinfo.tunnel_id != tunid;
+}
+
+static const struct rhashtable_params br_vlan_tunnel_rht_params = {
+	.head_offset = offsetof(struct net_bridge_vlan, tnode),
+	.key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id),
+	.key_len = sizeof(__be64),
+	.nelem_hint = 3,
+	.locks_mul = 1,
+	.obj_cmpfn = br_vlan_tunid_cmp,
+	.automatic_shrinking = true,
+};
+
+static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl,
+						     u64 tunnel_id)
+{
+	return rhashtable_lookup_fast(tbl, &tunnel_id,
+				      br_vlan_tunnel_rht_params);
+}
+
+void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+			  struct net_bridge_vlan *vlan)
+{
+	if (!vlan->tinfo.tunnel_dst)
+		return;
+	rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode,
+			       br_vlan_tunnel_rht_params);
+	vlan->tinfo.tunnel_id = 0;
+	dst_release(&vlan->tinfo.tunnel_dst->dst);
+	vlan->tinfo.tunnel_dst = NULL;
+}
+
+static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
+				  struct net_bridge_vlan *vlan, u32 tun_id)
+{
+	struct metadata_dst *metadata = NULL;
+	__be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id));
+	int err;
+
+	if (vlan->tinfo.tunnel_dst)
+		return -EEXIST;
+
+	metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY,
+				    key, 0);
+	if (!metadata)
+		return -EINVAL;
+
+	metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE;
+	vlan->tinfo.tunnel_dst = metadata;
+	vlan->tinfo.tunnel_id = key;
+
+	err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode,
+					    br_vlan_tunnel_rht_params);
+	if (err)
+		goto out;
+
+	return 0;
+out:
+	dst_release(&vlan->tinfo.tunnel_dst->dst);
+	vlan->tinfo.tunnel_dst = NULL;
+	vlan->tinfo.tunnel_id = 0;
+
+	return err;
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id)
+{
+	struct net_bridge_vlan_group *vg;
+	struct net_bridge_vlan *vlan;
+
+	ASSERT_RTNL();
+
+	vg = nbp_vlan_group(port);
+	vlan = br_vlan_find(vg, vid);
+	if (!vlan)
+		return -EINVAL;
+
+	return __vlan_tunnel_info_add(vg, vlan, tun_id);
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid)
+{
+	struct net_bridge_vlan_group *vg;
+	struct net_bridge_vlan *v;
+
+	ASSERT_RTNL();
+
+	vg = nbp_vlan_group(port);
+	v = br_vlan_find(vg, vid);
+	if (!v)
+		return -ENOENT;
+
+	vlan_tunnel_info_del(vg, v);
+
+	return 0;
+}
+
+static void __vlan_tunnel_info_flush(struct net_bridge_vlan_group *vg)
+{
+	struct net_bridge_vlan *vlan, *tmp;
+
+	list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist)
+		vlan_tunnel_info_del(vg, vlan);
+}
+
+void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
+{
+	struct net_bridge_vlan_group *vg;
+
+	ASSERT_RTNL();
+
+	vg = nbp_vlan_group(port);
+	__vlan_tunnel_info_flush(vg);
+}
+
+int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
+{
+	return rhashtable_init(&vg->tunnel_hash, &br_vlan_tunnel_rht_params);
+}
+
+void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg)
+{
+	rhashtable_destroy(&vg->tunnel_hash);
+}
+
+int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+				  struct net_bridge_port *p,
+				  struct net_bridge_vlan_group *vg)
+{
+	struct ip_tunnel_info *tinfo = skb_tunnel_info(skb);
+	struct net_bridge_vlan *vlan;
+
+	if (!vg || !tinfo)
+		return 0;
+
+	/* if already tagged, ignore */
+	if (skb_vlan_tagged(skb))
+		return 0;
+
+	/* lookup vid, given tunnel id */
+	vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id);
+	if (!vlan)
+		return 0;
+
+	skb_dst_drop(skb);
+
+	__vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid);
+
+	return 0;
+}
+
+int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
+				 struct net_bridge_vlan *vlan)
+{
+	int err;
+
+	if (!vlan || !vlan->tinfo.tunnel_id)
+		return 0;
+
+	if (unlikely(!skb_vlan_tag_present(skb)))
+		return 0;
+
+	skb_dst_drop(skb);
+	err = skb_vlan_pop(skb);
+	if (err)
+		return err;
+
+	skb_dst_set(skb, dst_clone(&vlan->tinfo.tunnel_dst->dst));
+
+	return 0;
+}
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 9024283d2bca..279527f8b1fe 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -187,7 +187,7 @@ static int ebt_among_mt_check(const struct xt_mtchk_param *par)
 	expected_length += ebt_mac_wormhash_size(wh_src);
 
 	if (em->match_size != EBT_ALIGN(expected_length)) {
-		pr_info("wrong size: %d against expected %d, rounded to %Zd\n",
+		pr_info("wrong size: %d against expected %d, rounded to %zd\n",
 			em->match_size, expected_length,
 			EBT_ALIGN(expected_length));
 		return -EINVAL;
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 517e78befcb2..61a9f1be1263 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -105,6 +105,7 @@ static struct xt_match ebt_limit_mt_reg __read_mostly = {
 	.match		= ebt_limit_mt,
 	.checkentry	= ebt_limit_mt_check,
 	.matchsize	= sizeof(struct ebt_limit_info),
+	.usersize	= offsetof(struct ebt_limit_info, prev),
 #ifdef CONFIG_COMPAT
 	.compatsize	= sizeof(struct ebt_compat_limit_info),
 #endif
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index e88bd4827ac1..98b9c8e8615e 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -78,7 +78,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 	unsigned int bitmask;
 
 	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
+	if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
 		return;
 
 	spin_lock_bh(&ebt_log_lock);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 537e3d506fc2..79b69917f521 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1346,56 +1346,72 @@ static int update_counters(struct net *net, const void __user *user,
 				hlp.num_counters, user, len);
 }
 
-static inline int ebt_make_matchname(const struct ebt_entry_match *m,
-				     const char *base, char __user *ubase)
+static inline int ebt_obj_to_user(char __user *um, const char *_name,
+				  const char *data, int entrysize,
+				  int usersize, int datasize)
 {
-	char __user *hlp = ubase + ((char *)m - base);
-	char name[EBT_FUNCTION_MAXNAMELEN] = {};
+	char name[EBT_FUNCTION_MAXNAMELEN] = {0};
 
 	/* ebtables expects 32 bytes long names but xt_match names are 29 bytes
 	 * long. Copy 29 bytes and fill remaining bytes with zeroes.
 	 */
-	strlcpy(name, m->u.match->name, sizeof(name));
-	if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
+	strlcpy(name, _name, sizeof(name));
+	if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) ||
+	    put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) ||
+	    xt_data_to_user(um + entrysize, data, usersize, datasize))
 		return -EFAULT;
+
 	return 0;
 }
 
-static inline int ebt_make_watchername(const struct ebt_entry_watcher *w,
-				       const char *base, char __user *ubase)
+static inline int ebt_match_to_user(const struct ebt_entry_match *m,
+				    const char *base, char __user *ubase)
 {
-	char __user *hlp = ubase + ((char *)w - base);
-	char name[EBT_FUNCTION_MAXNAMELEN] = {};
+	return ebt_obj_to_user(ubase + ((char *)m - base),
+			       m->u.match->name, m->data, sizeof(*m),
+			       m->u.match->usersize, m->match_size);
+}
 
-	strlcpy(name, w->u.watcher->name, sizeof(name));
-	if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
-		return -EFAULT;
-	return 0;
+static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w,
+				      const char *base, char __user *ubase)
+{
+	return ebt_obj_to_user(ubase + ((char *)w - base),
+			       w->u.watcher->name, w->data, sizeof(*w),
+			       w->u.watcher->usersize, w->watcher_size);
 }
 
-static inline int ebt_make_names(struct ebt_entry *e, const char *base,
-				 char __user *ubase)
+static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
+				    char __user *ubase)
 {
 	int ret;
 	char __user *hlp;
 	const struct ebt_entry_target *t;
-	char name[EBT_FUNCTION_MAXNAMELEN] = {};
 
-	if (e->bitmask == 0)
+	if (e->bitmask == 0) {
+		/* special case !EBT_ENTRY_OR_ENTRIES */
+		if (copy_to_user(ubase + ((char *)e - base), e,
+				 sizeof(struct ebt_entries)))
+			return -EFAULT;
 		return 0;
+	}
+
+	if (copy_to_user(ubase + ((char *)e - base), e, sizeof(*e)))
+		return -EFAULT;
 
 	hlp = ubase + (((char *)e + e->target_offset) - base);
 	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
 
-	ret = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase);
+	ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase);
 	if (ret != 0)
 		return ret;
-	ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase);
+	ret = EBT_WATCHER_ITERATE(e, ebt_watcher_to_user, base, ubase);
 	if (ret != 0)
 		return ret;
-	strlcpy(name, t->u.target->name, sizeof(name));
-	if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
-		return -EFAULT;
+	ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t),
+			      t->u.target->usersize, t->target_size);
+	if (ret != 0)
+		return ret;
+
 	return 0;
 }
 
@@ -1475,13 +1491,9 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user,
 	if (ret)
 		return ret;
 
-	if (copy_to_user(tmp.entries, entries, entries_size)) {
-		BUGPRINT("Couldn't copy entries to userspace\n");
-		return -EFAULT;
-	}
 	/* set the match/watcher/target names right */
 	return EBT_ENTRY_ITERATE(entries, entries_size,
-	   ebt_make_names, entries, tmp.entries);
+	   ebt_entry_to_user, entries, tmp.entries);
 }
 
 static int do_ebt_set_ctl(struct sock *sk,
@@ -1630,8 +1642,10 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr,
 	if (match->compat_to_user) {
 		if (match->compat_to_user(cm->data, m->data))
 			return -EFAULT;
-	} else if (copy_to_user(cm->data, m->data, msize))
+	} else {
+		if (xt_data_to_user(cm->data, m->data, match->usersize, msize))
 			return -EFAULT;
+	}
 
 	*size -= ebt_compat_entry_padsize() + off;
 	*dstptr = cm->data;
@@ -1657,8 +1671,10 @@ static int compat_target_to_user(struct ebt_entry_target *t,
 	if (target->compat_to_user) {
 		if (target->compat_to_user(cm->data, t->data))
 			return -EFAULT;
-	} else if (copy_to_user(cm->data, t->data, tsize))
-		return -EFAULT;
+	} else {
+		if (xt_data_to_user(cm->data, t->data, target->usersize, tsize))
+			return -EFAULT;
+	}
 
 	*size -= ebt_compat_entry_padsize() + off;
 	*dstptr = cm->data;
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 3408ed51b611..1816fc9f1ee7 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -44,7 +44,6 @@ enum caif_states {
 
 struct chnl_net {
 	struct cflayer chnl;
-	struct net_device_stats stats;
 	struct caif_connect_request conn_req;
 	struct list_head list_field;
 	struct net_device *netdev;
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index 50f040fdb2a9..b9233b990399 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
 	dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
 	     __func__, lock_name, type, cookie, tag, desc, flags);
 	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
-			     CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			     lock_op_page, lock_op_buf_size, NULL, NULL);
+			     CEPH_OSD_FLAG_WRITE, lock_op_page,
+			     lock_op_buf_size, NULL, NULL);
 
 	dout("%s: status %d\n", __func__, ret);
 	__free_page(lock_op_page);
@@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc,
 
 	dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
 	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
-			     CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			     unlock_op_page, unlock_op_buf_size, NULL, NULL);
+			     CEPH_OSD_FLAG_WRITE, unlock_op_page,
+			     unlock_op_buf_size, NULL, NULL);
 
 	dout("%s: status %d\n", __func__, ret);
 	__free_page(unlock_op_page);
@@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc,
 	dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
 	     cookie, ENTITY_NAME(*locker));
 	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
-			     CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			     break_op_page, break_op_buf_size, NULL, NULL);
+			     CEPH_OSD_FLAG_WRITE, break_op_page,
+			     break_op_buf_size, NULL, NULL);
 
 	dout("%s: status %d\n", __func__, ret);
 	__free_page(break_op_page);
@@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
 	int get_info_op_buf_size;
 	int name_len = strlen(lock_name);
 	struct page *get_info_op_page, *reply_page;
-	size_t reply_len;
+	size_t reply_len = PAGE_SIZE;
 	void *p, *end;
 	int ret;
 
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 80d7c3a97cb8..5bf94c04f645 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
 
 void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
 {
-	kfree(b->h.perm);
 	kfree(b->h.items);
 	kfree(b);
 }
@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
 {
 	kfree(b->item_weights);
 	kfree(b->sum_weights);
-	kfree(b->h.perm);
 	kfree(b->h.items);
 	kfree(b);
 }
 
 void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
 {
-	kfree(b->h.perm);
 	kfree(b->h.items);
 	kfree(b->node_weights);
 	kfree(b);
@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 {
 	kfree(b->straws);
 	kfree(b->item_weights);
-	kfree(b->h.perm);
 	kfree(b->h.items);
 	kfree(b);
 }
@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
 {
 	kfree(b->item_weights);
-	kfree(b->h.perm);
 	kfree(b->h.items);
 	kfree(b);
 }
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 130ab407c5ec..b5cd8c21bfdf 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
 	return -1;
 }
 
-
 /*
  * bucket choose methods
  *
@@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
  * Since this is expensive, we optimize for the r=0 case, which
  * captures the vast majority of calls.
  */
-static int bucket_perm_choose(struct crush_bucket *bucket,
+static int bucket_perm_choose(const struct crush_bucket *bucket,
+			      struct crush_work_bucket *work,
 			      int x, int r)
 {
 	unsigned int pr = r % bucket->size;
 	unsigned int i, s;
 
 	/* start a new permutation if @x has changed */
-	if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
+	if (work->perm_x != (__u32)x || work->perm_n == 0) {
 		dprintk("bucket %d new x=%d\n", bucket->id, x);
-		bucket->perm_x = x;
+		work->perm_x = x;
 
 		/* optimize common r=0 case */
 		if (pr == 0) {
 			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
 				bucket->size;
-			bucket->perm[0] = s;
-			bucket->perm_n = 0xffff;   /* magic value, see below */
+			work->perm[0] = s;
+			work->perm_n = 0xffff;   /* magic value, see below */
 			goto out;
 		}
 
 		for (i = 0; i < bucket->size; i++)
-			bucket->perm[i] = i;
-		bucket->perm_n = 0;
-	} else if (bucket->perm_n == 0xffff) {
+			work->perm[i] = i;
+		work->perm_n = 0;
+	} else if (work->perm_n == 0xffff) {
 		/* clean up after the r=0 case above */
 		for (i = 1; i < bucket->size; i++)
-			bucket->perm[i] = i;
-		bucket->perm[bucket->perm[0]] = 0;
-		bucket->perm_n = 1;
+			work->perm[i] = i;
+		work->perm[work->perm[0]] = 0;
+		work->perm_n = 1;
 	}
 
 	/* calculate permutation up to pr */
-	for (i = 0; i < bucket->perm_n; i++)
-		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
-	while (bucket->perm_n <= pr) {
-		unsigned int p = bucket->perm_n;
+	for (i = 0; i < work->perm_n; i++)
+		dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
+	while (work->perm_n <= pr) {
+		unsigned int p = work->perm_n;
 		/* no point in swapping the final entry */
 		if (p < bucket->size - 1) {
 			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
 				(bucket->size - p);
 			if (i) {
-				unsigned int t = bucket->perm[p + i];
-				bucket->perm[p + i] = bucket->perm[p];
-				bucket->perm[p] = t;
+				unsigned int t = work->perm[p + i];
+				work->perm[p + i] = work->perm[p];
+				work->perm[p] = t;
 			}
 			dprintk(" perm_choose swap %d with %d\n", p, p+i);
 		}
-		bucket->perm_n++;
+		work->perm_n++;
 	}
 	for (i = 0; i < bucket->size; i++)
-		dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
+		dprintk(" perm_choose  %d: %d\n", i, work->perm[i]);
 
-	s = bucket->perm[pr];
+	s = work->perm[pr];
 out:
 	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
 		bucket->size, x, r, pr, s);
@@ -132,14 +132,14 @@ out:
 }
 
 /* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
-				 int x, int r)
+static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
+				 struct crush_work_bucket *work, int x, int r)
 {
-	return bucket_perm_choose(&bucket->h, x, r);
+	return bucket_perm_choose(&bucket->h, work, x, r);
 }
 
 /* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
+static int bucket_list_choose(const struct crush_bucket_list *bucket,
 			      int x, int r)
 {
 	int i;
@@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
 		w *= bucket->sum_weights[i];
 		w = w >> 16;
 		/*dprintk(" scaled %llx\n", w);*/
-		if (w < bucket->item_weights[i])
+		if (w < bucket->item_weights[i]) {
 			return bucket->h.items[i];
+		}
 	}
 
 	dprintk("bad list sums for bucket %d\n", bucket->h.id);
@@ -192,7 +193,7 @@ static int terminal(int x)
 	return x & 1;
 }
 
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
 			      int x, int r)
 {
 	int n;
@@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
 
 /* straw */
 
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
 			       int x, int r)
 {
 	__u32 i;
@@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin)
  *
  */
 
-static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
 				int x, int r)
 {
 	unsigned int i, high = 0;
@@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
 			high_draw = draw;
 		}
 	}
+
 	return bucket->h.items[high];
 }
 
 
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+static int crush_bucket_choose(const struct crush_bucket *in,
+			       struct crush_work_bucket *work,
+			       int x, int r)
 {
 	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
 	BUG_ON(in->size == 0);
 	switch (in->alg) {
 	case CRUSH_BUCKET_UNIFORM:
-		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
-					  x, r);
+		return bucket_uniform_choose(
+			(const struct crush_bucket_uniform *)in,
+			work, x, r);
 	case CRUSH_BUCKET_LIST:
-		return bucket_list_choose((struct crush_bucket_list *)in,
+		return bucket_list_choose((const struct crush_bucket_list *)in,
 					  x, r);
 	case CRUSH_BUCKET_TREE:
-		return bucket_tree_choose((struct crush_bucket_tree *)in,
+		return bucket_tree_choose((const struct crush_bucket_tree *)in,
 					  x, r);
 	case CRUSH_BUCKET_STRAW:
-		return bucket_straw_choose((struct crush_bucket_straw *)in,
-					   x, r);
+		return bucket_straw_choose(
+			(const struct crush_bucket_straw *)in,
+			x, r);
 	case CRUSH_BUCKET_STRAW2:
-		return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
-					    x, r);
+		return bucket_straw2_choose(
+			(const struct crush_bucket_straw2 *)in,
+			x, r);
 	default:
 		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
 		return in->items[0];
 	}
 }
 
-
 /*
  * true if device is marked "out" (failed, fully offloaded)
  * of the cluster
@@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map,
  * @parent_r: r value passed from the parent
  */
 static int crush_choose_firstn(const struct crush_map *map,
-			       struct crush_bucket *bucket,
+			       struct crush_work *work,
+			       const struct crush_bucket *bucket,
 			       const __u32 *weight, int weight_max,
 			       int x, int numrep, int type,
 			       int *out, int outpos,
@@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 	int rep;
 	unsigned int ftotal, flocal;
 	int retry_descent, retry_bucket, skip_rep;
-	struct crush_bucket *in = bucket;
+	const struct crush_bucket *in = bucket;
 	int r;
 	int i;
 	int item = 0;
@@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map,
 				if (local_fallback_retries > 0 &&
 				    flocal >= (in->size>>1) &&
 				    flocal > local_fallback_retries)
-					item = bucket_perm_choose(in, x, r);
+					item = bucket_perm_choose(
+						in, work->work[-1-in->id],
+						x, r);
 				else
-					item = crush_bucket_choose(in, x, r);
+					item = crush_bucket_choose(
+						in, work->work[-1-in->id],
+						x, r);
 				if (item >= map->max_devices) {
 					dprintk("   bad item %d\n", item);
 					skip_rep = 1;
@@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map,
 							sub_r = r >> (vary_r-1);
 						else
 							sub_r = 0;
-						if (crush_choose_firstn(map,
-							 map->buckets[-1-item],
-							 weight, weight_max,
-							 x, stable ? 1 : outpos+1, 0,
-							 out2, outpos, count,
-							 recurse_tries, 0,
-							 local_retries,
-							 local_fallback_retries,
-							 0,
-							 vary_r,
-							 stable,
-							 NULL,
-							 sub_r) <= outpos)
+						if (crush_choose_firstn(
+							    map,
+							    work,
+							    map->buckets[-1-item],
+							    weight, weight_max,
+							    x, stable ? 1 : outpos+1, 0,
+							    out2, outpos, count,
+							    recurse_tries, 0,
+							    local_retries,
+							    local_fallback_retries,
+							    0,
+							    vary_r,
+							    stable,
+							    NULL,
+							    sub_r) <= outpos)
 							/* didn't get leaf */
 							reject = 1;
 					} else {
@@ -539,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map,
 					}
 				}
 
-				if (!reject) {
+				if (!reject && !collide) {
 					/* out? */
 					if (itemtype == 0)
 						reject = is_out(map, weight,
 								weight_max,
 								item, x);
-					else
-						reject = 0;
 				}
 
 reject:
@@ -600,7 +611,8 @@ reject:
  *
  */
 static void crush_choose_indep(const struct crush_map *map,
-			       struct crush_bucket *bucket,
+			       struct crush_work *work,
+			       const struct crush_bucket *bucket,
 			       const __u32 *weight, int weight_max,
 			       int x, int left, int numrep, int type,
 			       int *out, int outpos,
@@ -610,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map,
 			       int *out2,
 			       int parent_r)
 {
-	struct crush_bucket *in = bucket;
+	const struct crush_bucket *in = bucket;
 	int endpos = outpos + left;
 	int rep;
 	unsigned int ftotal;
@@ -678,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map,
 					break;
 				}
 
-				item = crush_bucket_choose(in, x, r);
+				item = crush_bucket_choose(
+					in, work->work[-1-in->id],
+					x, r);
 				if (item >= map->max_devices) {
 					dprintk("   bad item %d\n", item);
 					out[rep] = CRUSH_ITEM_NONE;
@@ -724,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map,
 
 				if (recurse_to_leaf) {
 					if (item < 0) {
-						crush_choose_indep(map,
-						   map->buckets[-1-item],
-						   weight, weight_max,
-						   x, 1, numrep, 0,
-						   out2, rep,
-						   recurse_tries, 0,
-						   0, NULL, r);
+						crush_choose_indep(
+							map,
+							work,
+							map->buckets[-1-item],
+							weight, weight_max,
+							x, 1, numrep, 0,
+							out2, rep,
+							recurse_tries, 0,
+							0, NULL, r);
 						if (out2[rep] == CRUSH_ITEM_NONE) {
 							/* placed nothing; no leaf */
 							break;
@@ -781,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map,
 #endif
 }
 
+
+/*
+ * This takes a chunk of memory and sets it up to be a shiny new
+ * working area for a CRUSH placement computation. It must be called
+ * on any newly allocated memory before passing it in to
+ * crush_do_rule. It may be used repeatedly after that, so long as the
+ * map has not changed. If the map /has/ changed, you must make sure
+ * the working size is no smaller than what was allocated and re-run
+ * crush_init_workspace.
+ *
+ * If you do retain the working space between calls to crush, make it
+ * thread-local.
+ */
+void crush_init_workspace(const struct crush_map *map, void *v)
+{
+	struct crush_work *w = v;
+	__s32 b;
+
+	/*
+	 * We work by moving through the available space and setting
+	 * values and pointers as we go.
+	 *
+	 * It's a bit like Forth's use of the 'allot' word since we
+	 * set the pointer first and then reserve the space for it to
+	 * point to by incrementing the point.
+	 */
+	v += sizeof(struct crush_work *);
+	w->work = v;
+	v += map->max_buckets * sizeof(struct crush_work_bucket *);
+	for (b = 0; b < map->max_buckets; ++b) {
+		if (!map->buckets[b])
+			continue;
+
+		w->work[b] = v;
+		switch (map->buckets[b]->alg) {
+		default:
+			v += sizeof(struct crush_work_bucket);
+			break;
+		}
+		w->work[b]->perm_x = 0;
+		w->work[b]->perm_n = 0;
+		w->work[b]->perm = v;
+		v += map->buckets[b]->size * sizeof(__u32);
+	}
+	BUG_ON(v - (void *)w != map->working_size);
+}
+
 /**
  * crush_do_rule - calculate a mapping with the given input and rule
  * @map: the crush_map
@@ -790,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map,
  * @result_max: maximum result size
  * @weight: weight vector (for map leaves)
  * @weight_max: size of weight vector
- * @scratch: scratch vector for private use; must be >= 3 * result_max
+ * @cwin: pointer to at least crush_work_size() bytes of memory
  */
 int crush_do_rule(const struct crush_map *map,
 		  int ruleno, int x, int *result, int result_max,
 		  const __u32 *weight, int weight_max,
-		  int *scratch)
+		  void *cwin)
 {
 	int result_len;
-	int *a = scratch;
-	int *b = scratch + result_max;
-	int *c = scratch + result_max*2;
+	struct crush_work *cw = cwin;
+	int *a = cwin + map->working_size;
+	int *b = a + result_max;
+	int *c = b + result_max;
+	int *w = a;
+	int *o = b;
 	int recurse_to_leaf;
-	int *w;
 	int wsize = 0;
-	int *o;
 	int osize;
 	int *tmp;
-	struct crush_rule *rule;
+	const struct crush_rule *rule;
 	__u32 step;
 	int i, j;
 	int numrep;
@@ -835,12 +899,10 @@ int crush_do_rule(const struct crush_map *map,
 
 	rule = map->rules[ruleno];
 	result_len = 0;
-	w = a;
-	o = b;
 
 	for (step = 0; step < rule->len; step++) {
 		int firstn = 0;
-		struct crush_rule_step *curstep = &rule->steps[step];
+		const struct crush_rule_step *curstep = &rule->steps[step];
 
 		switch (curstep->op) {
 		case CRUSH_RULE_TAKE:
@@ -936,6 +998,7 @@ int crush_do_rule(const struct crush_map *map,
 						recurse_tries = choose_tries;
 					osize += crush_choose_firstn(
 						map,
+						cw,
 						map->buckets[bno],
 						weight, weight_max,
 						x, numrep,
@@ -956,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map,
 						    numrep : (result_max-osize));
 					crush_choose_indep(
 						map,
+						cw,
 						map->buckets[bno],
 						weight, weight_max,
 						x, out_size, numrep,
@@ -997,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map,
 			break;
 		}
 	}
+
 	return result_len;
 }
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 292e33bd916e..85747b7f91a9 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -3,6 +3,7 @@
 
 #include <linux/err.h>
 #include <linux/scatterlist.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <crypto/aes.h>
 #include <crypto/skcipher.h>
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 770c52701efa..bad3d4ae43f6 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -3425,7 +3425,7 @@ static void ceph_msg_release(struct kref *kref)
 struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
 {
 	dout("%s %p (was %d)\n", __func__, msg,
-	     atomic_read(&msg->kref.refcount));
+	     kref_read(&msg->kref));
 	kref_get(&msg->kref);
 	return msg;
 }
@@ -3434,7 +3434,7 @@ EXPORT_SYMBOL(ceph_msg_get);
 void ceph_msg_put(struct ceph_msg *msg)
 {
 	dout("%s %p (was %d)\n", __func__, msg,
-	     atomic_read(&msg->kref.refcount));
+	     kref_read(&msg->kref));
 	kref_put(&msg->kref, ceph_msg_release);
 }
 EXPORT_SYMBOL(ceph_msg_put);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 842f049abb86..b65bbf9f45eb 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -438,7 +438,7 @@ static void ceph_osdc_release_request(struct kref *kref)
 void ceph_osdc_get_request(struct ceph_osd_request *req)
 {
 	dout("%s %p (was %d)\n", __func__, req,
-	     atomic_read(&req->r_kref.refcount));
+	     kref_read(&req->r_kref));
 	kref_get(&req->r_kref);
 }
 EXPORT_SYMBOL(ceph_osdc_get_request);
@@ -447,7 +447,7 @@ void ceph_osdc_put_request(struct ceph_osd_request *req)
 {
 	if (req) {
 		dout("%s %p (was %d)\n", __func__, req,
-		     atomic_read(&req->r_kref.refcount));
+		     kref_read(&req->r_kref));
 		kref_put(&req->r_kref, ceph_osdc_release_request);
 	}
 }
@@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req)
 
 	kref_init(&req->r_kref);
 	init_completion(&req->r_completion);
-	init_completion(&req->r_done_completion);
 	RB_CLEAR_NODE(&req->r_node);
 	RB_CLEAR_NODE(&req->r_mc_node);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
@@ -487,11 +486,11 @@ static void request_reinit(struct ceph_osd_request *req)
 	struct ceph_msg *reply_msg = req->r_reply;
 
 	dout("%s req %p\n", __func__, req);
-	WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+	WARN_ON(kref_read(&req->r_kref) != 1);
 	request_release_checks(req);
 
-	WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
-	WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+	WARN_ON(kref_read(&request_msg->kref) != 1);
+	WARN_ON(kref_read(&reply_msg->kref) != 1);
 	target_destroy(&req->r_t);
 
 	request_init(req);
@@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
 	BUG_ON(length > previous);
 
 	op->extent.length = length;
-	op->indata_len -= previous - length;
+	if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
+		op->indata_len -= previous - length;
 }
 EXPORT_SYMBOL(osd_req_op_extent_update);
 
@@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 	bool need_send = false;
 	bool promoted = false;
 
-	WARN_ON(req->r_tid || req->r_got_reply);
+	WARN_ON(req->r_tid);
 	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
 
 again:
@@ -1704,17 +1704,10 @@ promote:
 
 static void account_request(struct ceph_osd_request *req)
 {
-	unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+	WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
+	WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
 
-	if (req->r_flags & CEPH_OSD_FLAG_READ) {
-		WARN_ON(req->r_flags & mask);
-		req->r_flags |= CEPH_OSD_FLAG_ACK;
-	} else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
-		WARN_ON(!(req->r_flags & mask));
-	else
-		WARN_ON(1);
-
-	WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+	req->r_flags |= CEPH_OSD_FLAG_ONDISK;
 	atomic_inc(&req->r_osdc->num_requests);
 }
 
@@ -1749,15 +1742,15 @@ static void finish_request(struct ceph_osd_request *req)
 
 static void __complete_request(struct ceph_osd_request *req)
 {
-	if (req->r_callback)
+	if (req->r_callback) {
+		dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
+		     req->r_tid, req->r_callback, req->r_result);
 		req->r_callback(req);
-	else
-		complete_all(&req->r_completion);
+	}
 }
 
 /*
- * Note that this is open-coded in handle_reply(), which has to deal
- * with ack vs commit, dup acks, etc.
+ * This is open-coded in handle_reply().
  */
 static void complete_request(struct ceph_osd_request *req, int err)
 {
@@ -1766,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err)
 	req->r_result = err;
 	finish_request(req);
 	__complete_request(req);
-	complete_all(&req->r_done_completion);
+	complete_all(&req->r_completion);
 	ceph_osdc_put_request(req);
 }
 
@@ -1792,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req)
 
 	cancel_map_check(req);
 	finish_request(req);
-	complete_all(&req->r_done_completion);
+	complete_all(&req->r_completion);
 	ceph_osdc_put_request(req);
 }
 
@@ -2169,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req)
 	mutex_lock(&lreq->lock);
 	dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
 	     lreq->linger_id, req->r_result);
-	WARN_ON(!__linger_registered(lreq));
 	linger_reg_commit_complete(lreq, req->r_result);
 	lreq->committed = true;
 
@@ -2785,31 +2777,8 @@ e_inval:
 }
 
 /*
- * We are done with @req if
- *   - @m is a safe reply, or
- *   - @m is an unsafe reply and we didn't want a safe one
- */
-static bool done_request(const struct ceph_osd_request *req,
-			 const struct MOSDOpReply *m)
-{
-	return (m->result < 0 ||
-		(m->flags & CEPH_OSD_FLAG_ONDISK) ||
-		!(req->r_flags & CEPH_OSD_FLAG_ONDISK));
-}
-
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- *
- * ->r_unsafe_callback is set?	yes			no
- *
- * first reply is OK (needed	r_cb/r_completion,	r_cb/r_completion,
- * any or needed/got safe)	r_done_completion	r_done_completion
- *
- * first reply is unsafe	r_unsafe_cb(true)	(nothing)
- *
- * when we get the safe reply	r_unsafe_cb(false),	r_cb/r_completion,
- *				r_done_completion	r_done_completion
+ * Handle MOSDOpReply.  Set ->r_result and call the callback if it is
+ * specified.
  */
 static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 {
@@ -2818,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 	struct MOSDOpReply m;
 	u64 tid = le64_to_cpu(msg->hdr.tid);
 	u32 data_len = 0;
-	bool already_acked;
 	int ret;
 	int i;
 
@@ -2897,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 		       le32_to_cpu(msg->hdr.data_len), req->r_tid);
 		goto fail_request;
 	}
-	dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
-	     req, req->r_tid, req->r_got_reply, m.result, data_len);
-
-	already_acked = req->r_got_reply;
-	if (!already_acked) {
-		req->r_result = m.result ?: data_len;
-		req->r_replay_version = m.replay_version; /* struct */
-		req->r_got_reply = true;
-	} else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
-		dout("req %p tid %llu dup ack\n", req, req->r_tid);
-		goto out_unlock_session;
-	}
-
-	if (done_request(req, &m)) {
-		finish_request(req);
-		if (req->r_linger) {
-			WARN_ON(req->r_unsafe_callback);
-			dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
-			__complete_request(req);
-		}
-	}
+	dout("%s req %p tid %llu result %d data_len %u\n", __func__,
+	     req, req->r_tid, m.result, data_len);
 
+	/*
+	 * Since we only ever request ONDISK, we should only ever get
+	 * one (type of) reply back.
+	 */
+	WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
+	req->r_result = m.result ?: data_len;
+	finish_request(req);
 	mutex_unlock(&osd->lock);
 	up_read(&osdc->lock);
 
-	if (done_request(req, &m)) {
-		if (already_acked && req->r_unsafe_callback) {
-			dout("req %p tid %llu safe-cb\n", req, req->r_tid);
-			req->r_unsafe_callback(req, false);
-		} else if (!req->r_linger) {
-			dout("req %p tid %llu cb\n", req, req->r_tid);
-			__complete_request(req);
-		}
-		complete_all(&req->r_done_completion);
-		ceph_osdc_put_request(req);
-	} else {
-		if (req->r_unsafe_callback) {
-			dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
-			req->r_unsafe_callback(req, true);
-		} else {
-			WARN_ON(1);
-		}
-	}
-
+	__complete_request(req);
+	complete_all(&req->r_completion);
+	ceph_osdc_put_request(req);
 	return;
 
 fail_request:
@@ -3540,7 +3480,7 @@ again:
 			up_read(&osdc->lock);
 			dout("%s waiting on req %p tid %llu last_tid %llu\n",
 			     __func__, req, req->r_tid, last_tid);
-			wait_for_completion(&req->r_done_completion);
+			wait_for_completion(&req->r_completion);
 			ceph_osdc_put_request(req);
 			goto again;
 		}
@@ -3599,7 +3539,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
 
 	ceph_oid_copy(&lreq->t.base_oid, oid);
 	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
-	lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+	lreq->t.flags = CEPH_OSD_FLAG_WRITE;
 	lreq->mtime = CURRENT_TIME;
 
 	lreq->reg_req = alloc_linger_request(lreq);
@@ -3657,7 +3597,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
 
 	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
 	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
-	req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+	req->r_flags = CEPH_OSD_FLAG_WRITE;
 	req->r_mtime = CURRENT_TIME;
 	osd_req_op_watch_init(req, 0, lreq->linger_id,
 			      CEPH_OSD_WATCH_OP_UNWATCH);
@@ -4022,7 +3962,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
  * Execute an OSD class method on an object.
  *
  * @flags: CEPH_OSD_FLAG_*
- * @resp_len: out param for reply length
+ * @resp_len: in/out param for reply length
  */
 int ceph_osdc_call(struct ceph_osd_client *osdc,
 		   struct ceph_object_id *oid,
@@ -4035,6 +3975,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 	struct ceph_osd_request *req;
 	int ret;
 
+	if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
+		return -E2BIG;
+
 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
 	if (!req)
 		return -ENOMEM;
@@ -4053,7 +3996,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 						  0, false, false);
 	if (resp_page)
 		osd_req_op_cls_response_data_pages(req, 0, &resp_page,
-						   PAGE_SIZE, 0, false, false);
+						   *resp_len, 0, false, false);
 
 	ceph_osdc_start_request(osdc, req, false);
 	ret = ceph_osdc_wait_request(osdc, req);
@@ -4220,8 +4163,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 	int page_align = off & ~PAGE_MASK;
 
 	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
-				    CEPH_OSD_OP_WRITE,
-				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+				    CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
 				    snapc, truncate_seq, truncate_size,
 				    true);
 	if (IS_ERR(req))
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index d2436880b305..6824c0ec8373 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -153,6 +153,32 @@ bad:
         return -EINVAL;
 }
 
+static void crush_finalize(struct crush_map *c)
+{
+	__s32 b;
+
+	/* Space for the array of pointers to per-bucket workspace */
+	c->working_size = sizeof(struct crush_work) +
+	    c->max_buckets * sizeof(struct crush_work_bucket *);
+
+	for (b = 0; b < c->max_buckets; b++) {
+		if (!c->buckets[b])
+			continue;
+
+		switch (c->buckets[b]->alg) {
+		default:
+			/*
+			 * The base case, permutation variables and
+			 * the pointer to the permutation array.
+			 */
+			c->working_size += sizeof(struct crush_work_bucket);
+			break;
+		}
+		/* Every bucket has a permutation array. */
+		c->working_size += c->buckets[b]->size * sizeof(__u32);
+	}
+}
+
 static struct crush_map *crush_decode(void *pbyval, void *end)
 {
 	struct crush_map *c;
@@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
 		if (b->items == NULL)
 			goto badmem;
-		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
-		if (b->perm == NULL)
-			goto badmem;
-		b->perm_n = 0;
 
 		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
 		for (j = 0; j < b->size; j++)
@@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	dout("crush decode tunable chooseleaf_stable = %d\n",
 	     c->chooseleaf_stable);
 
+	crush_finalize(c);
+
 done:
 	dout("crush_decode success\n");
 	return c;
@@ -719,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
 	map->pool_max = -1;
 	map->pg_temp = RB_ROOT;
 	map->primary_temp = RB_ROOT;
-	mutex_init(&map->crush_scratch_mutex);
+	mutex_init(&map->crush_workspace_mutex);
 
 	return map;
 }
@@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
 	kfree(map->osd_weight);
 	kfree(map->osd_addr);
 	kfree(map->osd_primary_affinity);
+	kfree(map->crush_workspace);
 	kfree(map);
 }
 
@@ -808,6 +833,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
 	return 0;
 }
 
+static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
+{
+	void *workspace;
+	size_t work_size;
+
+	if (IS_ERR(crush))
+		return PTR_ERR(crush);
+
+	work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
+	dout("%s work_size %zu bytes\n", __func__, work_size);
+	workspace = kmalloc(work_size, GFP_NOIO);
+	if (!workspace) {
+		crush_destroy(crush);
+		return -ENOMEM;
+	}
+	crush_init_workspace(crush, workspace);
+
+	if (map->crush)
+		crush_destroy(map->crush);
+	kfree(map->crush_workspace);
+	map->crush = crush;
+	map->crush_workspace = workspace;
+	return 0;
+}
+
 #define OSDMAP_WRAPPER_COMPAT_VER	7
 #define OSDMAP_CLIENT_DATA_COMPAT_VER	1
 
@@ -1214,13 +1264,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 
 	/* crush */
 	ceph_decode_32_safe(p, end, len, e_inval);
-	map->crush = crush_decode(*p, min(*p + len, end));
-	if (IS_ERR(map->crush)) {
-		err = PTR_ERR(map->crush);
-		map->crush = NULL;
+	err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
+	if (err)
 		goto bad;
-	}
-	*p += len;
 
 	/* ignore the rest */
 	*p = end;
@@ -1375,7 +1421,6 @@ e_inval:
 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 					     struct ceph_osdmap *map)
 {
-	struct crush_map *newcrush = NULL;
 	struct ceph_fsid fsid;
 	u32 epoch = 0;
 	struct ceph_timespec modified;
@@ -1414,12 +1459,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	/* new crush? */
 	ceph_decode_32_safe(p, end, len, e_inval);
 	if (len > 0) {
-		newcrush = crush_decode(*p, min(*p+len, end));
-		if (IS_ERR(newcrush)) {
-			err = PTR_ERR(newcrush);
-			newcrush = NULL;
+		err = osdmap_set_crush(map,
+				       crush_decode(*p, min(*p + len, end)));
+		if (err)
 			goto bad;
-		}
 		*p += len;
 	}
 
@@ -1439,12 +1482,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 
 	map->epoch++;
 	map->modified = modified;
-	if (newcrush) {
-		if (map->crush)
-			crush_destroy(map->crush);
-		map->crush = newcrush;
-		newcrush = NULL;
-	}
 
 	/* new_pools */
 	err = decode_new_pools(p, end, map);
@@ -1505,8 +1542,6 @@ bad:
 	print_hex_dump(KERN_DEBUG, "osdmap: ",
 		       DUMP_PREFIX_OFFSET, 16, 1,
 		       start, end - start, true);
-	if (newcrush)
-		crush_destroy(newcrush);
 	return ERR_PTR(err);
 }
 
@@ -1942,10 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 
 	BUG_ON(result_max > CEPH_PG_MAX_SIZE);
 
-	mutex_lock(&map->crush_scratch_mutex);
+	mutex_lock(&map->crush_workspace_mutex);
 	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
-			  weight, weight_max, map->crush_scratch_ary);
-	mutex_unlock(&map->crush_scratch_mutex);
+			  weight, weight_max, map->crush_workspace);
+	mutex_unlock(&map->crush_workspace_mutex);
 
 	return r;
 }
@@ -1978,8 +2013,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
 		return;
 	}
 
-	len = do_crush(osdmap, ruleno, pps, raw->osds,
-		       min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
+	if (pi->size > ARRAY_SIZE(raw->osds)) {
+		pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
+		       pi->id, pi->crush_ruleset, pi->type, pi->size,
+		       ARRAY_SIZE(raw->osds));
+		return;
+	}
+
+	len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
 		       osdmap->osd_weight, osdmap->max_osd);
 	if (len < 0) {
 		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
index 154683f5f14c..705414e78ae0 100644
--- a/net/ceph/snapshot.c
+++ b/net/ceph/snapshot.c
@@ -18,8 +18,6 @@
  * 02110-1301, USA.
  */
 
-#include <stddef.h>
-
 #include <linux/types.h>
 #include <linux/export.h>
 #include <linux/ceph/libceph.h>
diff --git a/net/compat.c b/net/compat.c
index 96c544b05b15..aba929e5250f 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -22,6 +22,7 @@
 #include <linux/filter.h>
 #include <linux/compat.h>
 #include <linux/security.h>
+#include <linux/audit.h>
 #include <linux/export.h>
 
 #include <net/scm.h>
@@ -90,11 +91,11 @@ int get_compat_msghdr(struct msghdr *kmsg,
 #define CMSG_COMPAT_ALIGN(len)	ALIGN((len), sizeof(s32))
 
 #define CMSG_COMPAT_DATA(cmsg)				\
-	((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))))
+	((void __user *)((char __user *)(cmsg) + sizeof(struct compat_cmsghdr)))
 #define CMSG_COMPAT_SPACE(len)				\
-	(CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len))
+	(sizeof(struct compat_cmsghdr) + CMSG_COMPAT_ALIGN(len))
 #define CMSG_COMPAT_LEN(len)				\
-	(CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len))
+	(sizeof(struct compat_cmsghdr) + (len))
 
 #define CMSG_COMPAT_FIRSTHDR(msg)			\
 	(((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ?	\
@@ -130,6 +131,9 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
 	__kernel_size_t kcmlen, tmp;
 	int err = -EFAULT;
 
+	BUILD_BUG_ON(sizeof(struct compat_cmsghdr) !=
+		     CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)));
+
 	kcmlen = 0;
 	kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf;
 	ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
@@ -141,8 +145,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
 		if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
 			return -EINVAL;
 
-		tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
-		       CMSG_ALIGN(sizeof(struct cmsghdr)));
+		tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
 		tmp = CMSG_ALIGN(tmp);
 		kcmlen += tmp;
 		ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
@@ -168,8 +171,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
 			goto Efault;
 		if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
 			goto Einval;
-		tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
-		       CMSG_ALIGN(sizeof(struct cmsghdr)));
+		tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
 		if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
 			goto Einval;
 		kcmsg->cmsg_len = tmp;
@@ -178,7 +180,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
 		    __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
 		    copy_from_user(CMSG_DATA(kcmsg),
 				   CMSG_COMPAT_DATA(ucmsg),
-				   (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
+				   (ucmlen - sizeof(*ucmsg))))
 			goto Efault;
 
 		/* Advance. */
@@ -781,14 +783,24 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
 
 COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
 {
-	int ret;
-	u32 a[6];
+	u32 a[AUDITSC_ARGS];
+	unsigned int len;
 	u32 a0, a1;
+	int ret;
 
 	if (call < SYS_SOCKET || call > SYS_SENDMMSG)
 		return -EINVAL;
-	if (copy_from_user(a, args, nas[call]))
+	len = nas[call];
+	if (len > sizeof(a))
+		return -EINVAL;
+
+	if (copy_from_user(a, args, len))
 		return -EFAULT;
+
+	ret = audit_socketcall_compat(len / sizeof(a[0]), a);
+	if (ret)
+		return ret;
+
 	a0 = a[0];
 	a1 = a[1];
 
diff --git a/net/core/Makefile b/net/core/Makefile
index f6761b6e3b29..79f9479e9658 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -28,3 +28,4 @@ obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
+obj-$(CONFIG_GRO_CELLS) += gro_cells.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 662bea587165..ea633342ab0d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -332,7 +332,9 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
 EXPORT_SYMBOL(__skb_free_datagram_locked);
 
 int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
-			unsigned int flags)
+			unsigned int flags,
+			void (*destructor)(struct sock *sk,
+					   struct sk_buff *skb))
 {
 	int err = 0;
 
@@ -342,6 +344,8 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
 		if (skb == skb_peek(&sk->sk_receive_queue)) {
 			__skb_unlink(skb, &sk->sk_receive_queue);
 			atomic_dec(&skb->users);
+			if (destructor)
+				destructor(sk, skb);
 			err = 0;
 		}
 		spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -375,7 +379,7 @@ EXPORT_SYMBOL(__sk_queue_drop_skb);
 
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
 {
-	int err = __sk_queue_drop_skb(sk, skb, flags);
+	int err = __sk_queue_drop_skb(sk, skb, flags, NULL);
 
 	kfree_skb(skb);
 	sk_mem_reclaim_partial(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index 7f218e095361..304f2deae5f9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1,5 +1,5 @@
 /*
- * 	NET3	Protocol independent device support routines.
+ *      NET3    Protocol independent device support routines.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -7,7 +7,7 @@
  *		2 of the License, or (at your option) any later version.
  *
  *	Derived from the non IP parts of dev.c 1.0.19
- * 		Authors:	Ross Biro
+ *              Authors:	Ross Biro
  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  *
@@ -21,9 +21,9 @@
  *
  *	Changes:
  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
- *              			to 2 if register_netdev gets called
- *              			before net_dev_init & also removed a
- *              			few lines of code in the process.
+ *                                      to 2 if register_netdev gets called
+ *                                      before net_dev_init & also removed a
+ *                                      few lines of code in the process.
  *		Alan Cox	:	device private ioctl copies fields back.
  *		Alan Cox	:	Transmit queue code does relevant
  *					stunts to keep the queue safe.
@@ -36,7 +36,7 @@
  *		Alan Cox	:	100 backlog just doesn't cut it when
  *					you start doing multicast video 8)
  *		Alan Cox	:	Rewrote net_bh and list manager.
- *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
+ *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  *		Alan Cox	:	Took out transmit every packet pass
  *					Saved a few bytes in the ioctl handler
  *		Alan Cox	:	Network driver sets packet type before
@@ -46,7 +46,7 @@
  *		Richard Kooijman:	Timestamp fixes.
  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  *		Alan Cox	:	Device lock protection.
- *		Alan Cox	: 	Fixed nasty side effect of device close
+ *              Alan Cox        :       Fixed nasty side effect of device close
  *					changes.
  *		Rudi Cilibrasi	:	Pass the right thing to
  *					set_mac_address()
@@ -67,8 +67,8 @@
  *	Paul Rusty Russell	:	SIOCSIFNAME
  *              Pekka Riikonen  :	Netdev boot-time settings code
  *              Andrew Morton   :       Make unregister_netdevice wait
- *              			indefinitely on dev->refcnt
- * 		J Hadi Salim	:	- Backlog queue sampling
+ *                                      indefinitely on dev->refcnt
+ *              J Hadi Salim    :       - Backlog queue sampling
  *				        - netif_rx() feedback
  */
 
@@ -192,7 +192,8 @@ static seqcount_t devnet_rename_seq;
 
 static inline void dev_base_seq_inc(struct net *net)
 {
-	while (++net->dev_base_seq == 0);
+	while (++net->dev_base_seq == 0)
+		;
 }
 
 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
@@ -274,8 +275,8 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  * according to dev->type
  */
-static const unsigned short netdev_lock_type[] =
-	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+static const unsigned short netdev_lock_type[] = {
+	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
@@ -291,22 +292,22 @@ static const unsigned short netdev_lock_type[] =
 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 
-static const char *const netdev_lock_name[] =
-	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
-	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
-	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
-	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
-	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
-	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
-	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
-	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
-	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
-	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
-	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
-	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
-	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
-	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
-	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
+static const char *const netdev_lock_name[] = {
+	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
+	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
+	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 
 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
@@ -352,10 +353,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 #endif
 
 /*******************************************************************************
+ *
+ *		Protocol management and registration routines
+ *
+ *******************************************************************************/
 
-		Protocol management and registration routines
-
-*******************************************************************************/
 
 /*
  *	Add a protocol ID to the list. Now that the input handler is
@@ -538,10 +540,10 @@ void dev_remove_offload(struct packet_offload *po)
 EXPORT_SYMBOL(dev_remove_offload);
 
 /******************************************************************************
-
-		      Device Boot-time Settings Routines
-
-*******************************************************************************/
+ *
+ *		      Device Boot-time Settings Routines
+ *
+ ******************************************************************************/
 
 /* Boot time configuration table */
 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
@@ -574,13 +576,13 @@ static int netdev_boot_setup_add(char *name, struct ifmap *map)
 }
 
 /**
- *	netdev_boot_setup_check	- check boot time settings
- *	@dev: the netdevice
+ * netdev_boot_setup_check	- check boot time settings
+ * @dev: the netdevice
  *
- * 	Check boot time settings for the device.
- *	The found settings are set for the device to be used
- *	later in the device probing.
- *	Returns 0 if no settings found, 1 if they are.
+ * Check boot time settings for the device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found, 1 if they are.
  */
 int netdev_boot_setup_check(struct net_device *dev)
 {
@@ -590,10 +592,10 @@ int netdev_boot_setup_check(struct net_device *dev)
 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 		    !strcmp(dev->name, s[i].name)) {
-			dev->irq 	= s[i].map.irq;
-			dev->base_addr 	= s[i].map.base_addr;
-			dev->mem_start 	= s[i].map.mem_start;
-			dev->mem_end 	= s[i].map.mem_end;
+			dev->irq = s[i].map.irq;
+			dev->base_addr = s[i].map.base_addr;
+			dev->mem_start = s[i].map.mem_start;
+			dev->mem_end = s[i].map.mem_end;
 			return 1;
 		}
 	}
@@ -603,14 +605,14 @@ EXPORT_SYMBOL(netdev_boot_setup_check);
 
 
 /**
- *	netdev_boot_base	- get address from boot time settings
- *	@prefix: prefix for network device
- *	@unit: id for network device
+ * netdev_boot_base	- get address from boot time settings
+ * @prefix: prefix for network device
+ * @unit: id for network device
  *
- * 	Check boot time settings for the base address of device.
- *	The found settings are set for the device to be used
- *	later in the device probing.
- *	Returns 0 if no settings found.
+ * Check boot time settings for the base address of device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found.
  */
 unsigned long netdev_boot_base(const char *prefix, int unit)
 {
@@ -663,10 +665,10 @@ int __init netdev_boot_setup(char *str)
 __setup("netdev=", netdev_boot_setup);
 
 /*******************************************************************************
-
-			    Device Interface Subroutines
-
-*******************************************************************************/
+ *
+ *			    Device Interface Subroutines
+ *
+ *******************************************************************************/
 
 /**
  *	dev_get_iflink	- get 'iflink' value of a interface
@@ -737,15 +739,15 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name)
 EXPORT_SYMBOL(__dev_get_by_name);
 
 /**
- *	dev_get_by_name_rcu	- find a device by its name
- *	@net: the applicable net namespace
- *	@name: name to find
+ * dev_get_by_name_rcu	- find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
  *
- *	Find an interface by name.
- *	If the name is found a pointer to the device is returned.
- * 	If the name is not found then %NULL is returned.
- *	The reference counters are not incremented so the caller must be
- *	careful with locks. The caller must hold RCU lock.
+ * Find an interface by name.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned.
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
  */
 
 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
@@ -1289,8 +1291,8 @@ void netdev_state_change(struct net_device *dev)
 EXPORT_SYMBOL(netdev_state_change);
 
 /**
- * 	netdev_notify_peers - notify network peers about existence of @dev
- * 	@dev: network device
+ * netdev_notify_peers - notify network peers about existence of @dev
+ * @dev: network device
  *
  * Generate traffic such that interested network peers are aware of
  * @dev, such as by generating a gratuitous ARP. This may be used when
@@ -1518,17 +1520,17 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 static int dev_boot_phase = 1;
 
 /**
- *	register_netdevice_notifier - register a network notifier block
- *	@nb: notifier
+ * register_netdevice_notifier - register a network notifier block
+ * @nb: notifier
  *
- *	Register a notifier to be called when network device events occur.
- *	The notifier passed is linked into the kernel structures and must
- *	not be reused until it has been unregistered. A negative errno code
- *	is returned on a failure.
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
  *
- * 	When registered all registration and up events are replayed
- *	to the new notifier to allow device to have a race free
- *	view of the network device list.
+ * When registered all registration and up events are replayed
+ * to the new notifier to allow device to have a race free
+ * view of the network device list.
  */
 
 int register_netdevice_notifier(struct notifier_block *nb)
@@ -1585,17 +1587,17 @@ outroll:
 EXPORT_SYMBOL(register_netdevice_notifier);
 
 /**
- *	unregister_netdevice_notifier - unregister a network notifier block
- *	@nb: notifier
+ * unregister_netdevice_notifier - unregister a network notifier block
+ * @nb: notifier
  *
- *	Unregister a notifier previously registered by
- *	register_netdevice_notifier(). The notifier is unlinked into the
- *	kernel structures and may then be reused. A negative errno code
- *	is returned on a failure.
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier(). The notifier is unlinked into the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
  *
- * 	After unregistering unregister and down device events are synthesized
- *	for all devices on the device list to the removed notifier to remove
- *	the need for special case cleanup code.
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
  */
 
 int unregister_netdevice_notifier(struct notifier_block *nb)
@@ -1695,24 +1697,19 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 
 static struct static_key netstamp_needed __read_mostly;
 #ifdef HAVE_JUMP_LABEL
-/* We are not allowed to call static_key_slow_dec() from irq context
- * If net_disable_timestamp() is called from irq context, defer the
- * static_key_slow_dec() calls.
- */
 static atomic_t netstamp_needed_deferred;
-#endif
-
-void net_enable_timestamp(void)
+static void netstamp_clear(struct work_struct *work)
 {
-#ifdef HAVE_JUMP_LABEL
 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 
-	if (deferred) {
-		while (--deferred)
-			static_key_slow_dec(&netstamp_needed);
-		return;
-	}
+	while (deferred--)
+		static_key_slow_dec(&netstamp_needed);
+}
+static DECLARE_WORK(netstamp_work, netstamp_clear);
 #endif
+
+void net_enable_timestamp(void)
+{
 	static_key_slow_inc(&netstamp_needed);
 }
 EXPORT_SYMBOL(net_enable_timestamp);
@@ -1720,12 +1717,12 @@ EXPORT_SYMBOL(net_enable_timestamp);
 void net_disable_timestamp(void)
 {
 #ifdef HAVE_JUMP_LABEL
-	if (in_interrupt()) {
-		atomic_inc(&netstamp_needed_deferred);
-		return;
-	}
-#endif
+	/* net_disable_timestamp() can be called from non process context */
+	atomic_inc(&netstamp_needed_deferred);
+	schedule_work(&netstamp_work);
+#else
 	static_key_slow_dec(&netstamp_needed);
+#endif
 }
 EXPORT_SYMBOL(net_disable_timestamp);
 
@@ -2408,28 +2405,6 @@ void netif_schedule_queue(struct netdev_queue *txq)
 }
 EXPORT_SYMBOL(netif_schedule_queue);
 
-/**
- *	netif_wake_subqueue - allow sending packets on subqueue
- *	@dev: network device
- *	@queue_index: sub queue index
- *
- * Resume individual transmit queue of a device with multiple transmit queues.
- */
-void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
-{
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
-
-	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
-		struct Qdisc *q;
-
-		rcu_read_lock();
-		q = rcu_dereference(txq->qdisc);
-		__netif_schedule(q);
-		rcu_read_unlock();
-	}
-}
-EXPORT_SYMBOL(netif_wake_subqueue);
-
 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 {
 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
@@ -2523,6 +2498,7 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
 
 	if (dev->num_tc) {
 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
 		qoffset = dev->tc_to_txq[tc].offset;
 		qcount = dev->tc_to_txq[tc].count;
 	}
@@ -2659,9 +2635,10 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 {
 	if (tx_path)
-		return skb->ip_summed != CHECKSUM_PARTIAL;
-	else
-		return skb->ip_summed == CHECKSUM_NONE;
+		return skb->ip_summed != CHECKSUM_PARTIAL &&
+		       skb->ip_summed != CHECKSUM_NONE;
+
+	return skb->ip_summed == CHECKSUM_NONE;
 }
 
 /**
@@ -2680,11 +2657,12 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 				  netdev_features_t features, bool tx_path)
 {
+	struct sk_buff *segs;
+
 	if (unlikely(skb_needs_check(skb, tx_path))) {
 		int err;
 
-		skb_warn_bad_offload(skb);
-
+		/* We're going to init ->check field in TCP or UDP header */
 		err = skb_cow_head(skb, 0);
 		if (err < 0)
 			return ERR_PTR(err);
@@ -2712,7 +2690,12 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 	skb_reset_mac_header(skb);
 	skb_reset_mac_len(skb);
 
-	return skb_mac_gso_segment(skb, features);
+	segs = skb_mac_gso_segment(skb, features);
+
+	if (unlikely(skb_needs_check(skb, tx_path)))
+		skb_warn_bad_offload(skb);
+
+	return segs;
 }
 EXPORT_SYMBOL(__skb_gso_segment);
 
@@ -2737,9 +2720,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_HIGHMEM
 	int i;
+
 	if (!(dev->features & NETIF_F_HIGHDMA)) {
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
 			if (PageHighMem(skb_frag_page(frag)))
 				return 1;
 		}
@@ -2753,6 +2738,7 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
+
 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
 				return 1;
 		}
@@ -3153,9 +3139,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 	if (!cl)
 		return skb;
 
-	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
-	 * earlier by the caller.
-	 */
+	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 	qdisc_bstats_cpu_update(cl->q, skb);
 
 	switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3230,6 +3214,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 	if (queue_index < 0 || skb->ooo_okay ||
 	    queue_index >= dev->real_num_tx_queues) {
 		int new_index = get_xps_queue(dev, skb);
+
 		if (new_index < 0)
 			new_index = skb_tx_hash(dev, skb);
 
@@ -3259,6 +3244,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 
 	if (dev->real_num_tx_queues != 1) {
 		const struct net_device_ops *ops = dev->netdev_ops;
+
 		if (ops->ndo_select_queue)
 			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
 							    __netdev_pick_tx);
@@ -3320,7 +3306,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 
 	qdisc_pkt_len_init(skb);
 #ifdef CONFIG_NET_CLS_ACT
-	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
+	skb->tc_at_ingress = 0;
 # ifdef CONFIG_NET_EGRESS
 	if (static_key_false(&egress_needed)) {
 		skb = sch_handle_egress(skb, &rc, dev);
@@ -3347,16 +3333,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 	}
 
 	/* The device has no queue. Common case for software devices:
-	   loopback, all the sorts of tunnels...
+	 * loopback, all the sorts of tunnels...
 
-	   Really, it is unlikely that netif_tx_lock protection is necessary
-	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
-	   counters.)
-	   However, it is possible, that they rely on protection
-	   made by us here.
+	 * Really, it is unlikely that netif_tx_lock protection is necessary
+	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
+	 * counters.)
+	 * However, it is possible, that they rely on protection
+	 * made by us here.
 
-	   Check this and shot the lock. It is not prone from deadlocks.
-	   Either shot noqueue qdisc, it is even simpler 8)
+	 * Check this and shot the lock. It is not prone from deadlocks.
+	 *Either shot noqueue qdisc, it is even simpler 8)
 	 */
 	if (dev->flags & IFF_UP) {
 		int cpu = smp_processor_id(); /* ok because BHs are off */
@@ -3418,16 +3404,20 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
 EXPORT_SYMBOL(dev_queue_xmit_accel);
 
 
-/*=======================================================================
-			Receiver routines
-  =======================================================================*/
+/*************************************************************************
+ *			Receiver routines
+ *************************************************************************/
 
 int netdev_max_backlog __read_mostly = 1000;
 EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
-int weight_p __read_mostly = 64;            /* old backlog weight */
+int weight_p __read_mostly = 64;           /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = 64;
+int dev_tx_weight __read_mostly = 64;
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
@@ -3784,6 +3774,7 @@ static int netif_rx_internal(struct sk_buff *skb)
 #endif
 	{
 		unsigned int qtail;
+
 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
 		put_cpu();
 	}
@@ -3843,6 +3834,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 
 		while (clist) {
 			struct sk_buff *skb = clist;
+
 			clist = clist->next;
 
 			WARN_ON(atomic_read(&skb->users));
@@ -3916,7 +3908,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 	}
 
 	qdisc_skb_cb(skb)->pkt_len = skb->len;
-	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+	skb->tc_at_ingress = 1;
 	qdisc_bstats_cpu_update(cl->q, skb);
 
 	switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3981,9 +3973,7 @@ int netdev_rx_handler_register(struct net_device *dev,
 			       rx_handler_func_t *rx_handler,
 			       void *rx_handler_data)
 {
-	ASSERT_RTNL();
-
-	if (dev->rx_handler)
+	if (netdev_is_rx_handler_busy(dev))
 		return -EBUSY;
 
 	/* Note: rx_handler_data must be set before rx_handler */
@@ -4089,12 +4079,8 @@ another_round:
 			goto out;
 	}
 
-#ifdef CONFIG_NET_CLS_ACT
-	if (skb->tc_verd & TC_NCLS) {
-		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-		goto ncls;
-	}
-#endif
+	if (skb_skip_tc_classify(skb))
+		goto skip_classify;
 
 	if (pfmemalloc)
 		goto skip_taps;
@@ -4122,10 +4108,8 @@ skip_taps:
 			goto out;
 	}
 #endif
-#ifdef CONFIG_NET_CLS_ACT
-	skb->tc_verd = 0;
-ncls:
-#endif
+	skb_reset_tc(skb);
+skip_classify:
 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 		goto drop;
 
@@ -4526,6 +4510,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (&ptype->list == head)
 		goto normal;
 
+	if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
+		ret = GRO_CONSUMED;
+		goto ok;
+	}
+
 	same_flow = NAPI_GRO_CB(skb)->same_flow;
 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 
@@ -4621,6 +4610,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 	case GRO_MERGED_FREE:
 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
 			skb_dst_drop(skb);
+			secpath_reset(skb);
 			kmem_cache_free(skbuff_head_cache, skb);
 		} else {
 			__kfree_skb(skb);
@@ -4629,6 +4619,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 
 	case GRO_HELD:
 	case GRO_MERGED:
+	case GRO_CONSUMED:
 		break;
 	}
 
@@ -4661,6 +4652,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 	skb->encapsulation = 0;
 	skb_shinfo(skb)->gso_type = 0;
 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+	secpath_reset(skb);
 
 	napi->skb = skb;
 }
@@ -4699,6 +4691,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
 		break;
 
 	case GRO_MERGED:
+	case GRO_CONSUMED:
 		break;
 	}
 
@@ -4835,7 +4828,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		net_rps_action_and_irq_enable(sd);
 	}
 
-	napi->weight = weight_p;
+	napi->weight = dev_rx_weight;
 	while (again) {
 		struct sk_buff *skb;
 
@@ -4902,23 +4895,6 @@ void __napi_schedule_irqoff(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
 
-bool __napi_complete(struct napi_struct *n)
-{
-	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-
-	/* Some drivers call us directly, instead of calling
-	 * napi_complete_done().
-	 */
-	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
-		return false;
-
-	list_del_init(&n->poll_list);
-	smp_mb__before_atomic();
-	clear_bit(NAPI_STATE_SCHED, &n->state);
-	return true;
-}
-EXPORT_SYMBOL(__napi_complete);
-
 bool napi_complete_done(struct napi_struct *n, int work_done)
 {
 	unsigned long flags;
@@ -4945,14 +4921,13 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 		else
 			napi_gro_flush(n, false);
 	}
-	if (likely(list_empty(&n->poll_list))) {
-		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
-	} else {
+	if (unlikely(!list_empty(&n->poll_list))) {
 		/* If n->poll_list is not empty, we need to mask irqs */
 		local_irq_save(flags);
-		__napi_complete(n);
+		list_del_init(&n->poll_list);
 		local_irq_restore(flags);
 	}
+	WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
 	return true;
 }
 EXPORT_SYMBOL(napi_complete_done);
@@ -4998,7 +4973,6 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 {
 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
 	int (*napi_poll)(struct napi_struct *napi, int budget);
-	int (*busy_poll)(struct napi_struct *dev);
 	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
 	int rc;
@@ -5013,17 +4987,10 @@ restart:
 	if (!napi)
 		goto out;
 
-	/* Note: ndo_busy_poll method is optional in linux-4.5 */
-	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
-
 	preempt_disable();
 	for (;;) {
 		rc = 0;
 		local_bh_disable();
-		if (busy_poll) {
-			rc = busy_poll(napi);
-			goto count;
-		}
 		if (!napi_poll) {
 			unsigned long val = READ_ONCE(napi->state);
 
@@ -5048,9 +5015,6 @@ count:
 					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
 		local_bh_enable();
 
-		if (rc == LL_FLUSH_FAILED)
-			break; /* permanent failure */
-
 		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
 		    busy_loop_timeout(end_time))
 			break;
@@ -5125,7 +5089,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 
 	napi = container_of(timer, struct napi_struct, timer);
 	if (napi->gro_list)
-		napi_schedule(napi);
+		napi_schedule_irqoff(napi);
 
 	return HRTIMER_NORESTART;
 }
@@ -5711,6 +5675,7 @@ static int netdev_adjacent_sysfs_add(struct net_device *dev,
 			      struct list_head *dev_list)
 {
 	char linkname[IFNAMSIZ+7];
+
 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 		"upper_%s" : "lower_%s", adj_dev->name);
 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
@@ -5721,6 +5686,7 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev,
 			       struct list_head *dev_list)
 {
 	char linkname[IFNAMSIZ+7];
+
 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 		"upper_%s" : "lower_%s", name);
 	sysfs_remove_link(&(dev->dev.kobj), linkname);
@@ -5990,6 +5956,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
 	struct netdev_notifier_changeupper_info changeupper_info;
+
 	ASSERT_RTNL();
 
 	changeupper_info.upper_dev = upper_dev;
@@ -6156,50 +6123,6 @@ void netdev_lower_state_changed(struct net_device *lower_dev,
 }
 EXPORT_SYMBOL(netdev_lower_state_changed);
 
-int netdev_default_l2upper_neigh_construct(struct net_device *dev,
-					   struct neighbour *n)
-{
-	struct net_device *lower_dev, *stop_dev;
-	struct list_head *iter;
-	int err;
-
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		if (!lower_dev->netdev_ops->ndo_neigh_construct)
-			continue;
-		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
-		if (err) {
-			stop_dev = lower_dev;
-			goto rollback;
-		}
-	}
-	return 0;
-
-rollback:
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		if (lower_dev == stop_dev)
-			break;
-		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
-			continue;
-		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
-	}
-	return err;
-}
-EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
-
-void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
-					  struct neighbour *n)
-{
-	struct net_device *lower_dev;
-	struct list_head *iter;
-
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
-			continue;
-		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
-	}
-}
-EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
-
 static void dev_change_rx_flags(struct net_device *dev, int flags)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
@@ -6452,8 +6375,8 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
 	}
 
 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
-	   is important. Some (broken) drivers set IFF_PROMISC, when
-	   IFF_ALLMULTI is requested not asking us and not reporting.
+	 * is important. Some (broken) drivers set IFF_PROMISC, when
+	 * IFF_ALLMULTI is requested not asking us and not reporting.
 	 */
 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
@@ -6751,6 +6674,7 @@ EXPORT_SYMBOL(dev_change_xdp_fd);
 static int dev_new_index(struct net *net)
 {
 	int ifindex = net->ifindex;
+
 	for (;;) {
 		if (++ifindex <= 0)
 			ifindex = 1;
@@ -6817,8 +6741,8 @@ static void rollback_registered_many(struct list_head *head)
 
 
 		/* Notify protocols, that we are about to destroy
-		   this device. They should clean all the things.
-		*/
+		 * this device. They should clean all the things.
+		 */
 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
 		if (!dev->rtnl_link_ops ||
@@ -6976,13 +6900,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 		features &= ~dev->gso_partial_features;
 	}
 
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	if (dev->netdev_ops->ndo_busy_poll)
-		features |= NETIF_F_BUSY_POLL;
-	else
-#endif
-		features &= ~NETIF_F_BUSY_POLL;
-
 	return features;
 }
 
@@ -7171,6 +7088,7 @@ void netif_tx_stop_all_queues(struct net_device *dev)
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+
 		netif_tx_stop_queue(txq);
 	}
 }
@@ -7645,17 +7563,17 @@ void netdev_freemem(struct net_device *dev)
 }
 
 /**
- *	alloc_netdev_mqs - allocate network device
- *	@sizeof_priv:		size of private data to allocate space for
- *	@name:			device name format string
- *	@name_assign_type: 	origin of device name
- *	@setup:			callback to initialize device
- *	@txqs:			the number of TX subqueues to allocate
- *	@rxqs:			the number of RX subqueues to allocate
- *
- *	Allocates a struct net_device with private data area for driver use
- *	and performs basic initialization.  Also allocates subqueue structs
- *	for each queue on the device.
+ * alloc_netdev_mqs - allocate network device
+ * @sizeof_priv: size of private data to allocate space for
+ * @name: device name format string
+ * @name_assign_type: origin of device name
+ * @setup: callback to initialize device
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
+ *
+ * Allocates a struct net_device with private data area for driver use
+ * and performs basic initialization.  Also allocates subqueue structs
+ * for each queue on the device.
  */
 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		unsigned char name_assign_type,
@@ -7767,13 +7685,13 @@ free_dev:
 EXPORT_SYMBOL(alloc_netdev_mqs);
 
 /**
- *	free_netdev - free network device
- *	@dev: device
+ * free_netdev - free network device
+ * @dev: device
  *
- *	This function does the last stage of destroying an allocated device
- * 	interface. The reference to the device object is released.
- *	If this is the last reference then it will be freed.
- *	Must be called in process context.
+ * This function does the last stage of destroying an allocated device
+ * interface. The reference to the device object is released. If this
+ * is the last reference then it will be freed.Must be called in process
+ * context.
  */
 void free_netdev(struct net_device *dev)
 {
@@ -7955,12 +7873,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	dev_shutdown(dev);
 
 	/* Notify protocols, that we are about to destroy
-	   this device. They should clean all the things.
-
-	   Note that dev->reg_state stays at NETREG_REGISTERED.
-	   This is wanted because this way 8021q and macvlan know
-	   the device is just moving and can keep their slaves up.
-	*/
+	 * this device. They should clean all the things.
+	 *
+	 * Note that dev->reg_state stays at NETREG_REGISTERED.
+	 * This is wanted because this way 8021q and macvlan know
+	 * the device is just moving and can keep their slaves up.
+	 */
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2b5bf9efa720..e9c1e6acfb6d 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1392,9 +1392,9 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
 	return -EOPNOTSUPP;
 }
 
-static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
-				enum devlink_command cmd, u32 portid,
-				u32 seq, int flags)
+static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+				   enum devlink_command cmd, u32 portid,
+				   u32 seq, int flags)
 {
 	const struct devlink_ops *ops = devlink->ops;
 	void *hdr;
@@ -1408,50 +1408,52 @@ static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
 
 	err = devlink_nl_put_handle(msg, devlink);
 	if (err)
-		goto out;
+		goto nla_put_failure;
 
-	err = ops->eswitch_mode_get(devlink, &mode);
-	if (err)
-		goto out;
-	err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
-	if (err)
-		goto out;
+	if (ops->eswitch_mode_get) {
+		err = ops->eswitch_mode_get(devlink, &mode);
+		if (err)
+			goto nla_put_failure;
+		err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
+		if (err)
+			goto nla_put_failure;
+	}
 
 	if (ops->eswitch_inline_mode_get) {
 		err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
 		if (err)
-			goto out;
+			goto nla_put_failure;
 		err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
 				 inline_mode);
 		if (err)
-			goto out;
+			goto nla_put_failure;
 	}
 
 	genlmsg_end(msg, hdr);
 	return 0;
 
-out:
+nla_put_failure:
 	genlmsg_cancel(msg, hdr);
 	return err;
 }
 
-static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
-						struct genl_info *info)
+static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
+					   struct genl_info *info)
 {
 	struct devlink *devlink = info->user_ptr[0];
 	const struct devlink_ops *ops = devlink->ops;
 	struct sk_buff *msg;
 	int err;
 
-	if (!ops || !ops->eswitch_mode_get)
+	if (!ops)
 		return -EOPNOTSUPP;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
 		return -ENOMEM;
 
-	err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET,
-				   info->snd_portid, info->snd_seq, 0);
+	err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
+				      info->snd_portid, info->snd_seq, 0);
 
 	if (err) {
 		nlmsg_free(msg);
@@ -1461,8 +1463,8 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
 	return genlmsg_reply(msg, info);
 }
 
-static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb,
-						struct genl_info *info)
+static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
+					   struct genl_info *info)
 {
 	struct devlink *devlink = info->user_ptr[0];
 	const struct devlink_ops *ops = devlink->ops;
@@ -1629,15 +1631,15 @@ static const struct genl_ops devlink_nl_ops[] = {
 				  DEVLINK_NL_FLAG_LOCK_PORTS,
 	},
 	{
-		.cmd = DEVLINK_CMD_ESWITCH_MODE_GET,
-		.doit = devlink_nl_cmd_eswitch_mode_get_doit,
+		.cmd = DEVLINK_CMD_ESWITCH_GET,
+		.doit = devlink_nl_cmd_eswitch_get_doit,
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
 	{
-		.cmd = DEVLINK_CMD_ESWITCH_MODE_SET,
-		.doit = devlink_nl_cmd_eswitch_mode_set_doit,
+		.cmd = DEVLINK_CMD_ESWITCH_SET,
+		.doit = devlink_nl_cmd_eswitch_set_doit,
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
diff --git a/net/core/dst.c b/net/core/dst.c
index b5cbbe07f786..960e503b5a52 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -190,7 +190,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst->__use = 0;
 	dst->lastuse = jiffies;
 	dst->flags = flags;
-	dst->pending_confirm = 0;
 	dst->next = NULL;
 	if (!(flags & DST_NOCOUNT))
 		dst_entries_add(ops, 1);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 236a21e3c878..be7bab1adcde 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -102,7 +102,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_RXFCS_BIT] =            "rx-fcs",
 	[NETIF_F_RXALL_BIT] =            "rx-all",
 	[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
-	[NETIF_F_BUSY_POLL_BIT] =        "busy-poll",
 	[NETIF_F_HW_TC_BIT] =		 "hw-tc-offload",
 };
 
@@ -1405,9 +1404,12 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
 	if (regs.len > reglen)
 		regs.len = reglen;
 
-	regbuf = vzalloc(reglen);
-	if (reglen && !regbuf)
-		return -ENOMEM;
+	regbuf = NULL;
+	if (reglen) {
+		regbuf = vzalloc(reglen);
+		if (!regbuf)
+			return -ENOMEM;
+	}
 
 	ops->get_regs(dev, &regs, regbuf);
 
@@ -1817,11 +1819,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
 	ret = __ethtool_get_sset_count(dev, gstrings.string_set);
 	if (ret < 0)
 		return ret;
+	if (ret > S32_MAX / ETH_GSTRING_LEN)
+		return -ENOMEM;
+	WARN_ON_ONCE(!ret);
 
 	gstrings.len = ret;
-
-	data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER);
-	if (!data)
+	data = vzalloc(gstrings.len * ETH_GSTRING_LEN);
+	if (gstrings.len && !data)
 		return -ENOMEM;
 
 	__ethtool_get_strings(dev, gstrings.string_set, data);
@@ -1830,12 +1834,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
 	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
 		goto out;
 	useraddr += sizeof(gstrings);
-	if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+	if (gstrings.len &&
+	    copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
 		goto out;
 	ret = 0;
 
 out:
-	kfree(data);
+	vfree(data);
 	return ret;
 }
 
@@ -1912,14 +1917,15 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
 	n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
 	if (n_stats < 0)
 		return n_stats;
-	WARN_ON(n_stats == 0);
-
+	if (n_stats > S32_MAX / sizeof(u64))
+		return -ENOMEM;
+	WARN_ON_ONCE(!n_stats);
 	if (copy_from_user(&stats, useraddr, sizeof(stats)))
 		return -EFAULT;
 
 	stats.n_stats = n_stats;
-	data = kmalloc(n_stats * sizeof(u64), GFP_USER);
-	if (!data)
+	data = vzalloc(n_stats * sizeof(u64));
+	if (n_stats && !data)
 		return -ENOMEM;
 
 	ops->get_ethtool_stats(dev, &stats, data);
@@ -1928,12 +1934,12 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
 	if (copy_to_user(useraddr, &stats, sizeof(stats)))
 		goto out;
 	useraddr += sizeof(stats);
-	if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+	if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
 		goto out;
 	ret = 0;
 
  out:
-	kfree(data);
+	vfree(data);
 	return ret;
 }
 
@@ -1948,17 +1954,18 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
 		return -EOPNOTSUPP;
 
 	n_stats = phy_get_sset_count(phydev);
-
 	if (n_stats < 0)
 		return n_stats;
-	WARN_ON(n_stats == 0);
+	if (n_stats > S32_MAX / sizeof(u64))
+		return -ENOMEM;
+	WARN_ON_ONCE(!n_stats);
 
 	if (copy_from_user(&stats, useraddr, sizeof(stats)))
 		return -EFAULT;
 
 	stats.n_stats = n_stats;
-	data = kmalloc_array(n_stats, sizeof(u64), GFP_USER);
-	if (!data)
+	data = vzalloc(n_stats * sizeof(u64));
+	if (n_stats && !data)
 		return -ENOMEM;
 
 	mutex_lock(&phydev->lock);
@@ -1969,12 +1976,12 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
 	if (copy_to_user(useraddr, &stats, sizeof(stats)))
 		goto out;
 	useraddr += sizeof(stats);
-	if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+	if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
 		goto out;
 	ret = 0;
 
  out:
-	kfree(data);
+	vfree(data);
 	return ret;
 }
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 1969b3f118c1..ebaeaf2e46e8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -76,9 +76,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 	 * allow SOCK_MEMALLOC sockets to use it as this socket is
 	 * helping free memory
 	 */
-	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
 		return -ENOMEM;
-
+	}
 	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
 	if (err)
 		return err;
@@ -1416,8 +1417,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_ANYTHING,
-	.arg3_type	= ARG_PTR_TO_STACK,
-	.arg4_type	= ARG_CONST_STACK_SIZE,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
 	.arg5_type	= ARG_ANYTHING,
 };
 
@@ -1447,8 +1448,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_ANYTHING,
-	.arg3_type	= ARG_PTR_TO_RAW_STACK,
-	.arg4_type	= ARG_CONST_STACK_SIZE,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
 };
 
 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -1522,10 +1523,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
 {
 	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
 	bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
+	bool do_mforce = flags & BPF_F_MARK_ENFORCE;
 	__sum16 *ptr;
 
-	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
-			       BPF_F_HDR_FIELD_MASK)))
+	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
+			       BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
 		return -EINVAL;
 	if (unlikely(offset > 0xffff || offset & 1))
 		return -EFAULT;
@@ -1533,7 +1535,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
 		return -EFAULT;
 
 	ptr = (__sum16 *)(skb->data + offset);
-	if (is_mmzero && !*ptr)
+	if (is_mmzero && !do_mforce && !*ptr)
 		return 0;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
@@ -1601,10 +1603,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
 	.gpl_only	= false,
 	.pkt_access	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
-	.arg3_type	= ARG_PTR_TO_STACK,
-	.arg4_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg5_type	= ARG_ANYTHING,
 };
 
@@ -2306,8 +2308,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -2377,8 +2379,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_RAW_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 	.arg4_type	= ARG_ANYTHING,
 };
 
@@ -2412,8 +2414,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_RAW_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 };
 
 static struct metadata_dst __percpu *md_dst;
@@ -2483,8 +2485,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 	.arg4_type	= ARG_ANYTHING,
 };
 
@@ -2509,8 +2511,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 };
 
 static const struct bpf_func_proto *
@@ -2582,8 +2584,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
 	if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
 		return -EFAULT;
 
-	return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size,
-				bpf_xdp_copy);
+	return bpf_event_output(map, flags, meta, meta_size, xdp->data,
+				xdp_size, bpf_xdp_copy);
 }
 
 static const struct bpf_func_proto bpf_xdp_event_output_proto = {
@@ -2593,12 +2595,12 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 static const struct bpf_func_proto *
-sk_filter_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_map_lookup_elem:
@@ -2626,6 +2628,17 @@ sk_filter_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
+sk_filter_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
 tc_cls_act_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -2680,7 +2693,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
 	default:
-		return sk_filter_func_proto(func_id);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2695,7 +2708,7 @@ xdp_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_xdp_adjust_head:
 		return &bpf_xdp_adjust_head_proto;
 	default:
-		return sk_filter_func_proto(func_id);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2706,7 +2719,7 @@ cg_skb_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
 	default:
-		return sk_filter_func_proto(func_id);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2733,7 +2746,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
 	default:
-		return sk_filter_func_proto(func_id);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2776,11 +2789,22 @@ static bool __is_valid_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
 		return false;
+
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
-		return false;
+
+	switch (off) {
+	case offsetof(struct __sk_buff, cb[0]) ...
+	     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+		if (off + size >
+		    offsetof(struct __sk_buff, cb[4]) + sizeof(__u32))
+			return false;
+		break;
+	default:
+		if (size != sizeof(__u32))
+			return false;
+	}
 
 	return true;
 }
@@ -2799,7 +2823,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case offsetof(struct __sk_buff, cb[0]) ...
-		     offsetof(struct __sk_buff, cb[4]):
+		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
 			break;
 		default:
 			return false;
@@ -2823,7 +2847,7 @@ static bool lwt_is_valid_access(int off, int size,
 		case offsetof(struct __sk_buff, mark):
 		case offsetof(struct __sk_buff, priority):
 		case offsetof(struct __sk_buff, cb[0]) ...
-		     offsetof(struct __sk_buff, cb[4]):
+		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
 			break;
 		default:
 			return false;
@@ -2915,7 +2939,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 		case offsetof(struct __sk_buff, tc_index):
 		case offsetof(struct __sk_buff, priority):
 		case offsetof(struct __sk_buff, cb[0]) ...
-		     offsetof(struct __sk_buff, cb[4]):
+		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
 		case offsetof(struct __sk_buff, tc_classid):
 			break;
 		default:
@@ -2972,32 +2996,33 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
-static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-					int src_reg, int ctx_off,
-					struct bpf_insn *insn_buf,
-					struct bpf_prog *prog)
+static u32 bpf_convert_ctx_access(enum bpf_access_type type,
+				  const struct bpf_insn *si,
+				  struct bpf_insn *insn_buf,
+				  struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
+	int off;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct __sk_buff, len):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
 
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, len));
 		break;
 
 	case offsetof(struct __sk_buff, protocol):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
 
-		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, protocol));
 		break;
 
 	case offsetof(struct __sk_buff, vlan_proto):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
 
-		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, vlan_proto));
 		break;
 
@@ -3005,17 +3030,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, priority));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, priority));
 		break;
 
 	case offsetof(struct __sk_buff, ingress_ifindex):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
 
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, skb_iif));
 		break;
 
@@ -3023,17 +3048,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, dev));
-		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct net_device, ifindex));
 		break;
 
 	case offsetof(struct __sk_buff, hash):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
 
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, hash));
 		break;
 
@@ -3041,63 +3066,77 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, mark));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, mark));
 		break;
 
 	case offsetof(struct __sk_buff, pkt_type):
-		return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
+		return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg,
+					  si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, queue_mapping):
-		return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
+		return convert_skb_access(SKF_AD_QUEUE, si->dst_reg,
+					  si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, vlan_present):
 		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
-					  dst_reg, src_reg, insn);
+					  si->dst_reg, si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, vlan_tci):
 		return convert_skb_access(SKF_AD_VLAN_TAG,
-					  dst_reg, src_reg, insn);
+					  si->dst_reg, si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, cb[0]) ...
-	     offsetof(struct __sk_buff, cb[4]):
+	     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
 		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
+		BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
+			      offsetof(struct qdisc_skb_cb, data)) %
+			     sizeof(__u64));
 
 		prog->cb_access = 1;
-		ctx_off -= offsetof(struct __sk_buff, cb[0]);
-		ctx_off += offsetof(struct sk_buff, cb);
-		ctx_off += offsetof(struct qdisc_skb_cb, data);
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, cb[0]);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct qdisc_skb_cb, data);
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
+					      si->src_reg, off);
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+					      si->src_reg, off);
 		break;
 
 	case offsetof(struct __sk_buff, tc_classid):
-		ctx_off -= offsetof(struct __sk_buff, tc_classid);
-		ctx_off += offsetof(struct sk_buff, cb);
-		ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
+		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);
+
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, tc_classid);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct qdisc_skb_cb, tc_classid);
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
+					      si->src_reg, off);
 		else
-			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
+					      si->src_reg, off);
 		break;
 
 	case offsetof(struct __sk_buff, data):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, data));
 		break;
 
 	case offsetof(struct __sk_buff, data_end):
-		ctx_off -= offsetof(struct __sk_buff, data_end);
-		ctx_off += offsetof(struct sk_buff, cb);
-		ctx_off += offsetof(struct bpf_skb_data_end, data_end);
-		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
-				      ctx_off);
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, data_end);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct bpf_skb_data_end, data_end);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
 		break;
 
 	case offsetof(struct __sk_buff, tc_index):
@@ -3105,110 +3144,107 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, tc_index));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, tc_index));
-		break;
 #else
 		if (type == BPF_WRITE)
-			*insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
+			*insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
 		else
-			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
-		break;
+			*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
 #endif
+		break;
 	}
 
 	return insn - insn_buf;
 }
 
 static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
-					  int dst_reg, int src_reg,
-					  int ctx_off,
+					  const struct bpf_insn *si,
 					  struct bpf_insn *insn_buf,
 					  struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct bpf_sock, bound_dev_if):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					offsetof(struct sock, sk_bound_dev_if));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, sk_bound_dev_if));
 		break;
 
 	case offsetof(struct bpf_sock, family):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
 
-		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, sk_family));
 		break;
 
 	case offsetof(struct bpf_sock, type):
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, __sk_flags_offset));
-		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK);
-		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
 		break;
 
 	case offsetof(struct bpf_sock, protocol):
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, __sk_flags_offset));
-		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK);
-		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
 		break;
 	}
 
 	return insn - insn_buf;
 }
 
-static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-					 int src_reg, int ctx_off,
+static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
+					 const struct bpf_insn *si,
 					 struct bpf_insn *insn_buf,
 					 struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct __sk_buff, ifindex):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, dev));
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct net_device, ifindex));
 		break;
 	default:
-		return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
-						    ctx_off, insn_buf, prog);
+		return bpf_convert_ctx_access(type, si, insn_buf, prog);
 	}
 
 	return insn - insn_buf;
 }
 
-static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-				  int src_reg, int ctx_off,
+static u32 xdp_convert_ctx_access(enum bpf_access_type type,
+				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
 				  struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct xdp_md, data):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data));
 		break;
 	case offsetof(struct xdp_md, data_end):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data_end));
 		break;
 	}
@@ -3219,7 +3255,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 static const struct bpf_verifier_ops sk_filter_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
-	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
 };
 
 static const struct bpf_verifier_ops tc_cls_act_ops = {
@@ -3238,69 +3274,69 @@ static const struct bpf_verifier_ops xdp_ops = {
 static const struct bpf_verifier_ops cg_skb_ops = {
 	.get_func_proto		= cg_skb_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
-	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
 };
 
 static const struct bpf_verifier_ops lwt_inout_ops = {
 	.get_func_proto		= lwt_inout_func_proto,
 	.is_valid_access	= lwt_is_valid_access,
-	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
 };
 
 static const struct bpf_verifier_ops lwt_xmit_ops = {
 	.get_func_proto		= lwt_xmit_func_proto,
 	.is_valid_access	= lwt_is_valid_access,
-	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
 };
 
 static const struct bpf_verifier_ops cg_sock_ops = {
-	.get_func_proto		= sk_filter_func_proto,
+	.get_func_proto		= bpf_base_func_proto,
 	.is_valid_access	= sock_filter_is_valid_access,
 	.convert_ctx_access	= sock_filter_convert_ctx_access,
 };
 
-static struct bpf_prog_type_list sk_filter_type __read_mostly = {
+static struct bpf_prog_type_list sk_filter_type __ro_after_init = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
 };
 
-static struct bpf_prog_type_list sched_cls_type __read_mostly = {
+static struct bpf_prog_type_list sched_cls_type __ro_after_init = {
 	.ops	= &tc_cls_act_ops,
 	.type	= BPF_PROG_TYPE_SCHED_CLS,
 };
 
-static struct bpf_prog_type_list sched_act_type __read_mostly = {
+static struct bpf_prog_type_list sched_act_type __ro_after_init = {
 	.ops	= &tc_cls_act_ops,
 	.type	= BPF_PROG_TYPE_SCHED_ACT,
 };
 
-static struct bpf_prog_type_list xdp_type __read_mostly = {
+static struct bpf_prog_type_list xdp_type __ro_after_init = {
 	.ops	= &xdp_ops,
 	.type	= BPF_PROG_TYPE_XDP,
 };
 
-static struct bpf_prog_type_list cg_skb_type __read_mostly = {
+static struct bpf_prog_type_list cg_skb_type __ro_after_init = {
 	.ops	= &cg_skb_ops,
 	.type	= BPF_PROG_TYPE_CGROUP_SKB,
 };
 
-static struct bpf_prog_type_list lwt_in_type __read_mostly = {
+static struct bpf_prog_type_list lwt_in_type __ro_after_init = {
 	.ops	= &lwt_inout_ops,
 	.type	= BPF_PROG_TYPE_LWT_IN,
 };
 
-static struct bpf_prog_type_list lwt_out_type __read_mostly = {
+static struct bpf_prog_type_list lwt_out_type __ro_after_init = {
 	.ops	= &lwt_inout_ops,
 	.type	= BPF_PROG_TYPE_LWT_OUT,
 };
 
-static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
+static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = {
 	.ops	= &lwt_xmit_ops,
 	.type	= BPF_PROG_TYPE_LWT_XMIT,
 };
 
-static struct bpf_prog_type_list cg_sock_type __read_mostly = {
+static struct bpf_prog_type_list cg_sock_type __ro_after_init = {
 	.ops	= &cg_sock_ops,
 	.type	= BPF_PROG_TYPE_CGROUP_SOCK
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 1b7673aac59d..c35aae13c8d2 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -138,6 +138,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	struct flow_dissector_key_control *key_control;
 	struct flow_dissector_key_basic *key_basic;
 	struct flow_dissector_key_addrs *key_addrs;
+	struct flow_dissector_key_arp *key_arp;
 	struct flow_dissector_key_ports *key_ports;
 	struct flow_dissector_key_icmp *key_icmp;
 	struct flow_dissector_key_tags *key_tags;
@@ -379,6 +380,62 @@ mpls:
 
 		nhoff += FCOE_HEADER_LEN;
 		goto out_good;
+
+	case htons(ETH_P_ARP):
+	case htons(ETH_P_RARP): {
+		struct {
+			unsigned char ar_sha[ETH_ALEN];
+			unsigned char ar_sip[4];
+			unsigned char ar_tha[ETH_ALEN];
+			unsigned char ar_tip[4];
+		} *arp_eth, _arp_eth;
+		const struct arphdr *arp;
+		struct arphdr *_arp;
+
+		arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
+					   hlen, &_arp);
+		if (!arp)
+			goto out_bad;
+
+		if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+		    arp->ar_pro != htons(ETH_P_IP) ||
+		    arp->ar_hln != ETH_ALEN ||
+		    arp->ar_pln != 4 ||
+		    (arp->ar_op != htons(ARPOP_REPLY) &&
+		     arp->ar_op != htons(ARPOP_REQUEST)))
+			goto out_bad;
+
+		arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
+					       sizeof(_arp_eth), data,
+					       hlen,
+					       &_arp_eth);
+		if (!arp_eth)
+			goto out_bad;
+
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_ARP)) {
+
+			key_arp = skb_flow_dissector_target(flow_dissector,
+							    FLOW_DISSECTOR_KEY_ARP,
+							    target_container);
+
+			memcpy(&key_arp->sip, arp_eth->ar_sip,
+			       sizeof(key_arp->sip));
+			memcpy(&key_arp->tip, arp_eth->ar_tip,
+			       sizeof(key_arp->tip));
+
+			/* Only store the lower byte of the opcode;
+			 * this covers ARPOP_REPLY and ARPOP_REQUEST.
+			 */
+			key_arp->op = ntohs(arp->ar_op) & 0xff;
+
+			ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
+			ether_addr_copy(key_arp->tha, arp_eth->ar_tha);
+		}
+
+		goto out_good;
+	}
+
 	default:
 		goto out_bad;
 	}
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
new file mode 100644
index 000000000000..c98bbfbd26b8
--- /dev/null
+++ b/net/core/gro_cells.c
@@ -0,0 +1,92 @@
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <net/gro_cells.h>
+
+struct gro_cell {
+	struct sk_buff_head	napi_skbs;
+	struct napi_struct	napi;
+};
+
+int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct gro_cell *cell;
+
+	if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO))
+		return netif_rx(skb);
+
+	cell = this_cpu_ptr(gcells->cells);
+
+	if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
+		atomic_long_inc(&dev->rx_dropped);
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	__skb_queue_tail(&cell->napi_skbs, skb);
+	if (skb_queue_len(&cell->napi_skbs) == 1)
+		napi_schedule(&cell->napi);
+	return NET_RX_SUCCESS;
+}
+EXPORT_SYMBOL(gro_cells_receive);
+
+/* called under BH context */
+static int gro_cell_poll(struct napi_struct *napi, int budget)
+{
+	struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
+	struct sk_buff *skb;
+	int work_done = 0;
+
+	while (work_done < budget) {
+		skb = __skb_dequeue(&cell->napi_skbs);
+		if (!skb)
+			break;
+		napi_gro_receive(napi, skb);
+		work_done++;
+	}
+
+	if (work_done < budget)
+		napi_complete_done(napi, work_done);
+	return work_done;
+}
+
+int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
+{
+	int i;
+
+	gcells->cells = alloc_percpu(struct gro_cell);
+	if (!gcells->cells)
+		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+		__skb_queue_head_init(&cell->napi_skbs);
+
+		set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
+
+		netif_napi_add(dev, &cell->napi, gro_cell_poll,
+			       NAPI_POLL_WEIGHT);
+		napi_enable(&cell->napi);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(gro_cells_init);
+
+void gro_cells_destroy(struct gro_cells *gcells)
+{
+	int i;
+
+	if (!gcells->cells)
+		return;
+	for_each_possible_cpu(i) {
+		struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+		netif_napi_del(&cell->napi);
+		__skb_queue_purge(&cell->napi_skbs);
+	}
+	free_percpu(gcells->cells);
+	gcells->cells = NULL;
+}
+EXPORT_SYMBOL(gro_cells_destroy);
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index b3eef90b2df9..0cfe7b0216c3 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -237,7 +237,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
 	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
 };
 
-static int bpf_build_state(struct net_device *dev, struct nlattr *nla,
+static int bpf_build_state(struct nlattr *nla,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts)
 {
@@ -352,7 +352,7 @@ static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
 	       0;
 }
 
-int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
 {
 	/* FIXME:
 	 * The LWT state is currently rebuilt for delete requests which
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index c23465005f2f..6df9f8fabf0c 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -101,7 +101,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
 }
 EXPORT_SYMBOL(lwtunnel_encap_del_ops);
 
-int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+int lwtunnel_build_state(u16 encap_type,
 			 struct nlattr *encap, unsigned int family,
 			 const void *cfg, struct lwtunnel_state **lws)
 {
@@ -116,7 +116,7 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
 	rcu_read_lock();
 	ops = rcu_dereference(lwtun_encaps[encap_type]);
 	if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
-		ret = ops->build_state(dev, encap, family, cfg, lws);
+		ret = ops->build_state(encap, family, cfg, lws);
 		if (ret)
 			module_put(ops->owner);
 	}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 7bb12e07ffef..e7c12caa20c8 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2923,7 +2923,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
 		return;
 
 	set_bit(index, p->data_state);
-	call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
+	if (index == NEIGH_VAR_DELAY_PROBE_TIME)
+		call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
 	if (!dev) /* NULL dev means this is default value */
 		neigh_copy_dflt_parms(net, p, index);
 }
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 2ec86fc552df..756637dc7a57 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -13,6 +13,7 @@
 
 #include <linux/slab.h>
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/skbuff.h>
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8e69ce472236..96947f5d41e4 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3439,9 +3439,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 			/* skb was 'freed' by stack, so clean few
 			 * bits and reuse it
 			 */
-#ifdef CONFIG_NET_CLS_ACT
-			skb->tc_verd = 0; /* reset reclass/redir ttl */
-#endif
+			skb_reset_tc(skb);
 		} while (--burst > 0);
 		goto out; /* Skips xmit_mode M_START_XMIT */
 	} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 5d26056b6d8f..9b8727c67b58 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -34,8 +34,6 @@
  * and it will increase in proportion to the memory of machine.
  * Note : Dont forget somaxconn that may limit backlog too.
  */
-int sysctl_max_syn_backlog = 256;
-EXPORT_SYMBOL(sysctl_max_syn_backlog);
 
 void reqsk_queue_alloc(struct request_sock_queue *queue)
 {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 75e3ea7bda08..c4e84c558240 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -837,8 +837,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 static inline int rtnl_vfinfo_size(const struct net_device *dev,
 				   u32 ext_filter_mask)
 {
-	if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
-	    (ext_filter_mask & RTEXT_FILTER_VF)) {
+	if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) {
 		int num_vfs = dev_num_vf(dev->dev.parent);
 		size_t size = nla_total_size(0);
 		size += num_vfs *
@@ -877,8 +876,6 @@ static size_t rtnl_port_size(const struct net_device *dev,
 {
 	size_t port_size = nla_total_size(4)		/* PORT_VF */
 		+ nla_total_size(PORT_PROFILE_MAX)	/* PORT_PROFILE */
-		+ nla_total_size(sizeof(struct ifla_port_vsi))
-							/* PORT_VSI_TYPE */
 		+ nla_total_size(PORT_UUID_MAX)		/* PORT_INSTANCE_UUID */
 		+ nla_total_size(PORT_UUID_MAX)		/* PORT_HOST_UUID */
 		+ nla_total_size(1)			/* PROT_VDP_REQUEST */
@@ -1492,14 +1489,19 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
 	[IFLA_PORT_VF]		= { .type = NLA_U32 },
 	[IFLA_PORT_PROFILE]	= { .type = NLA_STRING,
 				    .len = PORT_PROFILE_MAX },
-	[IFLA_PORT_VSI_TYPE]	= { .type = NLA_BINARY,
-				    .len = sizeof(struct ifla_port_vsi)},
 	[IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
 				      .len = PORT_UUID_MAX },
 	[IFLA_PORT_HOST_UUID]	= { .type = NLA_STRING,
 				    .len = PORT_UUID_MAX },
 	[IFLA_PORT_REQUEST]	= { .type = NLA_U8, },
 	[IFLA_PORT_RESPONSE]	= { .type = NLA_U16, },
+
+	/* Unused, but we need to keep it here since user space could
+	 * fill it. It's also broken with regard to NLA_BINARY use in
+	 * combination with structs.
+	 */
+	[IFLA_PORT_VSI_TYPE]	= { .type = NLA_BINARY,
+				    .len = sizeof(struct ifla_port_vsi) },
 };
 
 static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
@@ -2356,7 +2358,6 @@ struct net_device *rtnl_create_link(struct net *net,
 	const char *ifname, unsigned char name_assign_type,
 	const struct rtnl_link_ops *ops, struct nlattr *tb[])
 {
-	int err;
 	struct net_device *dev;
 	unsigned int num_tx_queues = 1;
 	unsigned int num_rx_queues = 1;
@@ -2371,11 +2372,10 @@ struct net_device *rtnl_create_link(struct net *net,
 	else if (ops->get_num_rx_queues)
 		num_rx_queues = ops->get_num_rx_queues();
 
-	err = -ENOMEM;
 	dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
 			       ops->setup, num_tx_queues, num_rx_queues);
 	if (!dev)
-		goto err;
+		return ERR_PTR(-ENOMEM);
 
 	dev_net_set(dev, net);
 	dev->rtnl_link_ops = ops;
@@ -2401,9 +2401,6 @@ struct net_device *rtnl_create_link(struct net *net,
 		dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
 
 	return dev;
-
-err:
-	return ERR_PTR(err);
 }
 EXPORT_SYMBOL(rtnl_create_link);
 
@@ -2571,7 +2568,7 @@ replay:
 			return -ENODEV;
 		}
 
-		if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
+		if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
 			return -EOPNOTSUPP;
 
 		if (!ops) {
@@ -2653,6 +2650,11 @@ replay:
 			if (err < 0)
 				goto out_unregister;
 		}
+		if (tb[IFLA_MASTER]) {
+			err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
+			if (err)
+				goto out_unregister;
+		}
 out:
 		if (link_net)
 			put_net(link_net);
@@ -3829,6 +3831,39 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 		*idxattr = 0;
 	}
 
+	if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) {
+		struct rtnl_af_ops *af_ops;
+
+		*idxattr = IFLA_STATS_AF_SPEC;
+		attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC);
+		if (!attr)
+			goto nla_put_failure;
+
+		list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+			if (af_ops->fill_stats_af) {
+				struct nlattr *af;
+				int err;
+
+				af = nla_nest_start(skb, af_ops->family);
+				if (!af)
+					goto nla_put_failure;
+
+				err = af_ops->fill_stats_af(skb, dev);
+
+				if (err == -ENODATA)
+					nla_nest_cancel(skb, af);
+				else if (err < 0)
+					goto nla_put_failure;
+
+				nla_nest_end(skb, af);
+			}
+		}
+
+		nla_nest_end(skb, attr);
+
+		*idxattr = 0;
+	}
+
 	nlmsg_end(skb, nlh);
 
 	return 0;
@@ -3885,6 +3920,23 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
 	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
 		size += rtnl_get_offload_stats_size(dev);
 
+	if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
+		struct rtnl_af_ops *af_ops;
+
+		/* for IFLA_STATS_AF_SPEC */
+		size += nla_total_size(0);
+
+		list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+			if (af_ops->get_stats_af_size) {
+				size += nla_total_size(
+					af_ops->get_stats_af_size(dev));
+
+				/* for AF_* */
+				size += nla_total_size(0);
+			}
+		}
+	}
+
 	return size;
 }
 
diff --git a/net/core/scm.c b/net/core/scm.c
index d8820438ba37..b6d83686e149 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -71,7 +71,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 	struct file **fpp;
 	int i, num;
 
-	num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
+	num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int);
 
 	if (num <= 0)
 		return 0;
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 88a8e429fc3e..758f140b6bed 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -1,3 +1,7 @@
+/*
+ * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/cryptohash.h>
@@ -8,18 +12,18 @@
 #include <linux/ktime.h>
 #include <linux/string.h>
 #include <linux/net.h>
-
+#include <linux/siphash.h>
 #include <net/secure_seq.h>
 
 #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
+#include <linux/in6.h>
 #include <net/tcp.h>
-#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
 
-static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
+static siphash_key_t net_secret __read_mostly;
 
 static __always_inline void net_secret_init(void)
 {
-	net_get_random_once(net_secret, sizeof(net_secret));
+	net_get_random_once(&net_secret, sizeof(net_secret));
 }
 #endif
 
@@ -44,80 +48,70 @@ static u32 seq_scale(u32 seq)
 u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
 				 __be16 sport, __be16 dport, u32 *tsoff)
 {
-	u32 secret[MD5_MESSAGE_BYTES / 4];
-	u32 hash[MD5_DIGEST_WORDS];
-	u32 i;
-
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		__be16 sport;
+		__be16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *(struct in6_addr *)saddr,
+		.daddr = *(struct in6_addr *)daddr,
+		.sport = sport,
+		.dport = dport
+	};
+	u64 hash;
 	net_secret_init();
-	memcpy(hash, saddr, 16);
-	for (i = 0; i < 4; i++)
-		secret[i] = net_secret[i] + (__force u32)daddr[i];
-	secret[4] = net_secret[4] +
-		(((__force u16)sport << 16) + (__force u16)dport);
-	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
-		secret[i] = net_secret[i];
-
-	md5_transform(hash, secret);
-
-	*tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
-	return seq_scale(hash[0]);
+	hash = siphash(&combined, offsetofend(typeof(combined), dport),
+		       &net_secret);
+	*tsoff = sysctl_tcp_timestamps == 1 ? (hash >> 32) : 0;
+	return seq_scale(hash);
 }
 EXPORT_SYMBOL(secure_tcpv6_sequence_number);
 
 u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport)
 {
-	u32 secret[MD5_MESSAGE_BYTES / 4];
-	u32 hash[MD5_DIGEST_WORDS];
-	u32 i;
-
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		__be16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *(struct in6_addr *)saddr,
+		.daddr = *(struct in6_addr *)daddr,
+		.dport = dport
+	};
 	net_secret_init();
-	memcpy(hash, saddr, 16);
-	for (i = 0; i < 4; i++)
-		secret[i] = net_secret[i] + (__force u32) daddr[i];
-	secret[4] = net_secret[4] + (__force u32)dport;
-	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
-		secret[i] = net_secret[i];
-
-	md5_transform(hash, secret);
-
-	return hash[0];
+	return siphash(&combined, offsetofend(typeof(combined), dport),
+		       &net_secret);
 }
 EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 #endif
 
 #ifdef CONFIG_INET
 
+/* secure_tcp_sequence_number(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
+ * but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
+ * it would be easy enough to have the former function use siphash_4u32, passing
+ * the arguments as separate u32.
+ */
+
 u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
 			       __be16 sport, __be16 dport, u32 *tsoff)
 {
-	u32 hash[MD5_DIGEST_WORDS];
-
+	u64 hash;
 	net_secret_init();
-	hash[0] = (__force u32)saddr;
-	hash[1] = (__force u32)daddr;
-	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
-	hash[3] = net_secret[15];
-
-	md5_transform(hash, net_secret);
-
-	*tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
-	return seq_scale(hash[0]);
+	hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+			    (__force u32)sport << 16 | (__force u32)dport,
+			    &net_secret);
+	*tsoff = sysctl_tcp_timestamps == 1 ? (hash >> 32) : 0;
+	return seq_scale(hash);
 }
 
 u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
 {
-	u32 hash[MD5_DIGEST_WORDS];
-
 	net_secret_init();
-	hash[0] = (__force u32)saddr;
-	hash[1] = (__force u32)daddr;
-	hash[2] = (__force u32)dport ^ net_secret[14];
-	hash[3] = net_secret[15];
-
-	md5_transform(hash, net_secret);
-
-	return hash[0];
+	return siphash_3u32((__force u32)saddr, (__force u32)daddr,
+			    (__force u16)dport, &net_secret);
 }
 EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
 #endif
@@ -126,21 +120,13 @@ EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
 u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 				__be16 sport, __be16 dport)
 {
-	u32 hash[MD5_DIGEST_WORDS];
 	u64 seq;
-
 	net_secret_init();
-	hash[0] = (__force u32)saddr;
-	hash[1] = (__force u32)daddr;
-	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
-	hash[3] = net_secret[15];
-
-	md5_transform(hash, net_secret);
-
-	seq = hash[0] | (((u64)hash[1]) << 32);
+	seq = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+			   (__force u32)sport << 16 | (__force u32)dport,
+			   &net_secret);
 	seq += ktime_get_real_ns();
 	seq &= (1ull << 48) - 1;
-
 	return seq;
 }
 EXPORT_SYMBOL(secure_dccp_sequence_number);
@@ -149,26 +135,23 @@ EXPORT_SYMBOL(secure_dccp_sequence_number);
 u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
 				  __be16 sport, __be16 dport)
 {
-	u32 secret[MD5_MESSAGE_BYTES / 4];
-	u32 hash[MD5_DIGEST_WORDS];
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		__be16 sport;
+		__be16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *(struct in6_addr *)saddr,
+		.daddr = *(struct in6_addr *)daddr,
+		.sport = sport,
+		.dport = dport
+	};
 	u64 seq;
-	u32 i;
-
 	net_secret_init();
-	memcpy(hash, saddr, 16);
-	for (i = 0; i < 4; i++)
-		secret[i] = net_secret[i] + (__force u32)daddr[i];
-	secret[4] = net_secret[4] +
-		(((__force u16)sport << 16) + (__force u16)dport);
-	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
-		secret[i] = net_secret[i];
-
-	md5_transform(hash, secret);
-
-	seq = hash[0] | (((u64)hash[1]) << 32);
+	seq = siphash(&combined, offsetofend(typeof(combined), dport),
+		      &net_secret);
 	seq += ktime_get_real_ns();
 	seq &= (1ull << 48) - 1;
-
 	return seq;
 }
 EXPORT_SYMBOL(secure_dccpv6_sequence_number);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 734c71468b01..f3557958e9bf 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -271,7 +271,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		atomic_set(&fclones->fclone_ref, 1);
 
 		fclones->skb2.fclone = SKB_FCLONE_CLONE;
-		fclones->skb2.pfmemalloc = pfmemalloc;
 	}
 out:
 	return skb;
@@ -655,7 +654,7 @@ static void skb_release_head_state(struct sk_buff *skb)
 		skb->destructor(skb);
 	}
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	nf_conntrack_put(skb->nfct);
+	nf_conntrack_put(skb_nfct(skb));
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	nf_bridge_put(skb->nf_bridge);
@@ -878,9 +877,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
 #ifdef CONFIG_NET_SCHED
 	CHECK_SKB_FIELD(tc_index);
-#ifdef CONFIG_NET_CLS_ACT
-	CHECK_SKB_FIELD(tc_verd);
-#endif
 #endif
 
 }
@@ -1195,10 +1191,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		     gfp_t gfp_mask)
 {
-	int i;
-	u8 *data;
-	int size = nhead + skb_end_offset(skb) + ntail;
+	int i, osize = skb_end_offset(skb);
+	int size = osize + nhead + ntail;
 	long off;
+	u8 *data;
 
 	BUG_ON(nhead < 0);
 
@@ -1260,6 +1256,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	skb->hdr_len  = 0;
 	skb->nohdr    = 0;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+	/* It is not generally safe to change skb->truesize.
+	 * For the moment, we really care of rx path, or
+	 * when skb is orphaned (not attached to a socket).
+	 */
+	if (!skb->sk || skb->destructor == sock_edemux)
+		skb->truesize += size - osize;
+
 	return 0;
 
 nofrags:
diff --git a/net/core/sock.c b/net/core/sock.c
index 4eca27dc5c94..e7d74940e863 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -222,7 +222,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
-  "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
+  "sk_lock-AF_QIPCRTR", "sk_lock-AF_SMC"     , "sk_lock-AF_MAX"
 };
 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
@@ -239,7 +239,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
-  "slock-AF_QIPCRTR", "slock-AF_MAX"
+  "slock-AF_QIPCRTR", "slock-AF_SMC"     , "slock-AF_MAX"
 };
 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
@@ -256,7 +256,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
-  "clock-AF_QIPCRTR", "clock-AF_MAX"
+  "clock-AF_QIPCRTR", "clock-AF_SMC"     , "clock-AF_MAX"
 };
 
 /*
@@ -367,7 +367,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
 		return 0;
 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
-		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
+		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 	return 0;
 }
 
@@ -502,6 +502,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 
 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 		sk_tx_queue_clear(sk);
+		sk->sk_dst_pending_confirm = 0;
 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 		dst_release(dst);
 		return NULL;
@@ -762,11 +763,8 @@ set_rcvbuf:
 		goto set_rcvbuf;
 
 	case SO_KEEPALIVE:
-#ifdef CONFIG_INET
-		if (sk->sk_protocol == IPPROTO_TCP &&
-		    sk->sk_type == SOCK_STREAM)
-			tcp_set_keepalive(sk, valbool);
-#endif
+		if (sk->sk_prot->keepalive)
+			sk->sk_prot->keepalive(sk, valbool);
 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 		break;
 
@@ -1148,7 +1146,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 			v.tm.tv_usec = 0;
 		} else {
 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
-			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
+			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
 		}
 		break;
 
@@ -1159,7 +1157,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 			v.tm.tv_usec = 0;
 		} else {
 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
-			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
+			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
 		}
 		break;
 
@@ -1522,6 +1520,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 				af_family_clock_key_strings[newsk->sk_family]);
 
 		newsk->sk_dst_cache	= NULL;
+		newsk->sk_dst_pending_confirm = 0;
 		newsk->sk_wmem_queued	= 0;
 		newsk->sk_forward_alloc = 0;
 		atomic_set(&newsk->sk_drops, 0);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e4009f62..4ead336e14ea 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
 }
 #endif
 
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret != 0)
+		return ret;
+
+	dev_rx_weight = weight_p * dev_weight_rx_bias;
+	dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+	return ret;
+}
+
 static int proc_do_rss_key(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = {
 		.data		= &weight_p,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_rx_bias",
+		.data		= &dev_weight_rx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_tx_bias",
+		.data		= &dev_weight_tx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
 	},
 	{
 		.procname	= "netdev_max_backlog",
@@ -305,6 +334,13 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "bpf_jit_kallsyms",
+		.data		= &bpf_jit_kallsyms,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec,
+	},
 # endif
 #endif
 	{
diff --git a/net/dccp/input.c b/net/dccp/input.c
index ba347184bda9..8fedc2d49770 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -606,7 +606,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
 								    skb) < 0)
 				return 1;
-			goto discard;
+			consume_skb(skb);
+			return 0;
 		}
 		if (dh->dccph_type == DCCP_PKT_RESET)
 			goto discard;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index d859a5c36e70..409d0cfd3447 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -904,7 +904,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
 	.getsockopt	   = ip_getsockopt,
 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in),
-	.bind_conflict	   = inet_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ip_setsockopt,
 	.compat_getsockopt = compat_ip_getsockopt,
@@ -1018,9 +1017,15 @@ static void __net_exit dccp_v4_exit_net(struct net *net)
 	inet_ctl_sock_destroy(net->dccp.v4_ctl_sk);
 }
 
+static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
+{
+	inet_twsk_purge(&dccp_hashinfo, AF_INET);
+}
+
 static struct pernet_operations dccp_v4_ops = {
 	.init	= dccp_v4_init_net,
 	.exit	= dccp_v4_exit_net,
+	.exit_batch = dccp_v4_exit_batch,
 };
 
 static int __init dccp_v4_init(void)
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c4e879c02186..233b57367758 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -937,7 +937,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
-	.bind_conflict	   = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
@@ -958,7 +957,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
-	.bind_conflict	   = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
@@ -1077,9 +1075,15 @@ static void __net_exit dccp_v6_exit_net(struct net *net)
 	inet_ctl_sock_destroy(net->dccp.v6_ctl_sk);
 }
 
+static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list)
+{
+	inet_twsk_purge(&dccp_hashinfo, AF_INET6);
+}
+
 static struct pernet_operations dccp_v6_ops = {
 	.init   = dccp_v6_init_net,
 	.exit   = dccp_v6_exit_net,
+	.exit_batch = dccp_v6_exit_batch,
 };
 
 static int __init dccp_v6_init(void)
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 96e47c539bee..9649238eef40 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -1,12 +1,13 @@
 config HAVE_NET_DSA
 	def_bool y
-	depends on NETDEVICES && !S390
+	depends on INET && NETDEVICES && !S390
 
 # Drivers must select NET_DSA and the appropriate tagging format
 
 config NET_DSA
 	tristate "Distributed Switch Architecture"
-	depends on HAVE_NET_DSA && NET_SWITCHDEV
+	depends on HAVE_NET_DSA
+	select NET_SWITCHDEV
 	select PHYLIB
 	---help---
 	  Say Y if you want to enable support for the hardware switches supported
@@ -14,17 +15,6 @@ config NET_DSA
 
 if NET_DSA
 
-config NET_DSA_HWMON
-	bool "Distributed Switch Architecture HWMON support"
-	default y
-	depends on HWMON && !(NET_DSA=y && HWMON=m)
-	---help---
-	  Say Y if you want to expose thermal sensor data on switches supported
-	  by the Distributed Switch Architecture.
-
-	  Some of those switches contain thermal sensors. This data is available
-	  via the hwmon sysfs interface and exposes the onboard sensors.
-
 # tagging formats
 config NET_DSA_TAG_BRCM
 	bool
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index a3380ed0e0be..31d343796251 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,6 @@
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
-dsa_core-y += dsa.o slave.o dsa2.o
+dsa_core-y += dsa.o slave.o dsa2.o switch.o
 
 # tagging formats
 dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 7899919cd9f0..b6d4f6a23f06 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -9,9 +9,7 @@
  * (at your option) any later version.
  */
 
-#include <linux/ctype.h>
 #include <linux/device.h>
-#include <linux/hwmon.h>
 #include <linux/list.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
@@ -27,8 +25,6 @@
 #include <linux/gpio/consumer.h>
 #include "dsa_priv.h"
 
-char dsa_driver_version[] = "0.1";
-
 static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
 					    struct net_device *dev)
 {
@@ -64,27 +60,27 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
 static DEFINE_MUTEX(dsa_switch_drivers_mutex);
 static LIST_HEAD(dsa_switch_drivers);
 
-void register_switch_driver(struct dsa_switch_ops *ops)
+void register_switch_driver(struct dsa_switch_driver *drv)
 {
 	mutex_lock(&dsa_switch_drivers_mutex);
-	list_add_tail(&ops->list, &dsa_switch_drivers);
+	list_add_tail(&drv->list, &dsa_switch_drivers);
 	mutex_unlock(&dsa_switch_drivers_mutex);
 }
 EXPORT_SYMBOL_GPL(register_switch_driver);
 
-void unregister_switch_driver(struct dsa_switch_ops *ops)
+void unregister_switch_driver(struct dsa_switch_driver *drv)
 {
 	mutex_lock(&dsa_switch_drivers_mutex);
-	list_del_init(&ops->list);
+	list_del_init(&drv->list);
 	mutex_unlock(&dsa_switch_drivers_mutex);
 }
 EXPORT_SYMBOL_GPL(unregister_switch_driver);
 
-static struct dsa_switch_ops *
+static const struct dsa_switch_ops *
 dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
 		 const char **_name, void **priv)
 {
-	struct dsa_switch_ops *ret;
+	const struct dsa_switch_ops *ret;
 	struct list_head *list;
 	const char *name;
 
@@ -93,9 +89,11 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
 
 	mutex_lock(&dsa_switch_drivers_mutex);
 	list_for_each(list, &dsa_switch_drivers) {
-		struct dsa_switch_ops *ops;
+		const struct dsa_switch_ops *ops;
+		struct dsa_switch_driver *drv;
 
-		ops = list_entry(list, struct dsa_switch_ops, list);
+		drv = list_entry(list, struct dsa_switch_driver, list);
+		ops = drv->ops;
 
 		name = ops->probe(parent, host_dev, sw_addr, priv);
 		if (name != NULL) {
@@ -110,109 +108,11 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
 	return ret;
 }
 
-/* hwmon support ************************************************************/
-
-#ifdef CONFIG_NET_DSA_HWMON
-
-static ssize_t temp1_input_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = ds->ops->get_temp(ds, &temp);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", temp * 1000);
-}
-static DEVICE_ATTR_RO(temp1_input);
-
-static ssize_t temp1_max_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = ds->ops->get_temp_limit(ds, &temp);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", temp * 1000);
-}
-
-static ssize_t temp1_max_store(struct device *dev,
-			       struct device_attribute *attr, const char *buf,
-			       size_t count)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = kstrtoint(buf, 0, &temp);
-	if (ret < 0)
-		return ret;
-
-	ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
-	if (ret < 0)
-		return ret;
-
-	return count;
-}
-static DEVICE_ATTR_RW(temp1_max);
-
-static ssize_t temp1_max_alarm_show(struct device *dev,
-				    struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	bool alarm;
-	int ret;
-
-	ret = ds->ops->get_temp_alarm(ds, &alarm);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", alarm);
-}
-static DEVICE_ATTR_RO(temp1_max_alarm);
-
-static struct attribute *dsa_hwmon_attrs[] = {
-	&dev_attr_temp1_input.attr,	/* 0 */
-	&dev_attr_temp1_max.attr,	/* 1 */
-	&dev_attr_temp1_max_alarm.attr,	/* 2 */
-	NULL
-};
-
-static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
-				       struct attribute *attr, int index)
-{
-	struct device *dev = container_of(kobj, struct device, kobj);
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	struct dsa_switch_ops *ops = ds->ops;
-	umode_t mode = attr->mode;
-
-	if (index == 1) {
-		if (!ops->get_temp_limit)
-			mode = 0;
-		else if (!ops->set_temp_limit)
-			mode &= ~S_IWUSR;
-	} else if (index == 2 && !ops->get_temp_alarm) {
-		mode = 0;
-	}
-	return mode;
-}
-
-static const struct attribute_group dsa_hwmon_group = {
-	.attrs = dsa_hwmon_attrs,
-	.is_visible = dsa_hwmon_attrs_visible,
-};
-__ATTRIBUTE_GROUPS(dsa_hwmon);
-
-#endif /* CONFIG_NET_DSA_HWMON */
-
 /* basic switch operations **************************************************/
 int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
-		      struct device_node *port_dn, int port)
+		      struct dsa_port *dport, int port)
 {
+	struct device_node *port_dn = dport->dn;
 	struct phy_device *phydev;
 	int ret, mode;
 
@@ -242,15 +142,15 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
 
 static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev)
 {
-	struct device_node *port_dn;
+	struct dsa_port *dport;
 	int ret, port;
 
-	for (port = 0; port < DSA_MAX_PORTS; port++) {
+	for (port = 0; port < ds->num_ports; port++) {
 		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
 			continue;
 
-		port_dn = ds->ports[port].dn;
-		ret = dsa_cpu_dsa_setup(ds, dev, port_dn, port);
+		dport = &ds->ports[port];
+		ret = dsa_cpu_dsa_setup(ds, dev, dport, port);
 		if (ret)
 			return ret;
 	}
@@ -308,7 +208,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds)
 
 static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 {
-	struct dsa_switch_ops *ops = ds->ops;
+	const struct dsa_switch_ops *ops = ds->ops;
 	struct dsa_switch_tree *dst = ds->dst;
 	struct dsa_chip_data *cd = ds->cd;
 	bool valid_name_found = false;
@@ -318,7 +218,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	/*
 	 * Validate supplied switch configuration.
 	 */
-	for (i = 0; i < DSA_MAX_PORTS; i++) {
+	for (i = 0; i < ds->num_ports; i++) {
 		char *name;
 
 		name = cd->port_names[i];
@@ -326,13 +226,12 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 			continue;
 
 		if (!strcmp(name, "cpu")) {
-			if (dst->cpu_switch != -1) {
+			if (dst->cpu_switch) {
 				netdev_err(dst->master_netdev,
 					   "multiple cpu ports?!\n");
-				ret = -EINVAL;
-				goto out;
+				return -EINVAL;
 			}
-			dst->cpu_switch = index;
+			dst->cpu_switch = ds;
 			dst->cpu_port = i;
 			ds->cpu_port_mask |= 1 << i;
 		} else if (!strcmp(name, "dsa")) {
@@ -343,10 +242,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 		valid_name_found = true;
 	}
 
-	if (!valid_name_found && i == DSA_MAX_PORTS) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (!valid_name_found && i == ds->num_ports)
+		return -EINVAL;
 
 	/* Make the built-in MII bus mask match the number of ports,
 	 * switch drivers can override this later
@@ -358,15 +255,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	 * tagging protocol to the preferred tagging format of this
 	 * switch.
 	 */
-	if (dst->cpu_switch == index) {
+	if (dst->cpu_switch == ds) {
 		enum dsa_tag_protocol tag_protocol;
 
 		tag_protocol = ops->get_tag_protocol(ds);
 		dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol);
-		if (IS_ERR(dst->tag_ops)) {
-			ret = PTR_ERR(dst->tag_ops);
-			goto out;
-		}
+		if (IS_ERR(dst->tag_ops))
+			return PTR_ERR(dst->tag_ops);
 
 		dst->rcv = dst->tag_ops->rcv;
 	}
@@ -378,85 +273,55 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	 */
 	ret = ops->setup(ds);
 	if (ret < 0)
-		goto out;
+		return ret;
+
+	ret = dsa_switch_register_notifier(ds);
+	if (ret)
+		return ret;
 
 	if (ops->set_addr) {
 		ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
 		if (ret < 0)
-			goto out;
+			return ret;
 	}
 
 	if (!ds->slave_mii_bus && ops->phy_read) {
 		ds->slave_mii_bus = devm_mdiobus_alloc(parent);
-		if (!ds->slave_mii_bus) {
-			ret = -ENOMEM;
-			goto out;
-		}
+		if (!ds->slave_mii_bus)
+			return -ENOMEM;
 		dsa_slave_mii_bus_init(ds);
 
 		ret = mdiobus_register(ds->slave_mii_bus);
 		if (ret < 0)
-			goto out;
+			return ret;
 	}
 
 	/*
 	 * Create network devices for physical switch ports.
 	 */
-	for (i = 0; i < DSA_MAX_PORTS; i++) {
+	for (i = 0; i < ds->num_ports; i++) {
 		ds->ports[i].dn = cd->port_dn[i];
 
 		if (!(ds->enabled_port_mask & (1 << i)))
 			continue;
 
 		ret = dsa_slave_create(ds, parent, i, cd->port_names[i]);
-		if (ret < 0) {
+		if (ret < 0)
 			netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n",
 				   index, i, cd->port_names[i], ret);
-			ret = 0;
-		}
 	}
 
 	/* Perform configuration of the CPU and DSA ports */
 	ret = dsa_cpu_dsa_setups(ds, parent);
-	if (ret < 0) {
+	if (ret < 0)
 		netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n",
 			   index);
-		ret = 0;
-	}
 
 	ret = dsa_cpu_port_ethtool_setup(ds);
 	if (ret)
 		return ret;
 
-#ifdef CONFIG_NET_DSA_HWMON
-	/* If the switch provides a temperature sensor,
-	 * register with hardware monitoring subsystem.
-	 * Treat registration error as non-fatal and ignore it.
-	 */
-	if (ops->get_temp) {
-		const char *netname = netdev_name(dst->master_netdev);
-		char hname[IFNAMSIZ + 1];
-		int i, j;
-
-		/* Create valid hwmon 'name' attribute */
-		for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) {
-			if (isalnum(netname[i]))
-				hname[j++] = netname[i];
-		}
-		hname[j] = '\0';
-		scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d",
-			  hname, index);
-		ds->hwmon_dev = hwmon_device_register_with_groups(NULL,
-					ds->hwmon_name, ds, dsa_hwmon_groups);
-		if (IS_ERR(ds->hwmon_dev))
-			ds->hwmon_dev = NULL;
-	}
-#endif /* CONFIG_NET_DSA_HWMON */
-
-	return ret;
-
-out:
-	return ret;
+	return 0;
 }
 
 static struct dsa_switch *
@@ -464,7 +329,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
 		 struct device *parent, struct device *host_dev)
 {
 	struct dsa_chip_data *cd = dst->pd->chip + index;
-	struct dsa_switch_ops *ops;
+	const struct dsa_switch_ops *ops;
 	struct dsa_switch *ds;
 	int ret;
 	const char *name;
@@ -486,8 +351,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
 	/*
 	 * Allocate and initialise switch state.
 	 */
-	ds = devm_kzalloc(parent, sizeof(*ds), GFP_KERNEL);
-	if (ds == NULL)
+	ds = dsa_switch_alloc(parent, DSA_MAX_PORTS);
+	if (!ds)
 		return ERR_PTR(-ENOMEM);
 
 	ds->dst = dst;
@@ -495,7 +360,6 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
 	ds->cd = cd;
 	ds->ops = ops;
 	ds->priv = priv;
-	ds->dev = parent;
 
 	ret = dsa_switch_setup_one(ds, parent);
 	if (ret)
@@ -504,8 +368,10 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
 	return ds;
 }
 
-void dsa_cpu_dsa_destroy(struct device_node *port_dn)
+void dsa_cpu_dsa_destroy(struct dsa_port *port)
 {
+	struct device_node *port_dn = port->dn;
+
 	if (of_phy_is_fixed_link(port_dn))
 		of_phy_deregister_fixed_link(port_dn);
 }
@@ -514,13 +380,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 {
 	int port;
 
-#ifdef CONFIG_NET_DSA_HWMON
-	if (ds->hwmon_dev)
-		hwmon_device_unregister(ds->hwmon_dev);
-#endif
-
 	/* Destroy network devices for physical switch ports. */
-	for (port = 0; port < DSA_MAX_PORTS; port++) {
+	for (port = 0; port < ds->num_ports; port++) {
 		if (!(ds->enabled_port_mask & (1 << port)))
 			continue;
 
@@ -531,10 +392,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 	}
 
 	/* Disable configuration of the CPU and DSA ports */
-	for (port = 0; port < DSA_MAX_PORTS; port++) {
+	for (port = 0; port < ds->num_ports; port++) {
 		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
 			continue;
-		dsa_cpu_dsa_destroy(ds->ports[port].dn);
+		dsa_cpu_dsa_destroy(&ds->ports[port]);
 
 		/* Clearing a bit which is not set does no harm */
 		ds->cpu_port_mask |= ~(1 << port);
@@ -543,6 +404,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 
 	if (ds->slave_mii_bus && ds->ops->phy_read)
 		mdiobus_unregister(ds->slave_mii_bus);
+
+	dsa_switch_unregister_notifier(ds);
 }
 
 #ifdef CONFIG_PM_SLEEP
@@ -551,7 +414,7 @@ int dsa_switch_suspend(struct dsa_switch *ds)
 	int i, ret = 0;
 
 	/* Suspend slave network devices */
-	for (i = 0; i < DSA_MAX_PORTS; i++) {
+	for (i = 0; i < ds->num_ports; i++) {
 		if (!dsa_is_port_initialized(ds, i))
 			continue;
 
@@ -578,7 +441,7 @@ int dsa_switch_resume(struct dsa_switch *ds)
 		return ret;
 
 	/* Resume slave network devices */
-	for (i = 0; i < DSA_MAX_PORTS; i++) {
+	for (i = 0; i < ds->num_ports; i++) {
 		if (!dsa_is_port_initialized(ds, i))
 			continue;
 
@@ -629,7 +492,7 @@ struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus);
 
-static struct net_device *dev_to_net_device(struct device *dev)
+struct net_device *dsa_dev_to_net_device(struct device *dev)
 {
 	struct device *d;
 
@@ -646,6 +509,7 @@ static struct net_device *dev_to_net_device(struct device *dev)
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(dsa_dev_to_net_device);
 
 #ifdef CONFIG_OF
 static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
@@ -898,7 +762,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
 
 	dst->pd = pd;
 	dst->master_netdev = dev;
-	dst->cpu_switch = -1;
 	dst->cpu_port = -1;
 
 	for (i = 0; i < pd->nr_chips; i++) {
@@ -940,9 +803,6 @@ static int dsa_probe(struct platform_device *pdev)
 	struct dsa_switch_tree *dst;
 	int ret;
 
-	pr_notice_once("Distributed Switch Architecture driver version %s\n",
-		       dsa_driver_version);
-
 	if (pdev->dev.of_node) {
 		ret = dsa_of_probe(&pdev->dev);
 		if (ret)
@@ -958,7 +818,7 @@ static int dsa_probe(struct platform_device *pdev)
 		dev = pd->of_netdev;
 		dev_hold(dev);
 	} else {
-		dev = dev_to_net_device(pd->netdev);
+		dev = dsa_dev_to_net_device(pd->netdev);
 	}
 	if (dev == NULL) {
 		ret = -EPROBE_DEFER;
@@ -1013,7 +873,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
 			dsa_switch_destroy(ds);
 	}
 
-	dsa_cpu_port_ethtool_restore(dst->ds[0]);
+	dsa_cpu_port_ethtool_restore(dst->cpu_switch);
 
 	dev_put(dst->master_netdev);
 }
@@ -1050,10 +910,6 @@ static struct packet_type dsa_pack_type __read_mostly = {
 	.func	= dsa_switch_rcv,
 };
 
-static struct notifier_block dsa_netdevice_nb __read_mostly = {
-	.notifier_call	= dsa_slave_netdevice_event,
-};
-
 #ifdef CONFIG_PM_SLEEP
 static int dsa_suspend(struct device *d)
 {
@@ -1111,7 +967,9 @@ static int __init dsa_init_module(void)
 {
 	int rc;
 
-	register_netdevice_notifier(&dsa_netdevice_nb);
+	rc = dsa_slave_register_notifier();
+	if (rc)
+		return rc;
 
 	rc = platform_driver_register(&dsa_driver);
 	if (rc)
@@ -1125,7 +983,7 @@ module_init(dsa_init_module);
 
 static void __exit dsa_cleanup_module(void)
 {
-	unregister_netdevice_notifier(&dsa_netdevice_nb);
+	dsa_slave_unregister_notifier();
 	dev_remove_pack(&dsa_pack_type);
 	platform_driver_unregister(&dsa_driver);
 }
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index da3862124545..737be6470c7f 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -57,7 +57,6 @@ static struct dsa_switch_tree *dsa_add_dst(u32 tree)
 	if (!dst)
 		return NULL;
 	dst->tree = tree;
-	dst->cpu_switch = -1;
 	INIT_LIST_HEAD(&dst->list);
 	list_add_tail(&dsa_switch_trees, &dst->list);
 	kref_init(&dst->refcount);
@@ -79,47 +78,43 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst,
 	kref_put(&dst->refcount, dsa_free_dst);
 }
 
-static bool dsa_port_is_dsa(struct device_node *port)
+/* For platform data configurations, we need to have a valid name argument to
+ * differentiate a disabled port from an enabled one
+ */
+static bool dsa_port_is_valid(struct dsa_port *port)
 {
-	const char *name;
-
-	name = of_get_property(port, "label", NULL);
-	if (!name)
-		return false;
+	return !!(port->dn || port->name);
+}
 
-	if (!strcmp(name, "dsa"))
+static bool dsa_port_is_dsa(struct dsa_port *port)
+{
+	if (port->name && !strcmp(port->name, "dsa"))
 		return true;
-
-	return false;
+	else
+		return !!of_parse_phandle(port->dn, "link", 0);
 }
 
-static bool dsa_port_is_cpu(struct device_node *port)
+static bool dsa_port_is_cpu(struct dsa_port *port)
 {
-	const char *name;
-
-	name = of_get_property(port, "label", NULL);
-	if (!name)
-		return false;
-
-	if (!strcmp(name, "cpu"))
+	if (port->name && !strcmp(port->name, "cpu"))
 		return true;
-
-	return false;
+	else
+		return !!of_parse_phandle(port->dn, "ethernet", 0);
 }
 
-static bool dsa_ds_find_port(struct dsa_switch *ds,
-			     struct device_node *port)
+static bool dsa_ds_find_port_dn(struct dsa_switch *ds,
+				struct device_node *port)
 {
 	u32 index;
 
-	for (index = 0; index < DSA_MAX_PORTS; index++)
+	for (index = 0; index < ds->num_ports; index++)
 		if (ds->ports[index].dn == port)
 			return true;
 	return false;
 }
 
-static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
-					    struct device_node *port)
+static struct dsa_switch *dsa_dst_find_port_dn(struct dsa_switch_tree *dst,
+					       struct device_node *port)
 {
 	struct dsa_switch *ds;
 	u32 index;
@@ -129,7 +124,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
 		if (!ds)
 			continue;
 
-		if (dsa_ds_find_port(ds, port))
+		if (dsa_ds_find_port_dn(ds, port))
 			return ds;
 	}
 
@@ -138,7 +133,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
 
 static int dsa_port_complete(struct dsa_switch_tree *dst,
 			     struct dsa_switch *src_ds,
-			     struct device_node *port,
+			     struct dsa_port *port,
 			     u32 src_port)
 {
 	struct device_node *link;
@@ -146,11 +141,11 @@ static int dsa_port_complete(struct dsa_switch_tree *dst,
 	struct dsa_switch *dst_ds;
 
 	for (index = 0;; index++) {
-		link = of_parse_phandle(port, "link", index);
+		link = of_parse_phandle(port->dn, "link", index);
 		if (!link)
 			break;
 
-		dst_ds = dsa_dst_find_port(dst, link);
+		dst_ds = dsa_dst_find_port_dn(dst, link);
 		of_node_put(link);
 
 		if (!dst_ds)
@@ -169,13 +164,13 @@ static int dsa_port_complete(struct dsa_switch_tree *dst,
  */
 static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 {
-	struct device_node *port;
+	struct dsa_port *port;
 	u32 index;
 	int err;
 
-	for (index = 0; index < DSA_MAX_PORTS; index++) {
-		port = ds->ports[index].dn;
-		if (!port)
+	for (index = 0; index < ds->num_ports; index++) {
+		port = &ds->ports[index];
+		if (!dsa_port_is_valid(port))
 			continue;
 
 		if (!dsa_port_is_dsa(port))
@@ -215,7 +210,7 @@ static int dsa_dst_complete(struct dsa_switch_tree *dst)
 	return 0;
 }
 
-static int dsa_dsa_port_apply(struct device_node *port, u32 index,
+static int dsa_dsa_port_apply(struct dsa_port *port, u32 index,
 			      struct dsa_switch *ds)
 {
 	int err;
@@ -230,13 +225,13 @@ static int dsa_dsa_port_apply(struct device_node *port, u32 index,
 	return 0;
 }
 
-static void dsa_dsa_port_unapply(struct device_node *port, u32 index,
+static void dsa_dsa_port_unapply(struct dsa_port *port, u32 index,
 				 struct dsa_switch *ds)
 {
 	dsa_cpu_dsa_destroy(port);
 }
 
-static int dsa_cpu_port_apply(struct device_node *port, u32 index,
+static int dsa_cpu_port_apply(struct dsa_port *port, u32 index,
 			      struct dsa_switch *ds)
 {
 	int err;
@@ -253,7 +248,7 @@ static int dsa_cpu_port_apply(struct device_node *port, u32 index,
 	return 0;
 }
 
-static void dsa_cpu_port_unapply(struct device_node *port, u32 index,
+static void dsa_cpu_port_unapply(struct dsa_port *port, u32 index,
 				 struct dsa_switch *ds)
 {
 	dsa_cpu_dsa_destroy(port);
@@ -261,25 +256,29 @@ static void dsa_cpu_port_unapply(struct device_node *port, u32 index,
 
 }
 
-static int dsa_user_port_apply(struct device_node *port, u32 index,
+static int dsa_user_port_apply(struct dsa_port *port, u32 index,
 			       struct dsa_switch *ds)
 {
-	const char *name;
+	const char *name = port->name;
 	int err;
 
-	name = of_get_property(port, "label", NULL);
+	if (port->dn)
+		name = of_get_property(port->dn, "label", NULL);
+	if (!name)
+		name = "eth%d";
 
 	err = dsa_slave_create(ds, ds->dev, index, name);
 	if (err) {
 		dev_warn(ds->dev, "Failed to create slave %d: %d\n",
 			 index, err);
+		ds->ports[index].netdev = NULL;
 		return err;
 	}
 
 	return 0;
 }
 
-static void dsa_user_port_unapply(struct device_node *port, u32 index,
+static void dsa_user_port_unapply(struct dsa_port *port, u32 index,
 				  struct dsa_switch *ds)
 {
 	if (ds->ports[index].netdev) {
@@ -291,7 +290,7 @@ static void dsa_user_port_unapply(struct device_node *port, u32 index,
 
 static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 {
-	struct device_node *port;
+	struct dsa_port *port;
 	u32 index;
 	int err;
 
@@ -306,6 +305,10 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 	if (err < 0)
 		return err;
 
+	err = dsa_switch_register_notifier(ds);
+	if (err)
+		return err;
+
 	if (ds->ops->set_addr) {
 		err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
 		if (err < 0)
@@ -324,9 +327,9 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 			return err;
 	}
 
-	for (index = 0; index < DSA_MAX_PORTS; index++) {
-		port = ds->ports[index].dn;
-		if (!port)
+	for (index = 0; index < ds->num_ports; index++) {
+		port = &ds->ports[index];
+		if (!dsa_port_is_valid(port))
 			continue;
 
 		if (dsa_port_is_dsa(port)) {
@@ -353,12 +356,12 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 
 static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 {
-	struct device_node *port;
+	struct dsa_port *port;
 	u32 index;
 
-	for (index = 0; index < DSA_MAX_PORTS; index++) {
-		port = ds->ports[index].dn;
-		if (!port)
+	for (index = 0; index < ds->num_ports; index++) {
+		port = &ds->ports[index];
+		if (!dsa_port_is_valid(port))
 			continue;
 
 		if (dsa_port_is_dsa(port)) {
@@ -376,6 +379,8 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 
 	if (ds->slave_mii_bus && ds->ops->phy_read)
 		mdiobus_unregister(ds->slave_mii_bus);
+
+	dsa_switch_unregister_notifier(ds);
 }
 
 static int dsa_dst_apply(struct dsa_switch_tree *dst)
@@ -394,8 +399,8 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst)
 			return err;
 	}
 
-	if (dst->ds[0]) {
-		err = dsa_cpu_port_ethtool_setup(dst->ds[0]);
+	if (dst->cpu_switch) {
+		err = dsa_cpu_port_ethtool_setup(dst->cpu_switch);
 		if (err)
 			return err;
 	}
@@ -435,14 +440,14 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
 		dsa_ds_unapply(dst, ds);
 	}
 
-	if (dst->ds[0])
-		dsa_cpu_port_ethtool_restore(dst->ds[0]);
+	if (dst->cpu_switch)
+		dsa_cpu_port_ethtool_restore(dst->cpu_switch);
 
 	pr_info("DSA: tree %d unapplied\n", dst->tree);
 	dst->applied = false;
 }
 
-static int dsa_cpu_parse(struct device_node *port, u32 index,
+static int dsa_cpu_parse(struct dsa_port *port, u32 index,
 			 struct dsa_switch_tree *dst,
 			 struct dsa_switch *ds)
 {
@@ -450,11 +455,16 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
 	struct net_device *ethernet_dev;
 	struct device_node *ethernet;
 
-	ethernet = of_parse_phandle(port, "ethernet", 0);
-	if (!ethernet)
-		return -EINVAL;
+	if (port->dn) {
+		ethernet = of_parse_phandle(port->dn, "ethernet", 0);
+		if (!ethernet)
+			return -EINVAL;
+		ethernet_dev = of_find_net_device_by_node(ethernet);
+	} else {
+		ethernet_dev = dsa_dev_to_net_device(ds->cd->netdev[index]);
+		dev_put(ethernet_dev);
+	}
 
-	ethernet_dev = of_find_net_device_by_node(ethernet);
 	if (!ethernet_dev)
 		return -EPROBE_DEFER;
 
@@ -464,8 +474,8 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
 	if (!dst->master_netdev)
 		dst->master_netdev = ethernet_dev;
 
-	if (dst->cpu_switch == -1) {
-		dst->cpu_switch = ds->index;
+	if (!dst->cpu_switch) {
+		dst->cpu_switch = ds;
 		dst->cpu_port = index;
 	}
 
@@ -483,13 +493,13 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
 
 static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds)
 {
-	struct device_node *port;
+	struct dsa_port *port;
 	u32 index;
 	int err;
 
-	for (index = 0; index < DSA_MAX_PORTS; index++) {
-		port = ds->ports[index].dn;
-		if (!port)
+	for (index = 0; index < ds->num_ports; index++) {
+		port = &ds->ports[index];
+		if (!dsa_port_is_valid(port))
 			continue;
 
 		if (dsa_port_is_cpu(port)) {
@@ -541,7 +551,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
 		if (err)
 			return err;
 
-		if (reg >= DSA_MAX_PORTS)
+		if (reg >= ds->num_ports)
 			return -EINVAL;
 
 		ds->ports[reg].dn = port;
@@ -550,14 +560,41 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
 		 * to have access to a correct value, just like what
 		 * net/dsa/dsa.c::dsa_switch_setup_one does.
 		 */
-		if (!dsa_port_is_cpu(port))
+		if (!dsa_port_is_cpu(&ds->ports[reg]))
 			ds->enabled_port_mask |= 1 << reg;
 	}
 
 	return 0;
 }
 
-static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index)
+static int dsa_parse_ports(struct dsa_chip_data *cd, struct dsa_switch *ds)
+{
+	bool valid_name_found = false;
+	unsigned int i;
+
+	for (i = 0; i < DSA_MAX_PORTS; i++) {
+		if (!cd->port_names[i])
+			continue;
+
+		ds->ports[i].name = cd->port_names[i];
+
+		/* Initialize enabled_port_mask now for drv->setup()
+		 * to have access to a correct value, just like what
+		 * net/dsa/dsa.c::dsa_switch_setup_one does.
+		 */
+		if (!dsa_port_is_cpu(&ds->ports[i]))
+			ds->enabled_port_mask |= 1 << i;
+
+		valid_name_found = true;
+	}
+
+	if (!valid_name_found && i == DSA_MAX_PORTS)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int dsa_parse_member_dn(struct device_node *np, u32 *tree, u32 *index)
 {
 	int err;
 
@@ -581,6 +618,18 @@ static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index)
 	return 0;
 }
 
+static int dsa_parse_member(struct dsa_chip_data *pd, u32 *tree, u32 *index)
+{
+	if (!pd)
+		return -ENODEV;
+
+	/* We do not support complex trees with dsa_chip_data */
+	*tree = 0;
+	*index = 0;
+
+	return 0;
+}
+
 static struct device_node *dsa_get_ports(struct dsa_switch *ds,
 					 struct device_node *np)
 {
@@ -595,23 +644,36 @@ static struct device_node *dsa_get_ports(struct dsa_switch *ds,
 	return ports;
 }
 
-static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
+static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev)
 {
-	struct device_node *ports = dsa_get_ports(ds, np);
+	struct dsa_chip_data *pdata = dev->platform_data;
+	struct device_node *np = dev->of_node;
 	struct dsa_switch_tree *dst;
+	struct device_node *ports;
 	u32 tree, index;
 	int i, err;
 
-	err = dsa_parse_member(np, &tree, &index);
-	if (err)
-		return err;
+	if (np) {
+		err = dsa_parse_member_dn(np, &tree, &index);
+		if (err)
+			return err;
 
-	if (IS_ERR(ports))
-		return PTR_ERR(ports);
+		ports = dsa_get_ports(ds, np);
+		if (IS_ERR(ports))
+			return PTR_ERR(ports);
 
-	err = dsa_parse_ports_dn(ports, ds);
-	if (err)
-		return err;
+		err = dsa_parse_ports_dn(ports, ds);
+		if (err)
+			return err;
+	} else {
+		err = dsa_parse_member(pdata, &tree, &index);
+		if (err)
+			return err;
+
+		err = dsa_parse_ports(pdata, ds);
+		if (err)
+			return err;
+	}
 
 	dst = dsa_get_dst(tree);
 	if (!dst) {
@@ -627,6 +689,7 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
 
 	ds->dst = dst;
 	ds->index = index;
+	ds->cd = pdata;
 
 	/* Initialize the routing table */
 	for (i = 0; i < DSA_MAX_SWITCHES; ++i)
@@ -650,8 +713,14 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
 	}
 
 	err = dsa_dst_parse(dst);
-	if (err)
+	if (err) {
+		if (err == -EPROBE_DEFER) {
+			dsa_dst_del_ds(dst, ds, ds->index);
+			return err;
+		}
+
 		goto out_del_dst;
+	}
 
 	err = dsa_dst_apply(dst);
 	if (err) {
@@ -670,12 +739,34 @@ out:
 	return err;
 }
 
-int dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
+struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
+{
+	size_t size = sizeof(struct dsa_switch) + n * sizeof(struct dsa_port);
+	struct dsa_switch *ds;
+	int i;
+
+	ds = devm_kzalloc(dev, size, GFP_KERNEL);
+	if (!ds)
+		return NULL;
+
+	ds->dev = dev;
+	ds->num_ports = n;
+
+	for (i = 0; i < ds->num_ports; ++i) {
+		ds->ports[i].index = i;
+		ds->ports[i].ds = ds;
+	}
+
+	return ds;
+}
+EXPORT_SYMBOL_GPL(dsa_switch_alloc);
+
+int dsa_register_switch(struct dsa_switch *ds, struct device *dev)
 {
 	int err;
 
 	mutex_lock(&dsa2_mutex);
-	err = _dsa_register_switch(ds, np);
+	err = _dsa_register_switch(ds, dev);
 	mutex_unlock(&dsa2_mutex);
 
 	return err;
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 6cfd7388834e..0706a511244e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -25,12 +25,8 @@ struct dsa_slave_priv {
 	struct sk_buff *	(*xmit)(struct sk_buff *skb,
 					struct net_device *dev);
 
-	/*
-	 * Which switch this port is a part of, and the port index
-	 * for this port.
-	 */
-	struct dsa_switch	*parent;
-	u8			port;
+	/* DSA port data, such as switch, port index, etc. */
+	struct dsa_port		*dp;
 
 	/*
 	 * The phylib phy_device pointer for the PHY connected
@@ -42,17 +38,18 @@ struct dsa_slave_priv {
 	int			old_pause;
 	int			old_duplex;
 
-	struct net_device	*bridge_dev;
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	struct netpoll		*netpoll;
 #endif
+
+	/* TC context */
+	struct list_head	mall_tc_list;
 };
 
 /* dsa.c */
-extern char dsa_driver_version[];
 int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
-		      struct device_node *port_dn, int port);
-void dsa_cpu_dsa_destroy(struct device_node *port_dn);
+		      struct dsa_port *dport, int port);
+void dsa_cpu_dsa_destroy(struct dsa_port *dport);
 const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
 int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds);
 void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds);
@@ -66,8 +63,12 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 void dsa_slave_destroy(struct net_device *slave_dev);
 int dsa_slave_suspend(struct net_device *slave_dev);
 int dsa_slave_resume(struct net_device *slave_dev);
-int dsa_slave_netdevice_event(struct notifier_block *unused,
-			      unsigned long event, void *ptr);
+int dsa_slave_register_notifier(void);
+void dsa_slave_unregister_notifier(void);
+
+/* switch.c */
+int dsa_switch_register_notifier(struct dsa_switch *ds);
+void dsa_switch_unregister_notifier(struct dsa_switch *ds);
 
 /* tag_dsa.c */
 extern const struct dsa_device_ops dsa_netdev_ops;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 7d4596110851..c34872e1febc 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -16,12 +16,28 @@
 #include <linux/of_net.h>
 #include <linux/of_mdio.h>
 #include <linux/mdio.h>
+#include <linux/list.h>
 #include <net/rtnetlink.h>
 #include <net/switchdev.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_mirred.h>
 #include <linux/if_bridge.h>
 #include <linux/netpoll.h>
 #include "dsa_priv.h"
 
+static bool dsa_slave_dev_check(struct net_device *dev);
+
+static int dsa_slave_notify(struct net_device *dev, unsigned long e, void *v)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct raw_notifier_head *nh = &p->dp->ds->dst->nh;
+	int err;
+
+	err = raw_notifier_call_chain(nh, e, v);
+
+	return notifier_to_errno(err);
+}
+
 /* slave mii_bus handling ***************************************************/
 static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
 {
@@ -61,17 +77,20 @@ static int dsa_slave_get_iflink(const struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
 
-	return p->parent->dst->master_netdev->ifindex;
+	return p->dp->ds->dst->master_netdev->ifindex;
 }
 
-static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p)
+static inline bool dsa_port_is_bridged(struct dsa_port *dp)
 {
-	return !!p->bridge_dev;
+	return !!dp->bridge_dev;
 }
 
-static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
+static void dsa_slave_set_state(struct net_device *dev, u8 state)
 {
-	struct dsa_port *dp = &ds->ports[port];
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_port *dp = p->dp;
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
 
 	if (ds->ops->port_stp_state_set)
 		ds->ops->port_stp_state_set(ds, port, state);
@@ -96,9 +115,9 @@ static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
 static int dsa_slave_open(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct net_device *master = p->parent->dst->master_netdev;
-	struct dsa_switch *ds = p->parent;
-	u8 stp_state = dsa_port_is_bridged(p) ?
+	struct net_device *master = p->dp->ds->dst->master_netdev;
+	struct dsa_switch *ds = p->dp->ds;
+	u8 stp_state = dsa_port_is_bridged(p->dp) ?
 			BR_STATE_BLOCKING : BR_STATE_FORWARDING;
 	int err;
 
@@ -123,12 +142,12 @@ static int dsa_slave_open(struct net_device *dev)
 	}
 
 	if (ds->ops->port_enable) {
-		err = ds->ops->port_enable(ds, p->port, p->phy);
+		err = ds->ops->port_enable(ds, p->dp->index, p->phy);
 		if (err)
 			goto clear_promisc;
 	}
 
-	dsa_port_set_stp_state(ds, p->port, stp_state);
+	dsa_slave_set_state(dev, stp_state);
 
 	if (p->phy)
 		phy_start(p->phy);
@@ -151,8 +170,8 @@ out:
 static int dsa_slave_close(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct net_device *master = p->parent->dst->master_netdev;
-	struct dsa_switch *ds = p->parent;
+	struct net_device *master = p->dp->ds->dst->master_netdev;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (p->phy)
 		phy_stop(p->phy);
@@ -168,9 +187,9 @@ static int dsa_slave_close(struct net_device *dev)
 		dev_uc_del(master, dev->dev_addr);
 
 	if (ds->ops->port_disable)
-		ds->ops->port_disable(ds, p->port, p->phy);
+		ds->ops->port_disable(ds, p->dp->index, p->phy);
 
-	dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED);
+	dsa_slave_set_state(dev, BR_STATE_DISABLED);
 
 	return 0;
 }
@@ -178,7 +197,7 @@ static int dsa_slave_close(struct net_device *dev)
 static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct net_device *master = p->parent->dst->master_netdev;
+	struct net_device *master = p->dp->ds->dst->master_netdev;
 
 	if (change & IFF_ALLMULTI)
 		dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
@@ -189,7 +208,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
 static void dsa_slave_set_rx_mode(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct net_device *master = p->parent->dst->master_netdev;
+	struct net_device *master = p->dp->ds->dst->master_netdev;
 
 	dev_mc_sync(master, dev);
 	dev_uc_sync(master, dev);
@@ -198,7 +217,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev)
 static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct net_device *master = p->parent->dst->master_netdev;
+	struct net_device *master = p->dp->ds->dst->master_netdev;
 	struct sockaddr *addr = a;
 	int err;
 
@@ -228,16 +247,17 @@ static int dsa_slave_port_vlan_add(struct net_device *dev,
 				   struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_port *dp = p->dp;
+	struct dsa_switch *ds = dp->ds;
 
 	if (switchdev_trans_ph_prepare(trans)) {
 		if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
 			return -EOPNOTSUPP;
 
-		return ds->ops->port_vlan_prepare(ds, p->port, vlan, trans);
+		return ds->ops->port_vlan_prepare(ds, dp->index, vlan, trans);
 	}
 
-	ds->ops->port_vlan_add(ds, p->port, vlan, trans);
+	ds->ops->port_vlan_add(ds, dp->index, vlan, trans);
 
 	return 0;
 }
@@ -246,12 +266,12 @@ static int dsa_slave_port_vlan_del(struct net_device *dev,
 				   const struct switchdev_obj_port_vlan *vlan)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (!ds->ops->port_vlan_del)
 		return -EOPNOTSUPP;
 
-	return ds->ops->port_vlan_del(ds, p->port, vlan);
+	return ds->ops->port_vlan_del(ds, p->dp->index, vlan);
 }
 
 static int dsa_slave_port_vlan_dump(struct net_device *dev,
@@ -259,10 +279,10 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
 				    switchdev_obj_dump_cb_t *cb)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->port_vlan_dump)
-		return ds->ops->port_vlan_dump(ds, p->port, vlan, cb);
+		return ds->ops->port_vlan_dump(ds, p->dp->index, vlan, cb);
 
 	return -EOPNOTSUPP;
 }
@@ -272,16 +292,16 @@ static int dsa_slave_port_fdb_add(struct net_device *dev,
 				  struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (switchdev_trans_ph_prepare(trans)) {
 		if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add)
 			return -EOPNOTSUPP;
 
-		return ds->ops->port_fdb_prepare(ds, p->port, fdb, trans);
+		return ds->ops->port_fdb_prepare(ds, p->dp->index, fdb, trans);
 	}
 
-	ds->ops->port_fdb_add(ds, p->port, fdb, trans);
+	ds->ops->port_fdb_add(ds, p->dp->index, fdb, trans);
 
 	return 0;
 }
@@ -290,11 +310,11 @@ static int dsa_slave_port_fdb_del(struct net_device *dev,
 				  const struct switchdev_obj_port_fdb *fdb)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	int ret = -EOPNOTSUPP;
 
 	if (ds->ops->port_fdb_del)
-		ret = ds->ops->port_fdb_del(ds, p->port, fdb);
+		ret = ds->ops->port_fdb_del(ds, p->dp->index, fdb);
 
 	return ret;
 }
@@ -304,10 +324,10 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev,
 				   switchdev_obj_dump_cb_t *cb)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->port_fdb_dump)
-		return ds->ops->port_fdb_dump(ds, p->port, fdb, cb);
+		return ds->ops->port_fdb_dump(ds, p->dp->index, fdb, cb);
 
 	return -EOPNOTSUPP;
 }
@@ -317,16 +337,16 @@ static int dsa_slave_port_mdb_add(struct net_device *dev,
 				  struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (switchdev_trans_ph_prepare(trans)) {
 		if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
 			return -EOPNOTSUPP;
 
-		return ds->ops->port_mdb_prepare(ds, p->port, mdb, trans);
+		return ds->ops->port_mdb_prepare(ds, p->dp->index, mdb, trans);
 	}
 
-	ds->ops->port_mdb_add(ds, p->port, mdb, trans);
+	ds->ops->port_mdb_add(ds, p->dp->index, mdb, trans);
 
 	return 0;
 }
@@ -335,10 +355,10 @@ static int dsa_slave_port_mdb_del(struct net_device *dev,
 				  const struct switchdev_obj_port_mdb *mdb)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->port_mdb_del)
-		return ds->ops->port_mdb_del(ds, p->port, mdb);
+		return ds->ops->port_mdb_del(ds, p->dp->index, mdb);
 
 	return -EOPNOTSUPP;
 }
@@ -348,10 +368,10 @@ static int dsa_slave_port_mdb_dump(struct net_device *dev,
 				   switchdev_obj_dump_cb_t *cb)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->port_mdb_dump)
-		return ds->ops->port_mdb_dump(ds, p->port, mdb, cb);
+		return ds->ops->port_mdb_dump(ds, p->dp->index, mdb, cb);
 
 	return -EOPNOTSUPP;
 }
@@ -371,12 +391,12 @@ static int dsa_slave_stp_state_set(struct net_device *dev,
 				   struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (switchdev_trans_ph_prepare(trans))
 		return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP;
 
-	dsa_port_set_stp_state(ds, p->port, attr->u.stp_state);
+	dsa_slave_set_state(dev, attr->u.stp_state);
 
 	return 0;
 }
@@ -386,14 +406,14 @@ static int dsa_slave_vlan_filtering(struct net_device *dev,
 				    struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	/* bridge skips -EOPNOTSUPP, so skip the prepare phase */
 	if (switchdev_trans_ph_prepare(trans))
 		return 0;
 
 	if (ds->ops->port_vlan_filtering)
-		return ds->ops->port_vlan_filtering(ds, p->port,
+		return ds->ops->port_vlan_filtering(ds, p->dp->index,
 						    attr->u.vlan_filtering);
 
 	return 0;
@@ -404,7 +424,7 @@ static int dsa_fastest_ageing_time(struct dsa_switch *ds,
 {
 	int i;
 
-	for (i = 0; i < DSA_MAX_PORTS; ++i) {
+	for (i = 0; i < ds->num_ports; ++i) {
 		struct dsa_port *dp = &ds->ports[i];
 
 		if (dp && dp->ageing_time && dp->ageing_time < ageing_time)
@@ -419,7 +439,7 @@ static int dsa_slave_ageing_time(struct net_device *dev,
 				 struct switchdev_trans *trans)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time);
 	unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies);
 
@@ -428,7 +448,7 @@ static int dsa_slave_ageing_time(struct net_device *dev,
 		return 0;
 
 	/* Keep the fastest ageing time in case of multiple bridges */
-	ds->ports[p->port].ageing_time = ageing_time;
+	p->dp->ageing_time = ageing_time;
 	ageing_time = dsa_fastest_ageing_time(ds, ageing_time);
 
 	if (ds->ops->set_ageing_time)
@@ -553,39 +573,58 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 				      struct net_device *br)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
-	int ret = -EOPNOTSUPP;
+	struct dsa_notifier_bridge_info info = {
+		.sw_index = p->dp->ds->index,
+		.port = p->dp->index,
+		.br = br,
+	};
+	int err;
+
+	/* Here the port is already bridged. Reflect the current configuration
+	 * so that drivers can program their chips accordingly.
+	 */
+	p->dp->bridge_dev = br;
 
-	p->bridge_dev = br;
+	err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_JOIN, &info);
 
-	if (ds->ops->port_bridge_join)
-		ret = ds->ops->port_bridge_join(ds, p->port, br);
+	/* The bridging is rolled back on error */
+	if (err)
+		p->dp->bridge_dev = NULL;
 
-	return ret == -EOPNOTSUPP ? 0 : ret;
+	return err;
 }
 
-static void dsa_slave_bridge_port_leave(struct net_device *dev)
+static void dsa_slave_bridge_port_leave(struct net_device *dev,
+					struct net_device *br)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
-
+	struct dsa_notifier_bridge_info info = {
+		.sw_index = p->dp->ds->index,
+		.port = p->dp->index,
+		.br = br,
+	};
+	int err;
 
-	if (ds->ops->port_bridge_leave)
-		ds->ops->port_bridge_leave(ds, p->port);
+	/* Here the port is already unbridged. Reflect the current configuration
+	 * so that drivers can program their chips accordingly.
+	 */
+	p->dp->bridge_dev = NULL;
 
-	p->bridge_dev = NULL;
+	err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_LEAVE, &info);
+	if (err)
+		netdev_err(dev, "failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n");
 
 	/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
 	 * so allow it to be in BR_STATE_FORWARDING to be kept functional
 	 */
-	dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING);
+	dsa_slave_set_state(dev, BR_STATE_FORWARDING);
 }
 
 static int dsa_slave_port_attr_get(struct net_device *dev,
 				   struct switchdev_attr *attr)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	switch (attr->id) {
 	case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
@@ -633,7 +672,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Queue the SKB for transmission on the parent interface, but
 	 * do not modify its EtherType
 	 */
-	nskb->dev = p->parent->dst->master_netdev;
+	nskb->dev = p->dp->ds->dst->master_netdev;
 	dev_queue_xmit(nskb);
 
 	return NETDEV_TX_OK;
@@ -645,14 +684,10 @@ dsa_slave_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *cmd)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	int err;
+	int err = -EOPNOTSUPP;
 
-	err = -EOPNOTSUPP;
-	if (p->phy != NULL) {
-		err = phy_read_status(p->phy);
-		if (err == 0)
-			err = phy_ethtool_ksettings_get(p->phy, cmd);
-	}
+	if (p->phy != NULL)
+		err = phy_ethtool_ksettings_get(p->phy, cmd);
 
 	return err;
 }
@@ -673,7 +708,6 @@ static void dsa_slave_get_drvinfo(struct net_device *dev,
 				  struct ethtool_drvinfo *drvinfo)
 {
 	strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
-	strlcpy(drvinfo->version, dsa_driver_version, sizeof(drvinfo->version));
 	strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
 	strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
 }
@@ -681,10 +715,10 @@ static void dsa_slave_get_drvinfo(struct net_device *dev,
 static int dsa_slave_get_regs_len(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->get_regs_len)
-		return ds->ops->get_regs_len(ds, p->port);
+		return ds->ops->get_regs_len(ds, p->dp->index);
 
 	return -EOPNOTSUPP;
 }
@@ -693,10 +727,10 @@ static void
 dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->get_regs)
-		ds->ops->get_regs(ds, p->port, regs, _p);
+		ds->ops->get_regs(ds, p->dp->index, regs, _p);
 }
 
 static int dsa_slave_nway_reset(struct net_device *dev)
@@ -724,7 +758,7 @@ static u32 dsa_slave_get_link(struct net_device *dev)
 static int dsa_slave_get_eeprom_len(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->cd && ds->cd->eeprom_len)
 		return ds->cd->eeprom_len;
@@ -739,7 +773,7 @@ static int dsa_slave_get_eeprom(struct net_device *dev,
 				struct ethtool_eeprom *eeprom, u8 *data)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->get_eeprom)
 		return ds->ops->get_eeprom(ds, eeprom, data);
@@ -751,7 +785,7 @@ static int dsa_slave_set_eeprom(struct net_device *dev,
 				struct ethtool_eeprom *eeprom, u8 *data)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->set_eeprom)
 		return ds->ops->set_eeprom(ds, eeprom, data);
@@ -763,7 +797,7 @@ static void dsa_slave_get_strings(struct net_device *dev,
 				  uint32_t stringset, uint8_t *data)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (stringset == ETH_SS_STATS) {
 		int len = ETH_GSTRING_LEN;
@@ -773,7 +807,7 @@ static void dsa_slave_get_strings(struct net_device *dev,
 		strncpy(data + 2 * len, "rx_packets", len);
 		strncpy(data + 3 * len, "rx_bytes", len);
 		if (ds->ops->get_strings)
-			ds->ops->get_strings(ds, p->port, data + 4 * len);
+			ds->ops->get_strings(ds, p->dp->index, data + 4 * len);
 	}
 }
 
@@ -782,7 +816,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
 					   uint64_t *data)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_switch *ds = dst->ds[0];
+	struct dsa_switch *ds = dst->cpu_switch;
 	s8 cpu_port = dst->cpu_port;
 	int count = 0;
 
@@ -799,7 +833,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
 static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_switch *ds = dst->ds[0];
+	struct dsa_switch *ds = dst->cpu_switch;
 	int count = 0;
 
 	if (dst->master_ethtool_ops.get_sset_count)
@@ -815,7 +849,7 @@ static void dsa_cpu_port_get_strings(struct net_device *dev,
 				     uint32_t stringset, uint8_t *data)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_switch *ds = dst->ds[0];
+	struct dsa_switch *ds = dst->cpu_switch;
 	s8 cpu_port = dst->cpu_port;
 	int len = ETH_GSTRING_LEN;
 	int mcount = 0, count;
@@ -854,20 +888,20 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
 					uint64_t *data)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	data[0] = dev->stats.tx_packets;
 	data[1] = dev->stats.tx_bytes;
 	data[2] = dev->stats.rx_packets;
 	data[3] = dev->stats.rx_bytes;
 	if (ds->ops->get_ethtool_stats)
-		ds->ops->get_ethtool_stats(ds, p->port, data + 4);
+		ds->ops->get_ethtool_stats(ds, p->dp->index, data + 4);
 }
 
 static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (sset == ETH_SS_STATS) {
 		int count;
@@ -885,20 +919,20 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
 static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	if (ds->ops->get_wol)
-		ds->ops->get_wol(ds, p->port, w);
+		ds->ops->get_wol(ds, p->dp->index, w);
 }
 
 static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	int ret = -EOPNOTSUPP;
 
 	if (ds->ops->set_wol)
-		ret = ds->ops->set_wol(ds, p->port, w);
+		ret = ds->ops->set_wol(ds, p->dp->index, w);
 
 	return ret;
 }
@@ -906,13 +940,13 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	int ret;
 
 	if (!ds->ops->set_eee)
 		return -EOPNOTSUPP;
 
-	ret = ds->ops->set_eee(ds, p->port, p->phy, e);
+	ret = ds->ops->set_eee(ds, p->dp->index, p->phy, e);
 	if (ret)
 		return ret;
 
@@ -925,13 +959,13 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
 static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	int ret;
 
 	if (!ds->ops->get_eee)
 		return -EOPNOTSUPP;
 
-	ret = ds->ops->get_eee(ds, p->port, e);
+	ret = ds->ops->get_eee(ds, p->dp->index, e);
 	if (ret)
 		return ret;
 
@@ -946,7 +980,7 @@ static int dsa_slave_netpoll_setup(struct net_device *dev,
 				   struct netpoll_info *ni)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	struct net_device *master = ds->dst->master_netdev;
 	struct netpoll *netpoll;
 	int err = 0;
@@ -984,6 +1018,144 @@ static void dsa_slave_poll_controller(struct net_device *dev)
 }
 #endif
 
+static int dsa_slave_get_phys_port_name(struct net_device *dev,
+					char *name, size_t len)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
+	if (snprintf(name, len, "p%d", p->dp->index) >= len)
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct dsa_mall_tc_entry *
+dsa_slave_mall_tc_entry_find(struct dsa_slave_priv *p,
+			     unsigned long cookie)
+{
+	struct dsa_mall_tc_entry *mall_tc_entry;
+
+	list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
+		if (mall_tc_entry->cookie == cookie)
+			return mall_tc_entry;
+
+	return NULL;
+}
+
+static int dsa_slave_add_cls_matchall(struct net_device *dev,
+				      __be16 protocol,
+				      struct tc_cls_matchall_offload *cls,
+				      bool ingress)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_mall_tc_entry *mall_tc_entry;
+	struct dsa_switch *ds = p->dp->ds;
+	struct net *net = dev_net(dev);
+	struct dsa_slave_priv *to_p;
+	struct net_device *to_dev;
+	const struct tc_action *a;
+	int err = -EOPNOTSUPP;
+	LIST_HEAD(actions);
+	int ifindex;
+
+	if (!ds->ops->port_mirror_add)
+		return err;
+
+	if (!tc_single_action(cls->exts))
+		return err;
+
+	tcf_exts_to_list(cls->exts, &actions);
+	a = list_first_entry(&actions, struct tc_action, list);
+
+	if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
+		struct dsa_mall_mirror_tc_entry *mirror;
+
+		ifindex = tcf_mirred_ifindex(a);
+		to_dev = __dev_get_by_index(net, ifindex);
+		if (!to_dev)
+			return -EINVAL;
+
+		if (!dsa_slave_dev_check(to_dev))
+			return -EOPNOTSUPP;
+
+		mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+		if (!mall_tc_entry)
+			return -ENOMEM;
+
+		mall_tc_entry->cookie = cls->cookie;
+		mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
+		mirror = &mall_tc_entry->mirror;
+
+		to_p = netdev_priv(to_dev);
+
+		mirror->to_local_port = to_p->dp->index;
+		mirror->ingress = ingress;
+
+		err = ds->ops->port_mirror_add(ds, p->dp->index, mirror,
+					       ingress);
+		if (err) {
+			kfree(mall_tc_entry);
+			return err;
+		}
+
+		list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+	}
+
+	return 0;
+}
+
+static void dsa_slave_del_cls_matchall(struct net_device *dev,
+				       struct tc_cls_matchall_offload *cls)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_mall_tc_entry *mall_tc_entry;
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!ds->ops->port_mirror_del)
+		return;
+
+	mall_tc_entry = dsa_slave_mall_tc_entry_find(p, cls->cookie);
+	if (!mall_tc_entry)
+		return;
+
+	list_del(&mall_tc_entry->list);
+
+	switch (mall_tc_entry->type) {
+	case DSA_PORT_MALL_MIRROR:
+		ds->ops->port_mirror_del(ds, p->dp->index,
+					 &mall_tc_entry->mirror);
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	kfree(mall_tc_entry);
+}
+
+static int dsa_slave_setup_tc(struct net_device *dev, u32 handle,
+			      __be16 protocol, struct tc_to_netdev *tc)
+{
+	bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS);
+	int ret = -EOPNOTSUPP;
+
+	switch (tc->type) {
+	case TC_SETUP_MATCHALL:
+		switch (tc->cls_mall->command) {
+		case TC_CLSMATCHALL_REPLACE:
+			return dsa_slave_add_cls_matchall(dev, protocol,
+							  tc->cls_mall,
+							  ingress);
+		case TC_CLSMATCHALL_DESTROY:
+			dsa_slave_del_cls_matchall(dev, tc->cls_mall);
+			return 0;
+		}
+	default:
+		break;
+	}
+
+	return ret;
+}
+
 void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
 {
 	ops->get_sset_count = dsa_cpu_port_get_sset_count;
@@ -991,6 +1163,30 @@ void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
 	ops->get_strings = dsa_cpu_port_get_strings;
 }
 
+static int dsa_slave_get_rxnfc(struct net_device *dev,
+			       struct ethtool_rxnfc *nfc, u32 *rule_locs)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!ds->ops->get_rxnfc)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_rxnfc(ds, p->dp->index, nfc, rule_locs);
+}
+
+static int dsa_slave_set_rxnfc(struct net_device *dev,
+			       struct ethtool_rxnfc *nfc)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!ds->ops->set_rxnfc)
+		return -EOPNOTSUPP;
+
+	return ds->ops->set_rxnfc(ds, p->dp->index, nfc);
+}
+
 static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_drvinfo		= dsa_slave_get_drvinfo,
 	.get_regs_len		= dsa_slave_get_regs_len,
@@ -1009,6 +1205,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_eee		= dsa_slave_get_eee,
 	.get_link_ksettings	= dsa_slave_get_link_ksettings,
 	.set_link_ksettings	= dsa_slave_set_link_ksettings,
+	.get_rxnfc		= dsa_slave_get_rxnfc,
+	.set_rxnfc		= dsa_slave_set_rxnfc,
 };
 
 static const struct net_device_ops dsa_slave_netdev_ops = {
@@ -1031,6 +1229,8 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
 	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
 	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
+	.ndo_get_phys_port_name	= dsa_slave_get_phys_port_name,
+	.ndo_setup_tc		= dsa_slave_setup_tc,
 };
 
 static const struct switchdev_ops dsa_slave_switchdev_ops = {
@@ -1048,7 +1248,7 @@ static struct device_type dsa_type = {
 static void dsa_slave_adjust_link(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	unsigned int status_changed = 0;
 
 	if (p->old_link != p->phy->link) {
@@ -1067,7 +1267,7 @@ static void dsa_slave_adjust_link(struct net_device *dev)
 	}
 
 	if (ds->ops->adjust_link && status_changed)
-		ds->ops->adjust_link(ds, p->port, p->phy);
+		ds->ops->adjust_link(ds, p->dp->index, p->phy);
 
 	if (status_changed)
 		phy_print_status(p->phy);
@@ -1081,9 +1281,9 @@ static int dsa_slave_fixed_link_update(struct net_device *dev,
 
 	if (dev) {
 		p = netdev_priv(dev);
-		ds = p->parent;
+		ds = p->dp->ds;
 		if (ds->ops->fixed_link_update)
-			ds->ops->fixed_link_update(ds, p->port, status);
+			ds->ops->fixed_link_update(ds, p->dp->index, status);
 	}
 
 	return 0;
@@ -1094,7 +1294,7 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
 				 struct net_device *slave_dev,
 				 int addr)
 {
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 
 	p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr);
 	if (!p->phy) {
@@ -1112,13 +1312,13 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
 static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
 				struct net_device *slave_dev)
 {
-	struct dsa_switch *ds = p->parent;
+	struct dsa_switch *ds = p->dp->ds;
 	struct device_node *phy_dn, *port_dn;
 	bool phy_is_fixed = false;
 	u32 phy_flags = 0;
 	int mode, ret;
 
-	port_dn = ds->ports[p->port].dn;
+	port_dn = p->dp->dn;
 	mode = of_get_phy_mode(port_dn);
 	if (mode < 0)
 		mode = PHY_INTERFACE_MODE_NA;
@@ -1139,7 +1339,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
 	}
 
 	if (ds->ops->get_phy_flags)
-		phy_flags = ds->ops->get_phy_flags(ds, p->port);
+		phy_flags = ds->ops->get_phy_flags(ds, p->dp->index);
 
 	if (phy_dn) {
 		int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn);
@@ -1174,9 +1374,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
 	 * MDIO bus instead
 	 */
 	if (!p->phy) {
-		ret = dsa_slave_phy_connect(p, slave_dev, p->port);
+		ret = dsa_slave_phy_connect(p, slave_dev, p->dp->index);
 		if (ret) {
-			netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret);
+			netdev_err(slave_dev, "failed to connect to port %d: %d\n",
+				   p->dp->index, ret);
 			if (phy_is_fixed)
 				of_phy_deregister_fixed_link(port_dn);
 			return ret;
@@ -1246,7 +1447,8 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	if (slave_dev == NULL)
 		return -ENOMEM;
 
-	slave_dev->features = master->vlan_features;
+	slave_dev->features = master->vlan_features | NETIF_F_HW_TC;
+	slave_dev->hw_features |= NETIF_F_HW_TC;
 	slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
 	eth_hw_addr_inherit(slave_dev, master);
 	slave_dev->priv_flags |= IFF_NO_QUEUE;
@@ -1264,8 +1466,8 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	slave_dev->vlan_features = master->vlan_features;
 
 	p = netdev_priv(slave_dev);
-	p->parent = ds;
-	p->port = port;
+	p->dp = &ds->ports[port];
+	INIT_LIST_HEAD(&p->mall_tc_list);
 	p->xmit = dst->tag_ops->xmit;
 
 	p->old_pause = -1;
@@ -1298,10 +1500,9 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 void dsa_slave_destroy(struct net_device *slave_dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(slave_dev);
-	struct dsa_switch *ds = p->parent;
 	struct device_node *port_dn;
 
-	port_dn = ds->ports[p->port].dn;
+	port_dn = p->dp->dn;
 
 	netif_carrier_off(slave_dev);
 	if (p->phy) {
@@ -1319,46 +1520,52 @@ static bool dsa_slave_dev_check(struct net_device *dev)
 	return dev->netdev_ops == &dsa_slave_netdev_ops;
 }
 
-static int dsa_slave_port_upper_event(struct net_device *dev,
-				      unsigned long event, void *ptr)
+static int dsa_slave_changeupper(struct net_device *dev,
+				 struct netdev_notifier_changeupper_info *info)
 {
-	struct netdev_notifier_changeupper_info *info = ptr;
-	struct net_device *upper = info->upper_dev;
-	int err = 0;
+	int err = NOTIFY_DONE;
 
-	switch (event) {
-	case NETDEV_CHANGEUPPER:
-		if (netif_is_bridge_master(upper)) {
-			if (info->linking)
-				err = dsa_slave_bridge_port_join(dev, upper);
-			else
-				dsa_slave_bridge_port_leave(dev);
+	if (netif_is_bridge_master(info->upper_dev)) {
+		if (info->linking) {
+			err = dsa_slave_bridge_port_join(dev, info->upper_dev);
+			err = notifier_from_errno(err);
+		} else {
+			dsa_slave_bridge_port_leave(dev, info->upper_dev);
+			err = NOTIFY_OK;
 		}
-
-		break;
 	}
 
-	return notifier_from_errno(err);
+	return err;
 }
 
-static int dsa_slave_port_event(struct net_device *dev, unsigned long event,
-				void *ptr)
+static int dsa_slave_netdevice_event(struct notifier_block *nb,
+				     unsigned long event, void *ptr)
 {
-	switch (event) {
-	case NETDEV_CHANGEUPPER:
-		return dsa_slave_port_upper_event(dev, event, ptr);
-	}
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (dev->netdev_ops != &dsa_slave_netdev_ops)
+		return NOTIFY_DONE;
+
+	if (event == NETDEV_CHANGEUPPER)
+		return dsa_slave_changeupper(dev, ptr);
 
 	return NOTIFY_DONE;
 }
 
-int dsa_slave_netdevice_event(struct notifier_block *unused,
-			      unsigned long event, void *ptr)
+static struct notifier_block dsa_slave_nb __read_mostly = {
+	.notifier_call	= dsa_slave_netdevice_event,
+};
+
+int dsa_slave_register_notifier(void)
 {
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	return register_netdevice_notifier(&dsa_slave_nb);
+}
 
-	if (dsa_slave_dev_check(dev))
-		return dsa_slave_port_event(dev, event, ptr);
+void dsa_slave_unregister_notifier(void)
+{
+	int err;
 
-	return NOTIFY_DONE;
+	err = unregister_netdevice_notifier(&dsa_slave_nb);
+	if (err)
+		pr_err("DSA: failed to unregister slave notifier (%d)\n", err);
 }
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
new file mode 100644
index 000000000000..6456dacf9ae9
--- /dev/null
+++ b/net/dsa/switch.c
@@ -0,0 +1,85 @@
+/*
+ * Handling of a single switch chip, part of a switch fabric
+ *
+ * Copyright (c) 2017 Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <net/dsa.h>
+
+static int dsa_switch_bridge_join(struct dsa_switch *ds,
+				  struct dsa_notifier_bridge_info *info)
+{
+	if (ds->index == info->sw_index && ds->ops->port_bridge_join)
+		return ds->ops->port_bridge_join(ds, info->port, info->br);
+
+	if (ds->index != info->sw_index)
+		dev_dbg(ds->dev, "crosschip DSA port %d.%d bridged to %s\n",
+			info->sw_index, info->port, netdev_name(info->br));
+
+	return 0;
+}
+
+static int dsa_switch_bridge_leave(struct dsa_switch *ds,
+				   struct dsa_notifier_bridge_info *info)
+{
+	if (ds->index == info->sw_index && ds->ops->port_bridge_leave)
+		ds->ops->port_bridge_leave(ds, info->port, info->br);
+
+	if (ds->index != info->sw_index)
+		dev_dbg(ds->dev, "crosschip DSA port %d.%d unbridged from %s\n",
+			info->sw_index, info->port, netdev_name(info->br));
+
+	return 0;
+}
+
+static int dsa_switch_event(struct notifier_block *nb,
+			    unsigned long event, void *info)
+{
+	struct dsa_switch *ds = container_of(nb, struct dsa_switch, nb);
+	int err;
+
+	switch (event) {
+	case DSA_NOTIFIER_BRIDGE_JOIN:
+		err = dsa_switch_bridge_join(ds, info);
+		break;
+	case DSA_NOTIFIER_BRIDGE_LEAVE:
+		err = dsa_switch_bridge_leave(ds, info);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	/* Non-switchdev operations cannot be rolled back. If a DSA driver
+	 * returns an error during the chained call, switch chips may be in an
+	 * inconsistent state.
+	 */
+	if (err)
+		dev_dbg(ds->dev, "breaking chain for DSA event %lu (%d)\n",
+			event, err);
+
+	return notifier_from_errno(err);
+}
+
+int dsa_switch_register_notifier(struct dsa_switch *ds)
+{
+	ds->nb.notifier_call = dsa_switch_event;
+
+	return raw_notifier_chain_register(&ds->dst->nh, &ds->nb);
+}
+
+void dsa_switch_unregister_notifier(struct dsa_switch *ds)
+{
+	int err;
+
+	err = raw_notifier_chain_unregister(&ds->dst->nh, &ds->nb);
+	if (err)
+		dev_err(ds->dev, "failed to unregister notifier (%d)\n", err);
+}
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 21bffde6e4bf..5d925b6b2bb1 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -80,9 +80,9 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev
 			((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK);
 	brcm_tag[1] = 0;
 	brcm_tag[2] = 0;
-	if (p->port == 8)
+	if (p->dp->index == 8)
 		brcm_tag[2] = BRCM_IG_DSTMAP2_MASK;
-	brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK;
+	brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK;
 
 	return skb;
 
@@ -102,7 +102,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (unlikely(dst == NULL))
 		goto out_drop;
 
-	ds = dst->ds[0];
+	ds = dst->cpu_switch;
 
 	skb = skb_unshare(skb, GFP_ATOMIC);
 	if (skb == NULL)
@@ -121,13 +121,14 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	/* We should never see a reserved reason code without knowing how to
 	 * handle it
 	 */
-	WARN_ON(brcm_tag[2] & BRCM_EG_RC_RSVD);
+	if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD))
+		goto out_drop;
 
 	/* Locate which port this is coming from */
 	source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
 
 	/* Validate port against switch setup, either the port is totally */
-	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	/* Remove Broadcom tag and update checksum */
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index bce79ffe342b..72579ceea381 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -33,8 +33,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
 		 * Construct tagged FROM_CPU DSA tag from 802.1q tag.
 		 */
 		dsa_header = skb->data + 2 * ETH_ALEN;
-		dsa_header[0] = 0x60 | p->parent->index;
-		dsa_header[1] = p->port << 3;
+		dsa_header[0] = 0x60 | p->dp->ds->index;
+		dsa_header[1] = p->dp->index << 3;
 
 		/*
 		 * Move CFI field from byte 2 to byte 1.
@@ -54,8 +54,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
 		 * Construct untagged FROM_CPU DSA tag.
 		 */
 		dsa_header = skb->data + 2 * ETH_ALEN;
-		dsa_header[0] = 0x40 | p->parent->index;
-		dsa_header[1] = p->port << 3;
+		dsa_header[0] = 0x40 | p->dp->ds->index;
+		dsa_header[1] = p->dp->index << 3;
 		dsa_header[2] = 0x00;
 		dsa_header[3] = 0x00;
 	}
@@ -114,7 +114,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!ds)
 		goto out_drop;
 
-	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	/*
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 6c1720e88537..648c051817a1 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -42,8 +42,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
 		edsa_header[1] = ETH_P_EDSA & 0xff;
 		edsa_header[2] = 0x00;
 		edsa_header[3] = 0x00;
-		edsa_header[4] = 0x60 | p->parent->index;
-		edsa_header[5] = p->port << 3;
+		edsa_header[4] = 0x60 | p->dp->ds->index;
+		edsa_header[5] = p->dp->index << 3;
 
 		/*
 		 * Move CFI field from byte 6 to byte 5.
@@ -67,8 +67,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
 		edsa_header[1] = ETH_P_EDSA & 0xff;
 		edsa_header[2] = 0x00;
 		edsa_header[3] = 0x00;
-		edsa_header[4] = 0x40 | p->parent->index;
-		edsa_header[5] = p->port << 3;
+		edsa_header[4] = 0x40 | p->dp->ds->index;
+		edsa_header[5] = p->dp->index << 3;
 		edsa_header[6] = 0x00;
 		edsa_header[7] = 0x00;
 	}
@@ -127,7 +127,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!ds)
 		goto out_drop;
 
-	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	/*
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 0c90cacee7aa..30240f343aea 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -54,7 +54,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Set the version field, and set destination port information */
 	hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
 		QCA_HDR_XMIT_FROM_CPU |
-		BIT(p->port);
+		BIT(p->dp->index);
 
 	*phdr = htons(hdr);
 
@@ -104,7 +104,7 @@ static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	/* This protocol doesn't support cascading multiple switches so it's
 	 * safe to assume the switch is first in the tree
 	 */
-	ds = dst->ds[0];
+	ds = dst->cpu_switch;
 	if (!ds)
 		goto out_drop;
 
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 5e3903eb1afa..26f977176978 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -50,7 +50,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	trailer = skb_put(nskb, 4);
 	trailer[0] = 0x80;
-	trailer[1] = 1 << p->port;
+	trailer[1] = 1 << p->dp->index;
 	trailer[2] = 0x10;
 	trailer[3] = 0x00;
 
@@ -67,7 +67,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	if (unlikely(dst == NULL))
 		goto out_drop;
-	ds = dst->ds[0];
+	ds = dst->cpu_switch;
 
 	skb = skb_unshare(skb, GFP_ATOMIC);
 	if (skb == NULL)
@@ -82,7 +82,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto out_drop;
 
 	source_port = trailer[1] & 7;
-	if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
 		goto out_drop;
 
 	pskb_trim_rcsum(skb, skb->len - 4);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 8c5a479681ca..1446810047f5 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -356,6 +356,7 @@ void ether_setup(struct net_device *dev)
 	dev->header_ops		= &eth_header_ops;
 	dev->type		= ARPHRD_ETHER;
 	dev->hard_header_len 	= ETH_HLEN;
+	dev->min_header_len	= ETH_HLEN;
 	dev->mtu		= ETH_DATA_LEN;
 	dev->min_mtu		= ETH_MIN_MTU;
 	dev->max_mtu		= ETH_DATA_LEN;
@@ -392,6 +393,34 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
 }
 EXPORT_SYMBOL(alloc_etherdev_mqs);
 
+static void devm_free_netdev(struct device *dev, void *res)
+{
+	free_netdev(*(struct net_device **)res);
+}
+
+struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
+					   unsigned int txqs, unsigned int rxqs)
+{
+	struct net_device **dr;
+	struct net_device *netdev;
+
+	dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL);
+	if (!dr)
+		return NULL;
+
+	netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs);
+	if (!netdev) {
+		devres_free(dr);
+		return NULL;
+	}
+
+	*dr = netdev;
+	devres_add(dev, dr);
+
+	return netdev;
+}
+EXPORT_SYMBOL(devm_alloc_etherdev_mqs);
+
 ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
 {
 	return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr);
@@ -446,7 +475,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index fc65b145f6e7..c73160fb11e7 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -395,7 +395,7 @@ static struct device_type hsr_type = {
 
 void hsr_dev_setup(struct net_device *dev)
 {
-	random_ether_addr(dev->dev_addr);
+	eth_hw_addr_random(dev);
 
 	ether_setup(dev);
 	dev->min_mtu = 0;
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index f5b60388d02f..56080da4aa77 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -12,6 +12,7 @@
 #include "hsr_slave.h"
 #include <linux/etherdevice.h>
 #include <linux/if_arp.h>
+#include <linux/if_vlan.h>
 #include "hsr_main.h"
 #include "hsr_device.h"
 #include "hsr_forward.h"
@@ -81,7 +82,7 @@ static int hsr_check_dev_ok(struct net_device *dev)
 		return -EINVAL;
 	}
 
-	if (dev->priv_flags & IFF_802_1Q_VLAN) {
+	if (is_vlan_dev(dev)) {
 		netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n");
 		return -EINVAL;
 	}
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index e0bd013a1e5e..eedba7670b51 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -279,7 +279,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
 
 	if (size > mtu) {
-		pr_debug("size = %Zu, mtu = %u\n", size, mtu);
+		pr_debug("size = %zu, mtu = %u\n", size, mtu);
 		err = -EMSGSIZE;
 		goto out_dev;
 	}
@@ -645,7 +645,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
 
 	if (size > mtu) {
-		pr_debug("size = %Zu, mtu = %u\n", size, mtu);
+		pr_debug("size = %zu, mtu = %u\n", size, mtu);
 		err = -EMSGSIZE;
 		goto out_dev;
 	}
diff --git a/net/ife/Kconfig b/net/ife/Kconfig
new file mode 100644
index 000000000000..31e48b652c7c
--- /dev/null
+++ b/net/ife/Kconfig
@@ -0,0 +1,16 @@
+#
+# IFE subsystem configuration
+#
+
+menuconfig NET_IFE
+	depends on NET
+        tristate "Inter-FE based on IETF ForCES InterFE LFB"
+	default n
+	help
+	  Say Y here to add support of IFE encapsulation protocol
+	  For details refer to netdev01 paper:
+	  "Distributing Linux Traffic Control Classifier-Action Subsystem"
+	   Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+
+	  To compile this support as a module, choose M here: the module will
+	  be called ife.
diff --git a/net/ife/Makefile b/net/ife/Makefile
new file mode 100644
index 000000000000..2a90d97746cc
--- /dev/null
+++ b/net/ife/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the IFE encapsulation protocol
+#
+
+obj-$(CONFIG_NET_IFE) += ife.o
diff --git a/net/ife/ife.c b/net/ife/ife.c
new file mode 100644
index 000000000000..f360341c72eb
--- /dev/null
+++ b/net/ife/ife.c
@@ -0,0 +1,142 @@
+/*
+ * net/ife/ife.c - Inter-FE protocol based on ForCES WG InterFE LFB
+ * Copyright (c) 2015 Jamal Hadi Salim <jhs@mojatatu.com>
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Refer to: draft-ietf-forces-interfelfb-03 and netdev01 paper:
+ * "Distributing Linux Traffic Control Classifier-Action Subsystem"
+ * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/etherdevice.h>
+#include <net/ife.h>
+
+struct ifeheadr {
+	__be16 metalen;
+	u8 tlv_data[];
+};
+
+void *ife_encode(struct sk_buff *skb, u16 metalen)
+{
+	/* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
+	 * where ORIGDATA = original ethernet header ...
+	 */
+	int hdrm = metalen + IFE_METAHDRLEN;
+	int total_push = hdrm + skb->dev->hard_header_len;
+	struct ifeheadr *ifehdr;
+	struct ethhdr *iethh;	/* inner ether header */
+	int skboff = 0;
+	int err;
+
+	err = skb_cow_head(skb, total_push);
+	if (unlikely(err))
+		return NULL;
+
+	iethh = (struct ethhdr *) skb->data;
+
+	__skb_push(skb, total_push);
+	memcpy(skb->data, iethh, skb->dev->hard_header_len);
+	skb_reset_mac_header(skb);
+	skboff += skb->dev->hard_header_len;
+
+	/* total metadata length */
+	ifehdr = (struct ifeheadr *) (skb->data + skboff);
+	metalen += IFE_METAHDRLEN;
+	ifehdr->metalen = htons(metalen);
+
+	return ifehdr->tlv_data;
+}
+EXPORT_SYMBOL_GPL(ife_encode);
+
+void *ife_decode(struct sk_buff *skb, u16 *metalen)
+{
+	struct ifeheadr *ifehdr;
+	int total_pull;
+	u16 ifehdrln;
+
+	ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
+	ifehdrln = ntohs(ifehdr->metalen);
+	total_pull = skb->dev->hard_header_len + ifehdrln;
+
+	if (unlikely(ifehdrln < 2))
+		return NULL;
+
+	if (unlikely(!pskb_may_pull(skb, total_pull)))
+		return NULL;
+
+	skb_set_mac_header(skb, total_pull);
+	__skb_pull(skb, total_pull);
+	*metalen = ifehdrln - IFE_METAHDRLEN;
+
+	return &ifehdr->tlv_data;
+}
+EXPORT_SYMBOL_GPL(ife_decode);
+
+struct meta_tlvhdr {
+	__be16 type;
+	__be16 len;
+};
+
+/* Caller takes care of presenting data in network order
+ */
+void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen)
+{
+	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+
+	*dlen = ntohs(tlv->len) - NLA_HDRLEN;
+	*attrtype = ntohs(tlv->type);
+
+	if (totlen)
+		*totlen = nla_total_size(*dlen);
+
+	return skbdata + sizeof(struct meta_tlvhdr);
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_decode);
+
+void *ife_tlv_meta_next(void *skbdata)
+{
+	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+	u16 tlvlen = ntohs(tlv->len);
+
+	tlvlen = NLA_ALIGN(tlvlen);
+
+	return skbdata + tlvlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_next);
+
+/* Caller takes care of presenting data in network order
+ */
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
+{
+	__be32 *tlv = (__be32 *) (skbdata);
+	u16 totlen = nla_total_size(dlen);	/*alignment + hdr */
+	char *dptr = (char *) tlv + NLA_HDRLEN;
+	u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
+
+	*tlv = htonl(htlv);
+	memset(dptr, 0, totlen - NLA_HDRLEN);
+	memcpy(dptr, dval, dlen);
+
+	return totlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+
+MODULE_AUTHOR("Jamal Hadi Salim <jhs@mojatatu.com>");
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("Inter-FE LFB action");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6e7baaf814c6..91a2557942fa 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -187,6 +187,7 @@ config NET_IPGRE_DEMUX
 config NET_IP_TUNNEL
 	tristate
 	select DST_CACHE
+	select GRO_CELLS
 	default n
 
 config NET_IPGRE
@@ -360,6 +361,19 @@ config INET_ESP
 
 	  If unsure, say Y.
 
+config INET_ESP_OFFLOAD
+	tristate "IP: ESP transformation offload"
+	depends on INET_ESP
+	select XFRM_OFFLOAD
+	default n
+	---help---
+	  Support for ESP transformation offload. This makes sense
+	  only if this system really does IPsec and want to do it
+	  with high throughput. A typical desktop system does not
+	  need it, even if it does IPsec.
+
+	  If unsure, say N.
+
 config INET_IPCOMP
 	tristate "IP: IPComp transformation"
 	select INET_XFRM_TUNNEL
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 48af58a5686e..c6d4238ff94a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_NET_IPVTI) += ip_vti.o
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_AH) += ah4.o
 obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
 obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f75069883f2b..602d40f43687 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -479,7 +479,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	snum = ntohs(addr->sin_port);
 	err = -EACCES;
-	if (snum && snum < PROT_SOCK &&
+	if (snum && snum < inet_prot_sock(net) &&
 	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
 		goto out;
 
@@ -570,19 +570,30 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
  *	TCP 'magic' in here.
  */
 int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
-			  int addr_len, int flags)
+			  int addr_len, int flags, int is_sendmsg)
 {
 	struct sock *sk = sock->sk;
 	int err;
 	long timeo;
 
-	if (addr_len < sizeof(uaddr->sa_family))
-		return -EINVAL;
+	/*
+	 * uaddr can be NULL and addr_len can be 0 if:
+	 * sk is a TCP fastopen active socket and
+	 * TCP_FASTOPEN_CONNECT sockopt is set and
+	 * we already have a valid cookie for this socket.
+	 * In this case, user can call write() after connect().
+	 * write() will invoke tcp_sendmsg_fastopen() which calls
+	 * __inet_stream_connect().
+	 */
+	if (uaddr) {
+		if (addr_len < sizeof(uaddr->sa_family))
+			return -EINVAL;
 
-	if (uaddr->sa_family == AF_UNSPEC) {
-		err = sk->sk_prot->disconnect(sk, flags);
-		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
-		goto out;
+		if (uaddr->sa_family == AF_UNSPEC) {
+			err = sk->sk_prot->disconnect(sk, flags);
+			sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+			goto out;
+		}
 	}
 
 	switch (sock->state) {
@@ -593,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		err = -EISCONN;
 		goto out;
 	case SS_CONNECTING:
-		err = -EALREADY;
+		if (inet_sk(sk)->defer_connect)
+			err = is_sendmsg ? -EINPROGRESS : -EISCONN;
+		else
+			err = -EALREADY;
 		/* Fall out of switch with err, set for this state */
 		break;
 	case SS_UNCONNECTED:
@@ -607,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 
 		sock->state = SS_CONNECTING;
 
+		if (!err && inet_sk(sk)->defer_connect)
+			goto out;
+
 		/* Just entered SS_CONNECTING state; the only
 		 * difference is that return value in non-blocking
 		 * case is EINPROGRESS, rather than EALREADY.
@@ -662,7 +679,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int err;
 
 	lock_sock(sock->sk);
-	err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+	err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
 	release_sock(sock->sk);
 	return err;
 }
@@ -1406,7 +1423,7 @@ out_unlock:
 	rcu_read_unlock();
 
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
@@ -1700,6 +1717,9 @@ static __net_init int inet_init_net(struct net *net)
 	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
 	net->ipv4.sysctl_ip_dynaddr = 0;
 	net->ipv4.sysctl_ip_early_demux = 1;
+#ifdef CONFIG_SYSCTL
+	net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
+#endif
 
 	return 0;
 }
@@ -1831,8 +1851,6 @@ static int __init inet_init(void)
 
 	ip_init();
 
-	tcp_v4_init();
-
 	/* Setup TCP slab cache for open requests. */
 	tcp_init();
 
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index f2a71025a770..22377c8ff14b 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -270,6 +270,9 @@ static void ah_input_done(struct crypto_async_request *base, int err)
 	int ihl = ip_hdrlen(skb);
 	int ah_hlen = (ah->hdrlen + 2) << 2;
 
+	if (err)
+		goto out;
+
 	work_iph = AH_SKB_CB(skb)->tmp;
 	auth_data = ah_tmp_auth(work_iph, ihl);
 	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 89a8cac4726a..51b27ae09fbd 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1263,7 +1263,7 @@ void __init arp_init(void)
 /*
  *	ax25 -> ASCII conversion
  */
-static char *ax2asc2(ax25_address *a, char *buf)
+static void ax2asc2(ax25_address *a, char *buf)
 {
 	char c, *s;
 	int n;
@@ -1285,10 +1285,10 @@ static char *ax2asc2(ax25_address *a, char *buf)
 	*s++ = n + '0';
 	*s++ = '\0';
 
-	if (*buf == '\0' || *buf == '-')
-		return "*";
-
-	return buf;
+	if (*buf == '\0' || *buf == '-') {
+		buf[0] = '*';
+		buf[1] = '\0';
+	}
 }
 #endif /* CONFIG_AX25 */
 
@@ -1322,7 +1322,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
 	}
 #endif
 	sprintf(tbuf, "%pI4", n->primary_key);
-	seq_printf(seq, "%-16s 0x%-10x0x%-10x%s     *        %s\n",
+	seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s     *        %s\n",
 		   tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
 	read_unlock(&n->lock);
 }
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 72d6f056d863..ae206163c273 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1587,6 +1587,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
 				goto validate_return_locked;
 			}
 
+		if (opt_iter + 1 == opt_len) {
+			err_offset = opt_iter;
+			goto validate_return_locked;
+		}
 		tag_len = tag[1];
 		if (tag_len > (opt_len - opt_iter)) {
 			err_offset = opt_iter + 1;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 4cd2ee8857d2..5d367b7ff542 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -65,8 +65,6 @@
 #include <net/net_namespace.h>
 #include <net/addrconf.h>
 
-#include "fib_lookup.h"
-
 static struct ipv4_devconf ipv4_devconf = {
 	.data = {
 		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 20fb25e3027b..b1e24446e297 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -18,6 +18,8 @@
 #include <net/protocol.h>
 #include <net/udp.h>
 
+#include <linux/highmem.h>
+
 struct esp_skb_cb {
 	struct xfrm_skb_cb xfrm;
 	void *tmp;
@@ -92,11 +94,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
 			     __alignof__(struct scatterlist));
 }
 
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+{
+	struct esp_output_extra *extra = esp_tmp_extra(tmp);
+	struct crypto_aead *aead = x->data;
+	int extralen = 0;
+	u8 *iv;
+	struct aead_request *req;
+	struct scatterlist *sg;
+
+	if (x->props.flags & XFRM_STATE_ESN)
+		extralen += sizeof(*extra);
+
+	extra = esp_tmp_extra(tmp);
+	iv = esp_tmp_iv(aead, tmp, extralen);
+	req = esp_tmp_req(aead, iv);
+
+	/* Unref skb_frag_pages in the src scatterlist if necessary.
+	 * Skip the first sg which comes from skb->data.
+	 */
+	if (req->src != req->dst)
+		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+			put_page(sg_page(sg));
+}
+
 static void esp_output_done(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
+	void *tmp;
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
 
-	kfree(ESP_SKB_CB(skb)->tmp);
+	tmp = ESP_SKB_CB(skb)->tmp;
+	esp_ssg_unref(x, tmp);
+	kfree(tmp);
 	xfrm_output_resume(skb, err);
 }
 
@@ -120,6 +151,29 @@ static void esp_output_restore_header(struct sk_buff *skb)
 				sizeof(__be32));
 }
 
+static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
+					       struct ip_esp_hdr *esph,
+					       struct esp_output_extra *extra)
+{
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accomodate the high bits.  We will move it back after
+	 * encryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		extra->esphoff = (unsigned char *)esph -
+				 skb_transport_header(skb);
+		esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+		extra->seqhi = esph->spi;
+		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+	}
+
+	esph->spi = x->id.spi;
+
+	return esph;
+}
+
 static void esp_output_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -128,18 +182,36 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
 	esp_output_done(base, err);
 }
 
+static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
+{
+	/* Fill padding... */
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
+	do {
+		int i;
+		for (i = 0; i < plen - 2; i++)
+			tail[i] = i + 1;
+	} while (0);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = proto;
+}
+
 static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 {
-	int err;
 	struct esp_output_extra *extra;
+	int err = -ENOMEM;
 	struct ip_esp_hdr *esph;
 	struct crypto_aead *aead;
 	struct aead_request *req;
-	struct scatterlist *sg;
+	struct scatterlist *sg, *dsg;
 	struct sk_buff *trailer;
+	struct page *page;
 	void *tmp;
 	u8 *iv;
 	u8 *tail;
+	u8 *vaddr;
 	int blksize;
 	int clen;
 	int alen;
@@ -149,7 +221,9 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	int nfrags;
 	int assoclen;
 	int extralen;
+	int tailen;
 	__be64 seqno;
+	__u8 proto = *skb_mac_header(skb);
 
 	/* skb is pure payload to encrypt */
 
@@ -169,12 +243,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
 	clen = ALIGN(skb->len + 2 + tfclen, blksize);
 	plen = clen - skb->len - tfclen;
-
-	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
-	if (err < 0)
-		goto error;
-	nfrags = err;
-
+	tailen = tfclen + plen + alen;
 	assoclen = sizeof(*esph);
 	extralen = 0;
 
@@ -183,35 +252,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += sizeof(__be32);
 	}
 
-	tmp = esp_alloc_tmp(aead, nfrags, extralen);
-	if (!tmp) {
-		err = -ENOMEM;
-		goto error;
-	}
-
-	extra = esp_tmp_extra(tmp);
-	iv = esp_tmp_iv(aead, tmp, extralen);
-	req = esp_tmp_req(aead, iv);
-	sg = esp_req_sg(aead, req);
-
-	/* Fill padding... */
-	tail = skb_tail_pointer(trailer);
-	if (tfclen) {
-		memset(tail, 0, tfclen);
-		tail += tfclen;
-	}
-	do {
-		int i;
-		for (i = 0; i < plen - 2; i++)
-			tail[i] = i + 1;
-	} while (0);
-	tail[plen - 2] = plen - 2;
-	tail[plen - 1] = *skb_mac_header(skb);
-	pskb_put(skb, trailer, clen - skb->len + alen);
-
-	skb_push(skb, -skb_network_offset(skb));
-	esph = ip_esp_hdr(skb);
 	*skb_mac_header(skb) = IPPROTO_ESP;
+	esph = ip_esp_hdr(skb);
 
 	/* this is non-NULL only with UDP Encapsulation */
 	if (x->encap) {
@@ -230,7 +272,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		uh = (struct udphdr *)esph;
 		uh->source = sport;
 		uh->dest = dport;
-		uh->len = htons(skb->len - skb_transport_offset(skb));
+		uh->len = htons(skb->len + tailen
+				- skb_transport_offset(skb));
 		uh->check = 0;
 
 		switch (encap_type) {
@@ -248,31 +291,148 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		*skb_mac_header(skb) = IPPROTO_UDP;
 	}
 
-	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+	if (!skb_cloned(skb)) {
+		if (tailen <= skb_availroom(skb)) {
+			nfrags = 1;
+			trailer = skb;
+			tail = skb_tail_pointer(trailer);
 
-	aead_request_set_callback(req, 0, esp_output_done, skb);
+			goto skip_cow;
+		} else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+			   && !skb_has_frag_list(skb)) {
+			int allocsize;
+			struct sock *sk = skb->sk;
+			struct page_frag *pfrag = &x->xfrag;
 
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * encryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		extra->esphoff = (unsigned char *)esph -
-				 skb_transport_header(skb);
-		esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
-		extra->seqhi = esph->spi;
-		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
-		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+			allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+			spin_lock_bh(&x->lock);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				goto cow;
+			}
+
+			page = pfrag->page;
+			get_page(page);
+
+			vaddr = kmap_atomic(page);
+
+			tail = vaddr + pfrag->offset;
+
+			esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+			kunmap_atomic(vaddr);
+
+			nfrags = skb_shinfo(skb)->nr_frags;
+
+			__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+					     tailen);
+			skb_shinfo(skb)->nr_frags = ++nfrags;
+
+			pfrag->offset = pfrag->offset + allocsize;
+			nfrags++;
+
+			skb->len += tailen;
+			skb->data_len += tailen;
+			skb->truesize += tailen;
+			if (sk)
+				atomic_add(tailen, &sk->sk_wmem_alloc);
+
+			skb_push(skb, -skb_network_offset(skb));
+
+			esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+			esph->spi = x->id.spi;
+
+			tmp = esp_alloc_tmp(aead, nfrags + 2, extralen);
+			if (!tmp) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+
+			extra = esp_tmp_extra(tmp);
+			iv = esp_tmp_iv(aead, tmp, extralen);
+			req = esp_tmp_req(aead, iv);
+			sg = esp_req_sg(aead, req);
+			dsg = &sg[nfrags];
+
+			esph = esp_output_set_extra(skb, esph, extra);
+
+			sg_init_table(sg, nfrags);
+			skb_to_sgvec(skb, sg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+
+			skb_shinfo(skb)->nr_frags = 1;
+
+			page = pfrag->page;
+			get_page(page);
+			/* replace page frags in skb with new page */
+			__skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+			pfrag->offset = pfrag->offset + allocsize;
+
+			sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+			skb_to_sgvec(skb, dsg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			spin_unlock_bh(&x->lock);
+
+			goto skip_cow2;
+		}
 	}
 
+cow:
+	err = skb_cow_data(skb, tailen, &trailer);
+	if (err < 0)
+		goto error;
+	nfrags = err;
+	tail = skb_tail_pointer(trailer);
+	esph = ip_esp_hdr(skb);
+
+skip_cow:
+	esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+	pskb_put(skb, trailer, clen - skb->len + alen);
+	skb_push(skb, -skb_network_offset(skb));
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 	esph->spi = x->id.spi;
 
+	tmp = esp_alloc_tmp(aead, nfrags, extralen);
+	if (!tmp) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	extra = esp_tmp_extra(tmp);
+	iv = esp_tmp_iv(aead, tmp, extralen);
+	req = esp_tmp_req(aead, iv);
+	sg = esp_req_sg(aead, req);
+	dsg = sg;
+
+	esph = esp_output_set_extra(skb, esph, extra);
+
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg,
 		     (unsigned char *)esph - skb->data,
 		     assoclen + ivlen + clen + alen);
 
-	aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+skip_cow2:
+	if ((x->props.flags & XFRM_STATE_ESN))
+		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+	else
+		aead_request_set_callback(req, 0, esp_output_done, skb);
+
+	aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
 	aead_request_set_ad(req, assoclen);
 
 	seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -298,6 +458,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 			esp_output_restore_header(skb);
 	}
 
+	if (sg != dsg)
+		esp_ssg_unref(x, tmp);
 	kfree(tmp);
 
 error:
@@ -401,6 +563,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
 	__skb_pull(skb, 4);
 }
 
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accomodate the high bits.  We will move it back after
+	 * decryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		esph = (void *)skb_push(skb, 4);
+		*seqhi = esph->spi;
+		esph->spi = esph->seq_no;
+		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+	}
+}
+
 static void esp_input_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -437,12 +616,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	if (elen <= 0)
 		goto out;
 
-	err = skb_cow_data(skb, 0, &trailer);
-	if (err < 0)
-		goto out;
-
-	nfrags = err;
-
 	assoclen = sizeof(*esph);
 	seqhilen = 0;
 
@@ -451,6 +624,26 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += seqhilen;
 	}
 
+	if (!skb_cloned(skb)) {
+		if (!skb_is_nonlinear(skb)) {
+			nfrags = 1;
+
+			goto skip_cow;
+		} else if (!skb_has_frag_list(skb)) {
+			nfrags = skb_shinfo(skb)->nr_frags;
+			nfrags++;
+
+			goto skip_cow;
+		}
+	}
+
+	err = skb_cow_data(skb, 0, &trailer);
+	if (err < 0)
+		goto out;
+
+	nfrags = err;
+
+skip_cow:
 	err = -ENOMEM;
 	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
 	if (!tmp)
@@ -462,26 +655,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	req = esp_tmp_req(aead, iv);
 	sg = esp_req_sg(aead, req);
 
-	skb->ip_summed = CHECKSUM_NONE;
+	esp_input_set_header(skb, seqhi);
 
-	esph = (struct ip_esp_hdr *)skb->data;
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	aead_request_set_callback(req, 0, esp_input_done, skb);
+	skb->ip_summed = CHECKSUM_NONE;
 
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * decryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		esph = (void *)skb_push(skb, 4);
-		*seqhi = esph->spi;
-		esph->spi = esph->seq_no;
-		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+	if ((x->props.flags & XFRM_STATE_ESN))
 		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
-	}
-
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	else
+		aead_request_set_callback(req, 0, esp_input_done, skb);
 
 	aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
 	aead_request_set_ad(req, assoclen);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
new file mode 100644
index 000000000000..1de442632406
--- /dev/null
+++ b/net/ipv4/esp4_offload.c
@@ -0,0 +1,106 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/udp.h>
+
+static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
+					 struct sk_buff *skb)
+{
+	int offset = skb_gro_offset(skb);
+	struct xfrm_offload *xo;
+	struct xfrm_state *x;
+	__be32 seq;
+	__be32 spi;
+	int err;
+
+	skb_pull(skb, offset);
+
+	if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
+		goto out;
+
+	err = secpath_set(skb);
+	if (err)
+		goto out;
+
+	if (skb->sp->len == XFRM_MAX_DEPTH)
+		goto out;
+
+	x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
+			      (xfrm_address_t *)&ip_hdr(skb)->daddr,
+			      spi, IPPROTO_ESP, AF_INET);
+	if (!x)
+		goto out;
+
+	skb->sp->xvec[skb->sp->len++] = x;
+	skb->sp->olen++;
+
+	xo = xfrm_offload(skb);
+	if (!xo) {
+		xfrm_state_put(x);
+		goto out;
+	}
+	xo->flags |= XFRM_GRO;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+	XFRM_SPI_SKB_CB(skb)->seq = seq;
+
+	/* We don't need to handle errors from xfrm_input, it does all
+	 * the error handling and frees the resources on error. */
+	xfrm_input(skb, IPPROTO_ESP, spi, -2);
+
+	return ERR_PTR(-EINPROGRESS);
+out:
+	skb_push(skb, offset);
+	NAPI_GRO_CB(skb)->same_flow = 0;
+	NAPI_GRO_CB(skb)->flush = 1;
+
+	return NULL;
+}
+
+static const struct net_offload esp4_offload = {
+	.callbacks = {
+		.gro_receive = esp4_gro_receive,
+	},
+};
+
+static int __init esp4_offload_init(void)
+{
+	return inet_add_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+static void __exit esp4_offload_exit(void)
+{
+	inet_del_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+module_init(esp4_offload_init);
+module_exit(esp4_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7db2ad2e82d3..b39a791f6756 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -319,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	int ret, no_addr;
 	struct fib_result res;
 	struct flowi4 fl4;
-	struct net *net;
+	struct net *net = dev_net(dev);
 	bool dev_match;
 
 	fl4.flowi4_oif = 0;
@@ -332,6 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 	fl4.flowi4_tun_key.tun_id = 0;
 	fl4.flowi4_flags = 0;
+	fl4.flowi4_uid = sock_net_uid(net, NULL);
 
 	no_addr = idev->ifa_list == NULL;
 
@@ -339,13 +340,12 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 
 	trace_fib_validate_source(dev, &fl4);
 
-	net = dev_net(dev);
 	if (fib_lookup(net, &fl4, &res, 0))
 		goto last_resort;
 	if (res.type != RTN_UNICAST &&
 	    (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
 		goto e_inval;
-	if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
+	if (!rpf && !fib_num_tclassid_users(net) &&
 	    (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
 		goto last_resort;
 	fib_combine_itag(itag, &res);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 9a375b908d01..317026a39cfa 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -471,7 +471,6 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 		       int remaining, struct fib_config *cfg)
 {
-	struct net *net = cfg->fc_nlinfo.nl_net;
 	int ret;
 
 	change_nexthops(fi) {
@@ -503,16 +502,14 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			nla = nla_find(attrs, attrlen, RTA_ENCAP);
 			if (nla) {
 				struct lwtunnel_state *lwtstate;
-				struct net_device *dev = NULL;
 				struct nlattr *nla_entype;
 
 				nla_entype = nla_find(attrs, attrlen,
 						      RTA_ENCAP_TYPE);
 				if (!nla_entype)
 					goto err_inval;
-				if (cfg->fc_oif)
-					dev = __dev_get_by_index(net, cfg->fc_oif);
-				ret = lwtunnel_build_state(dev, nla_get_u16(
+
+				ret = lwtunnel_build_state(nla_get_u16(
 							   nla_entype),
 							   nla,  AF_INET, cfg,
 							   &lwtstate);
@@ -597,21 +594,18 @@ static inline void fib_add_weight(struct fib_info *fi,
 
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
-static int fib_encap_match(struct net *net, u16 encap_type,
+static int fib_encap_match(u16 encap_type,
 			   struct nlattr *encap,
-			   int oif, const struct fib_nh *nh,
+			   const struct fib_nh *nh,
 			   const struct fib_config *cfg)
 {
 	struct lwtunnel_state *lwtstate;
-	struct net_device *dev = NULL;
 	int ret, result = 0;
 
 	if (encap_type == LWTUNNEL_ENCAP_NONE)
 		return 0;
 
-	if (oif)
-		dev = __dev_get_by_index(net, oif);
-	ret = lwtunnel_build_state(dev, encap_type, encap,
+	ret = lwtunnel_build_state(encap_type, encap,
 				   AF_INET, cfg, &lwtstate);
 	if (!ret) {
 		result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
@@ -623,7 +617,6 @@ static int fib_encap_match(struct net *net, u16 encap_type,
 
 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 {
-	struct net *net = cfg->fc_nlinfo.nl_net;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	struct rtnexthop *rtnh;
 	int remaining;
@@ -634,9 +627,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 
 	if (cfg->fc_oif || cfg->fc_gw) {
 		if (cfg->fc_encap) {
-			if (fib_encap_match(net, cfg->fc_encap_type,
-					    cfg->fc_encap, cfg->fc_oif,
-					    fi->fib_nh, cfg))
+			if (fib_encap_match(cfg->fc_encap_type,
+					    cfg->fc_encap, fi->fib_nh, cfg))
 			    return 1;
 		}
 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
@@ -1093,13 +1085,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 
 		if (cfg->fc_encap) {
 			struct lwtunnel_state *lwtstate;
-			struct net_device *dev = NULL;
 
 			if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
 				goto err_inval;
-			if (cfg->fc_oif)
-				dev = __dev_get_by_index(net, cfg->fc_oif);
-			err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+			err = lwtunnel_build_state(cfg->fc_encap_type,
 						   cfg->fc_encap, AF_INET, cfg,
 						   &lwtstate);
 			if (err)
@@ -1366,6 +1355,36 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
 	return ret;
 }
 
+static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
+				 enum fib_event_type event_type)
+{
+	struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
+	struct fib_nh_notifier_info info = {
+		.fib_nh = fib_nh,
+	};
+
+	switch (event_type) {
+	case FIB_EVENT_NH_ADD:
+		if (fib_nh->nh_flags & RTNH_F_DEAD)
+			break;
+		if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+		    fib_nh->nh_flags & RTNH_F_LINKDOWN)
+			break;
+		return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
+					  &info.info);
+	case FIB_EVENT_NH_DEL:
+		if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+		     fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
+		    (fib_nh->nh_flags & RTNH_F_DEAD))
+			return call_fib_notifiers(dev_net(fib_nh->nh_dev),
+						  event_type, &info.info);
+	default:
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
 /* Event              force Flags           Description
  * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
  * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
@@ -1407,6 +1426,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
 					nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
 					break;
 				}
+				call_fib_nh_notifiers(nexthop_nh,
+						      FIB_EVENT_NH_DEL);
 				dead++;
 			}
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1437,7 +1458,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
 }
 
 /* Must be invoked inside of an RCU protected region.  */
-void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
+static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 {
 	struct fib_info *fi = NULL, *last_resort = NULL;
 	struct hlist_head *fa_head = res->fa_head;
@@ -1561,6 +1582,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
 				continue;
 			alive++;
 			nexthop_nh->nh_flags &= ~nh_flags;
+			call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
 		} endfor_nexthops(fi)
 
 		if (alive > 0) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2919d1a10cfd..2f0d8233950f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -124,7 +124,7 @@ static void fib_notify(struct net *net, struct notifier_block *nb,
 static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
 				   enum fib_event_type event_type, u32 dst,
 				   int dst_len, struct fib_info *fi,
-				   u8 tos, u8 type, u32 tb_id, u32 nlflags)
+				   u8 tos, u8 type, u32 tb_id)
 {
 	struct fib_entry_notifier_info info = {
 		.dst = dst,
@@ -133,7 +133,6 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
 		.tos = tos,
 		.type = type,
 		.tb_id = tb_id,
-		.nlflags = nlflags,
 	};
 	return call_fib_notifier(nb, net, event_type, &info.info);
 }
@@ -197,7 +196,7 @@ int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 static int call_fib_entry_notifiers(struct net *net,
 				    enum fib_event_type event_type, u32 dst,
 				    int dst_len, struct fib_info *fi,
-				    u8 tos, u8 type, u32 tb_id, u32 nlflags)
+				    u8 tos, u8 type, u32 tb_id)
 {
 	struct fib_entry_notifier_info info = {
 		.dst = dst,
@@ -206,7 +205,6 @@ static int call_fib_entry_notifiers(struct net *net,
 		.tos = tos,
 		.type = type,
 		.tb_id = tb_id,
-		.nlflags = nlflags,
 	};
 	return call_fib_notifiers(net, event_type, &info.info);
 }
@@ -1198,6 +1196,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
 int fib_table_insert(struct net *net, struct fib_table *tb,
 		     struct fib_config *cfg)
 {
+	enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct fib_alias *fa, *new_fa;
 	struct key_vector *l, *tp;
@@ -1295,6 +1294,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 			new_fa->tb_id = tb->tb_id;
 			new_fa->fa_default = -1;
 
+			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
+						 key, plen, fi,
+						 new_fa->fa_tos, cfg->fc_type,
+						 tb->tb_id);
+			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+				  tb->tb_id, &cfg->fc_nlinfo, nlflags);
+
 			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
 
 			alias_free_mem_rcu(fa);
@@ -1303,13 +1309,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 			if (state & FA_S_ACCESSED)
 				rt_cache_flush(cfg->fc_nlinfo.nl_net);
 
-			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
-						 key, plen, fi,
-						 new_fa->fa_tos, cfg->fc_type,
-						 tb->tb_id, cfg->fc_nlflags);
-			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
-				tb->tb_id, &cfg->fc_nlinfo, nlflags);
-
 			goto succeeded;
 		}
 		/* Error if we find a perfect match which
@@ -1319,10 +1318,12 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 		if (fa_match)
 			goto out;
 
-		if (cfg->fc_nlflags & NLM_F_APPEND)
+		if (cfg->fc_nlflags & NLM_F_APPEND) {
+			event = FIB_EVENT_ENTRY_APPEND;
 			nlflags |= NLM_F_APPEND;
-		else
+		} else {
 			fa = fa_first;
+		}
 	}
 	err = -ENOENT;
 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1351,8 +1352,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 		tb->tb_num_default++;
 
 	rt_cache_flush(cfg->fc_nlinfo.nl_net);
-	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
-				 cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
+	call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type,
+				 tb->tb_id);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
 		  &cfg->fc_nlinfo, nlflags);
 succeeded:
@@ -1653,8 +1654,8 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
 		return -ESRCH;
 
 	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
-				 fa_to_delete->fa_info, tos, cfg->fc_type,
-				 tb->tb_id, 0);
+				 fa_to_delete->fa_info, tos,
+				 fa_to_delete->fa_type, tb->tb_id);
 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 
@@ -1963,7 +1964,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
 		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
 			struct fib_info *fi = fa->fa_info;
 
-			if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
+			if (!fi || !(fi->fib_flags & RTNH_F_DEAD) ||
+			    tb->tb_id != fa->tb_id) {
 				slen = fa->fa_slen;
 				continue;
 			}
@@ -1972,7 +1974,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
 						 n->key,
 						 KEYLENGTH - fa->fa_slen,
 						 fi, fa->fa_tos, fa->fa_type,
-						 tb->tb_id, 0);
+						 tb->tb_id);
 			hlist_del_rcu(&fa->fa_list);
 			fib_release_info(fa->fa_info);
 			alias_free_mem_rcu(fa);
@@ -2012,7 +2014,7 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l,
 
 		call_fib_entry_notifier(nb, net, event_type, l->key,
 					KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
-					fa->fa_type, fa->tb_id, 0);
+					fa->fa_type, fa->tb_id);
 	}
 }
 
@@ -2386,7 +2388,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
 
 	seq_printf(seq,
 		   "Basic info: size of leaf:"
-		   " %Zd bytes, size of tnode: %Zd bytes.\n",
+		   " %zd bytes, size of tnode: %zd bytes.\n",
 		   LEAF_SIZE, TNODE_SIZE(0));
 
 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0777ea949223..fc310db2708b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net)
 	return *this_cpu_ptr(net->ipv4.icmp_sk);
 }
 
+/* Called with BH disabled */
 static inline struct sock *icmp_xmit_lock(struct net *net)
 {
 	struct sock *sk;
 
-	local_bh_disable();
-
 	sk = icmp_sk(net);
 
 	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
 		/* This can happen if the output path signals a
 		 * dst_link_failure() for an outgoing ICMP packet.
 		 */
-		local_bh_enable();
 		return NULL;
 	}
 	return sk;
@@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
 
 static inline void icmp_xmit_unlock(struct sock *sk)
 {
-	spin_unlock_bh(&sk->sk_lock.slock);
+	spin_unlock(&sk->sk_lock.slock);
 }
 
 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
@@ -282,6 +280,33 @@ bool icmp_global_allow(void)
 }
 EXPORT_SYMBOL(icmp_global_allow);
 
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
+	if (type > NR_ICMP_TYPES)
+		return true;
+
+	/* Don't limit PMTU discovery. */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		return true;
+
+	/* Limit if icmp type is enabled in ratemask. */
+	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+		return true;
+
+	return false;
+}
+
+static bool icmpv4_global_allow(struct net *net, int type, int code)
+{
+	if (icmpv4_mask_allow(net, type, code))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  *	Send an ICMP frame.
  */
@@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 			       struct flowi4 *fl4, int type, int code)
 {
 	struct dst_entry *dst = &rt->dst;
+	struct inet_peer *peer;
 	bool rc = true;
+	int vif;
 
-	if (type > NR_ICMP_TYPES)
-		goto out;
-
-	/* Don't limit PMTU discovery. */
-	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+	if (icmpv4_mask_allow(net, type, code))
 		goto out;
 
 	/* No rate limit on loopback */
 	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
 		goto out;
 
-	/* Limit if icmp type is enabled in ratemask. */
-	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
-		goto out;
-
-	rc = false;
-	if (icmp_global_allow()) {
-		int vif = l3mdev_master_ifindex(dst->dev);
-		struct inet_peer *peer;
-
-		peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
-		rc = inet_peer_xrlim_allow(peer,
-					   net->ipv4.sysctl_icmp_ratelimit);
-		if (peer)
-			inet_putpeer(peer);
-	}
+	vif = l3mdev_master_ifindex(dst->dev);
+	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+	rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+	if (peer)
+		inet_putpeer(peer);
 out:
 	return rc;
 }
@@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct inet_sock *inet;
 	__be32 daddr, saddr;
 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
+	int type = icmp_param->data.icmph.type;
+	int code = icmp_param->data.icmph.code;
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
 
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
+
+	/* global icmp_msgs_per_sec */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_bh_enable;
+
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		return;
+		goto out_bh_enable;
 	inet = inet_sk(sk);
 
 	icmp_param->data.icmph.checksum = 0;
@@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
 		goto out_unlock;
-	if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
-			       icmp_param->data.icmph.code))
+	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 {
 	struct iphdr *iph;
 	int room;
-	struct icmp_bxm *icmp_param;
+	struct icmp_bxm icmp_param;
 	struct rtable *rt = skb_rtable(skb_in);
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
@@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		}
 	}
 
-	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
-	if (!icmp_param)
-		return;
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
+
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_bh_enable;
 
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		goto out_free;
+		goto out_bh_enable;
 
 	/*
 	 *	Construct source address and options.
@@ -681,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 					  iph->tos;
 	mark = IP4_REPLY_MARK(net, skb_in->mark);
 
-	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
+	if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
 		goto out_unlock;
 
 
@@ -689,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	 *	Prepare data for ICMP header.
 	 */
 
-	icmp_param->data.icmph.type	 = type;
-	icmp_param->data.icmph.code	 = code;
-	icmp_param->data.icmph.un.gateway = info;
-	icmp_param->data.icmph.checksum	 = 0;
-	icmp_param->skb	  = skb_in;
-	icmp_param->offset = skb_network_offset(skb_in);
+	icmp_param.data.icmph.type	 = type;
+	icmp_param.data.icmph.code	 = code;
+	icmp_param.data.icmph.un.gateway = info;
+	icmp_param.data.icmph.checksum	 = 0;
+	icmp_param.skb	  = skb_in;
+	icmp_param.offset = skb_network_offset(skb_in);
 	inet_sk(sk)->tos = tos;
 	sk->sk_mark = mark;
 	ipc.addr = iph->saddr;
-	ipc.opt = &icmp_param->replyopts.opt;
+	ipc.opt = &icmp_param.replyopts.opt;
 	ipc.tx_flags = 0;
 	ipc.ttl = 0;
 	ipc.tos = -1;
 
 	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
-			       type, code, icmp_param);
+			       type, code, &icmp_param);
 	if (IS_ERR(rt))
 		goto out_unlock;
 
+	/* peer icmp_ratelimit */
 	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		goto ende;
 
@@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	room = dst_mtu(&rt->dst);
 	if (room > 576)
 		room = 576;
-	room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
+	room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
 	room -= sizeof(struct icmphdr);
 
-	icmp_param->data_len = skb_in->len - icmp_param->offset;
-	if (icmp_param->data_len > room)
-		icmp_param->data_len = room;
-	icmp_param->head_len = sizeof(struct icmphdr);
+	icmp_param.data_len = skb_in->len - icmp_param.offset;
+	if (icmp_param.data_len > room)
+		icmp_param.data_len = room;
+	icmp_param.head_len = sizeof(struct icmphdr);
 
-	icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+	icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
 ende:
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
-out_free:
-	kfree(icmp_param);
+out_bh_enable:
+	local_bh_enable();
 out:;
 }
 EXPORT_SYMBOL(icmp_send);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5b15459955f8..44fd86de2823 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1172,6 +1172,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 				psf->sf_crcount = im->crcount;
 		}
 		in_dev_put(pmc->interface);
+		kfree(pmc);
 	}
 	spin_unlock_bh(&im->lock);
 }
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 19ea045c50ed..b4d5980ade3b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -31,6 +31,86 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
 EXPORT_SYMBOL(inet_csk_timer_bug_msg);
 #endif
 
+#if IS_ENABLED(CONFIG_IPV6)
+/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
+ *                          only, and any IPv4 addresses if not IPv6 only
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ *                          and 0.0.0.0 equals to 0.0.0.0 only
+ */
+static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
+				const struct in6_addr *sk2_rcv_saddr6,
+				__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+				bool sk1_ipv6only, bool sk2_ipv6only,
+				bool match_wildcard)
+{
+	int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
+	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
+
+	/* if both are mapped, treat as IPv4 */
+	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+		if (!sk2_ipv6only) {
+			if (sk1_rcv_saddr == sk2_rcv_saddr)
+				return 1;
+			if (!sk1_rcv_saddr || !sk2_rcv_saddr)
+				return match_wildcard;
+		}
+		return 0;
+	}
+
+	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+		return 1;
+
+	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+		return 1;
+
+	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+	    !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
+		return 1;
+
+	if (sk2_rcv_saddr6 &&
+	    ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
+		return 1;
+
+	return 0;
+}
+#endif
+
+/* match_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ *                          0.0.0.0 only equals to 0.0.0.0
+ */
+static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+				bool sk2_ipv6only, bool match_wildcard)
+{
+	if (!sk2_ipv6only) {
+		if (sk1_rcv_saddr == sk2_rcv_saddr)
+			return 1;
+		if (!sk1_rcv_saddr || !sk2_rcv_saddr)
+			return match_wildcard;
+	}
+	return 0;
+}
+
+int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+			 bool match_wildcard)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
+					    inet6_rcv_saddr(sk2),
+					    sk->sk_rcv_saddr,
+					    sk2->sk_rcv_saddr,
+					    ipv6_only_sock(sk),
+					    ipv6_only_sock(sk2),
+					    match_wildcard);
+#endif
+	return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
+				    ipv6_only_sock(sk2), match_wildcard);
+}
+EXPORT_SYMBOL(inet_rcv_saddr_equal);
+
 void inet_get_local_port_range(struct net *net, int *low, int *high)
 {
 	unsigned int seq;
@@ -44,9 +124,9 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
 }
 EXPORT_SYMBOL(inet_get_local_port_range);
 
-int inet_csk_bind_conflict(const struct sock *sk,
-			   const struct inet_bind_bucket *tb, bool relax,
-			   bool reuseport_ok)
+static int inet_csk_bind_conflict(const struct sock *sk,
+				  const struct inet_bind_bucket *tb,
+				  bool relax, bool reuseport_ok)
 {
 	struct sock *sk2;
 	bool reuse = sk->sk_reuse;
@@ -62,7 +142,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
 
 	sk_for_each_bound(sk2, &tb->owners) {
 		if (sk != sk2 &&
-		    !inet_v6_ipv6only(sk2) &&
 		    (!sk->sk_bound_dev_if ||
 		     !sk2->sk_bound_dev_if ||
 		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
@@ -72,54 +151,34 @@ int inet_csk_bind_conflict(const struct sock *sk,
 			     rcu_access_pointer(sk->sk_reuseport_cb) ||
 			     (sk2->sk_state != TCP_TIME_WAIT &&
 			     !uid_eq(uid, sock_i_uid(sk2))))) {
-
-				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
-				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+				if (inet_rcv_saddr_equal(sk, sk2, true))
 					break;
 			}
 			if (!relax && reuse && sk2->sk_reuse &&
 			    sk2->sk_state != TCP_LISTEN) {
-
-				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
-				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+				if (inet_rcv_saddr_equal(sk, sk2, true))
 					break;
 			}
 		}
 	}
 	return sk2 != NULL;
 }
-EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- * We try to allocate an odd port (and leave even ports for connect())
+/*
+ * Find an open port number for the socket.  Returns with the
+ * inet_bind_hashbucket lock held.
  */
-int inet_csk_get_port(struct sock *sk, unsigned short snum)
+static struct inet_bind_hashbucket *
+inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
 {
-	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
-	int ret = 1, attempts = 5, port = snum;
-	int smallest_size = -1, smallest_port;
+	int port = 0;
 	struct inet_bind_hashbucket *head;
 	struct net *net = sock_net(sk);
 	int i, low, high, attempt_half;
 	struct inet_bind_bucket *tb;
-	kuid_t uid = sock_i_uid(sk);
 	u32 remaining, offset;
-	bool reuseport_ok = !!snum;
 
-	if (port) {
-have_port:
-		head = &hinfo->bhash[inet_bhashfn(net, port,
-						  hinfo->bhash_size)];
-		spin_lock_bh(&head->lock);
-		inet_bind_bucket_for_each(tb, &head->chain)
-			if (net_eq(ib_net(tb), net) && tb->port == port)
-				goto tb_found;
-
-		goto tb_not_found;
-	}
-again:
 	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
 other_half_scan:
 	inet_get_local_port_range(net, &low, &high);
@@ -143,8 +202,6 @@ other_half_scan:
 	 * We do the opposite to not pollute connect() users.
 	 */
 	offset |= 1U;
-	smallest_size = -1;
-	smallest_port = low; /* avoid compiler warning */
 
 other_parity_scan:
 	port = low + offset;
@@ -158,30 +215,17 @@ other_parity_scan:
 		spin_lock_bh(&head->lock);
 		inet_bind_bucket_for_each(tb, &head->chain)
 			if (net_eq(ib_net(tb), net) && tb->port == port) {
-				if (((tb->fastreuse > 0 && reuse) ||
-				     (tb->fastreuseport > 0 &&
-				      sk->sk_reuseport &&
-				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-				      uid_eq(tb->fastuid, uid))) &&
-				    (tb->num_owners < smallest_size || smallest_size == -1)) {
-					smallest_size = tb->num_owners;
-					smallest_port = port;
-				}
-				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false,
-									      reuseport_ok))
-					goto tb_found;
+				if (!inet_csk_bind_conflict(sk, tb, false, false))
+					goto success;
 				goto next_port;
 			}
-		goto tb_not_found;
+		tb = NULL;
+		goto success;
 next_port:
 		spin_unlock_bh(&head->lock);
 		cond_resched();
 	}
 
-	if (smallest_size != -1) {
-		port = smallest_port;
-		goto have_port;
-	}
 	offset--;
 	if (!(offset & 1))
 		goto other_parity_scan;
@@ -191,8 +235,74 @@ next_port:
 		attempt_half = 2;
 		goto other_half_scan;
 	}
-	return ret;
+	return NULL;
+success:
+	*port_ret = port;
+	*tb_ret = tb;
+	return head;
+}
 
+static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
+				     struct sock *sk)
+{
+	kuid_t uid = sock_i_uid(sk);
+
+	if (tb->fastreuseport <= 0)
+		return 0;
+	if (!sk->sk_reuseport)
+		return 0;
+	if (rcu_access_pointer(sk->sk_reuseport_cb))
+		return 0;
+	if (!uid_eq(tb->fastuid, uid))
+		return 0;
+	/* We only need to check the rcv_saddr if this tb was once marked
+	 * without fastreuseport and then was reset, as we can only know that
+	 * the fast_*rcv_saddr doesn't have any conflicts with the socks on the
+	 * owners list.
+	 */
+	if (tb->fastreuseport == FASTREUSEPORT_ANY)
+		return 1;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (tb->fast_sk_family == AF_INET6)
+		return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
+					    &sk->sk_v6_rcv_saddr,
+					    tb->fast_rcv_saddr,
+					    sk->sk_rcv_saddr,
+					    tb->fast_ipv6_only,
+					    ipv6_only_sock(sk), true);
+#endif
+	return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
+				    ipv6_only_sock(sk), true);
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	int ret = 1, port = snum;
+	struct inet_bind_hashbucket *head;
+	struct net *net = sock_net(sk);
+	struct inet_bind_bucket *tb = NULL;
+	kuid_t uid = sock_i_uid(sk);
+
+	if (!port) {
+		head = inet_csk_find_open_port(sk, &tb, &port);
+		if (!head)
+			return ret;
+		if (!tb)
+			goto tb_not_found;
+		goto success;
+	}
+	head = &hinfo->bhash[inet_bhashfn(net, port,
+					  hinfo->bhash_size)];
+	spin_lock_bh(&head->lock);
+	inet_bind_bucket_for_each(tb, &head->chain)
+		if (net_eq(ib_net(tb), net) && tb->port == port)
+			goto tb_found;
 tb_not_found:
 	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
 				     net, head, port);
@@ -203,39 +313,54 @@ tb_found:
 		if (sk->sk_reuse == SK_FORCE_REUSE)
 			goto success;
 
-		if (((tb->fastreuse > 0 && reuse) ||
-		     (tb->fastreuseport > 0 &&
-		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
-		    smallest_size == -1)
+		if ((tb->fastreuse > 0 && reuse) ||
+		    sk_reuseport_match(tb, sk))
 			goto success;
-		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true,
-							     reuseport_ok)) {
-			if ((reuse ||
-			     (tb->fastreuseport > 0 &&
-			      sk->sk_reuseport &&
-			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-			      uid_eq(tb->fastuid, uid))) &&
-			    !snum && smallest_size != -1 && --attempts >= 0) {
-				spin_unlock_bh(&head->lock);
-				goto again;
-			}
+		if (inet_csk_bind_conflict(sk, tb, true, true))
 			goto fail_unlock;
+	}
+success:
+	if (!hlist_empty(&tb->owners)) {
+		tb->fastreuse = reuse;
+		if (sk->sk_reuseport) {
+			tb->fastreuseport = FASTREUSEPORT_ANY;
+			tb->fastuid = uid;
+			tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+			tb->fast_ipv6_only = ipv6_only_sock(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+			tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+		} else {
+			tb->fastreuseport = 0;
 		}
+	} else {
 		if (!reuse)
 			tb->fastreuse = 0;
-		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
-			tb->fastreuseport = 0;
-	} else {
-		tb->fastreuse = reuse;
 		if (sk->sk_reuseport) {
-			tb->fastreuseport = 1;
-			tb->fastuid = uid;
+			/* We didn't match or we don't have fastreuseport set on
+			 * the tb, but we have sk_reuseport set on this socket
+			 * and we know that there are no bind conflicts with
+			 * this socket in this tb, so reset our tb's reuseport
+			 * settings so that any subsequent sockets that match
+			 * our current socket will be put on the fast path.
+			 *
+			 * If we reset we need to set FASTREUSEPORT_STRICT so we
+			 * do extra checking for all subsequent sk_reuseport
+			 * socks.
+			 */
+			if (!sk_reuseport_match(tb, sk)) {
+				tb->fastreuseport = FASTREUSEPORT_STRICT;
+				tb->fastuid = uid;
+				tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+				tb->fast_ipv6_only = ipv6_only_sock(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+				tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+			}
 		} else {
 			tb->fastreuseport = 0;
 		}
 	}
-success:
 	if (!inet_csk(sk)->icsk_bind_hash)
 		inet_bind_hash(sk, tb, port);
 	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
@@ -711,9 +836,8 @@ void inet_csk_destroy_sock(struct sock *sk)
 
 	sk_refcnt_debug_release(sk);
 
-	local_bh_disable();
 	percpu_counter_dec(sk->sk_prot->orphan_count);
-	local_bh_enable();
+
 	sock_put(sk);
 }
 EXPORT_SYMBOL(inet_csk_destroy_sock);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 4dea33e5f295..3828b3a805cd 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -215,7 +215,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		r->idiag_timer = 1;
 		r->idiag_retrans = icsk->icsk_retransmits;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ca97835bfec4..8bea74298173 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -73,7 +73,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 		tb->port      = snum;
 		tb->fastreuse = 0;
 		tb->fastreuseport = 0;
-		tb->num_owners = 0;
 		INIT_HLIST_HEAD(&tb->owners);
 		hlist_add_head(&tb->node, &head->chain);
 	}
@@ -96,7 +95,6 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 {
 	inet_sk(sk)->inet_num = snum;
 	sk_add_bind_node(sk, &tb->owners);
-	tb->num_owners++;
 	inet_csk(sk)->icsk_bind_hash = tb;
 }
 
@@ -114,7 +112,6 @@ static void __inet_put_port(struct sock *sk)
 	spin_lock(&head->lock);
 	tb = inet_csk(sk)->icsk_bind_hash;
 	__sk_del_bind_node(sk);
-	tb->num_owners--;
 	inet_csk(sk)->icsk_bind_hash = NULL;
 	inet_sk(sk)->inet_num = 0;
 	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
@@ -435,10 +432,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
 EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
 
 static int inet_reuseport_add_sock(struct sock *sk,
-				   struct inet_listen_hashbucket *ilb,
-				   int (*saddr_same)(const struct sock *sk1,
-						     const struct sock *sk2,
-						     bool match_wildcard))
+				   struct inet_listen_hashbucket *ilb)
 {
 	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
 	struct sock *sk2;
@@ -451,7 +445,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
 		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
 		    inet_csk(sk2)->icsk_bind_hash == tb &&
 		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
-		    saddr_same(sk, sk2, false))
+		    inet_rcv_saddr_equal(sk, sk2, false))
 			return reuseport_add_sock(sk, sk2);
 	}
 
@@ -461,10 +455,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
 	return 0;
 }
 
-int __inet_hash(struct sock *sk, struct sock *osk,
-		 int (*saddr_same)(const struct sock *sk1,
-				   const struct sock *sk2,
-				   bool match_wildcard))
+int __inet_hash(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct inet_listen_hashbucket *ilb;
@@ -479,7 +470,7 @@ int __inet_hash(struct sock *sk, struct sock *osk,
 
 	spin_lock(&ilb->lock);
 	if (sk->sk_reuseport) {
-		err = inet_reuseport_add_sock(sk, ilb, saddr_same);
+		err = inet_reuseport_add_sock(sk, ilb);
 		if (err)
 			goto unlock;
 	}
@@ -503,7 +494,7 @@ int inet_hash(struct sock *sk)
 
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
-		err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
+		err = __inet_hash(sk, NULL);
 		local_bh_enable();
 	}
 
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index ddcd56c08d14..f8aff2c71cde 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -257,8 +257,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
 }
 EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
 
-void inet_twsk_purge(struct inet_hashinfo *hashinfo,
-		     struct inet_timewait_death_row *twdr, int family)
+void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
 {
 	struct inet_timewait_sock *tw;
 	struct sock *sk;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b67719f45953..737ce826d7ec 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -222,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
 	if (!IS_ERR(neigh)) {
-		int res = dst_neigh_output(dst, neigh, skb);
+		int res;
+
+		sock_confirm_neigh(skb, neigh);
+		res = neigh_output(neigh, skb);
 
 		rcu_read_unlock_bh();
 		return res;
@@ -886,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk,
 
 		skb->csum = 0;
 
+		if (flags & MSG_CONFIRM)
+			skb_set_dst_pending_confirm(skb, 1);
+
 		__skb_queue_tail(queue, skb);
 	} else if (skb_is_gso(skb)) {
 		goto append;
@@ -1086,6 +1092,9 @@ alloc_new_skb:
 			exthdrlen = 0;
 			csummode = CHECKSUM_NONE;
 
+			if ((flags & MSG_CONFIRM) && !skb_prev)
+				skb_set_dst_pending_confirm(skb, 1);
+
 			/*
 			 * Put the packet on the pending queue.
 			 */
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 53ae0c6315ad..ebd953bc5607 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -116,10 +116,10 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
 	if (skb->ip_summed != CHECKSUM_COMPLETE)
 		return;
 
-	if (offset != 0)
-		csum = csum_sub(csum,
-				csum_partial(skb_transport_header(skb) + tlen,
-					     offset, 0));
+	if (offset != 0) {
+		int tend_off = skb_transport_offset(skb) + tlen;
+		csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0));
+	}
 
 	put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
 }
@@ -272,7 +272,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 			continue;
 		switch (cmsg->cmsg_type) {
 		case IP_RETOPTS:
-			err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+			err = cmsg->cmsg_len - sizeof(struct cmsghdr);
 
 			/* Our caller is responsible for freeing ipc->opt */
 			err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
@@ -843,6 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	{
 		struct ip_mreqn mreq;
 		struct net_device *dev = NULL;
+		int midx;
 
 		if (sk->sk_type == SOCK_STREAM)
 			goto e_inval;
@@ -887,11 +888,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		err = -EADDRNOTAVAIL;
 		if (!dev)
 			break;
+
+		midx = l3mdev_master_ifindex(dev);
+
 		dev_put(dev);
 
 		err = -EINVAL;
 		if (sk->sk_bound_dev_if &&
-		    mreq.imr_ifindex != sk->sk_bound_dev_if)
+		    mreq.imr_ifindex != sk->sk_bound_dev_if &&
+		    (!midx || midx != sk->sk_bound_dev_if))
 			break;
 
 		inet->mc_index = mreq.imr_ifindex;
@@ -1238,7 +1243,14 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
 		pktinfo->ipi_ifindex = 0;
 		pktinfo->ipi_spec_dst.s_addr = 0;
 	}
-	skb_dst_drop(skb);
+	/* We need to keep the dst for __ip_options_echo()
+	 * We could restrict the test to opt.ts_needtime || opt.srr,
+	 * but the following is good enough as IP options are not often used.
+	 */
+	if (unlikely(IPCB(skb)->opt.optlen))
+		skb_dst_force(skb);
+	else
+		skb_dst_drop(skb);
 }
 
 int ip_setsockopt(struct sock *sk, int level,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 0fd1976ab63b..a31f47ccaad9 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -188,8 +188,8 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
 EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
 
 /* Often modified stats are per cpu, other are shared (netdev->stats) */
-struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
-						struct rtnl_link_stats64 *tot)
+void ip_tunnel_get_stats64(struct net_device *dev,
+			   struct rtnl_link_stats64 *tot)
 {
 	int i;
 
@@ -214,8 +214,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
 		tot->rx_bytes   += rx_bytes;
 		tot->tx_bytes   += tx_bytes;
 	}
-
-	return tot;
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
 
@@ -228,7 +226,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 	[LWTUNNEL_IP_FLAGS]	= { .type = NLA_U16 },
 };
 
-static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
+static int ip_tun_build_state(struct nlattr *attr,
 			      unsigned int family, const void *cfg,
 			      struct lwtunnel_state **ts)
 {
@@ -325,7 +323,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
 	[LWTUNNEL_IP6_FLAGS]		= { .type = NLA_U16 },
 };
 
-static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
+static int ip6_tun_build_state(struct nlattr *attr,
 			       unsigned int family, const void *cfg,
 			       struct lwtunnel_state **ts)
 {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index efc1e76d4977..c0317c940bcd 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -299,10 +299,29 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 }
 #endif
 
+static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
+				const void *ptr)
+{
+	const struct mfc_cache_cmp_arg *cmparg = arg->key;
+	struct mfc_cache *c = (struct mfc_cache *)ptr;
+
+	return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
+	       cmparg->mfc_origin != c->mfc_origin;
+}
+
+static const struct rhashtable_params ipmr_rht_params = {
+	.head_offset = offsetof(struct mfc_cache, mnode),
+	.key_offset = offsetof(struct mfc_cache, cmparg),
+	.key_len = sizeof(struct mfc_cache_cmp_arg),
+	.nelem_hint = 3,
+	.locks_mul = 1,
+	.obj_cmpfn = ipmr_hash_cmp,
+	.automatic_shrinking = true,
+};
+
 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 {
 	struct mr_table *mrt;
-	unsigned int i;
 
 	/* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
 	if (id != RT_TABLE_DEFAULT && id >= 1000000000)
@@ -318,10 +337,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 	write_pnet(&mrt->net, net);
 	mrt->id = id;
 
-	/* Forwarding cache */
-	for (i = 0; i < MFC_LINES; i++)
-		INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
-
+	rhltable_init(&mrt->mfc_hash, &ipmr_rht_params);
+	INIT_LIST_HEAD(&mrt->mfc_cache_list);
 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
 
 	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
@@ -338,6 +355,7 @@ static void ipmr_free_table(struct mr_table *mrt)
 {
 	del_timer_sync(&mrt->ipmr_expire_timer);
 	mroute_clean_tables(mrt, true);
+	rhltable_destroy(&mrt->mfc_hash);
 	kfree(mrt);
 }
 
@@ -839,13 +857,17 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 					 __be32 origin,
 					 __be32 mcastgrp)
 {
-	int line = MFC_HASH(mcastgrp, origin);
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = origin
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc_cache *c;
 
-	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
-		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
-			return c;
-	}
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		return c;
+
 	return NULL;
 }
 
@@ -853,13 +875,16 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
 						    int vifi)
 {
-	int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = htonl(INADDR_ANY),
+			.mfc_origin = htonl(INADDR_ANY)
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc_cache *c;
 
-	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
-		if (c->mfc_origin == htonl(INADDR_ANY) &&
-		    c->mfc_mcastgrp == htonl(INADDR_ANY) &&
-		    c->mfc_un.res.ttls[vifi] < 255)
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		if (c->mfc_un.res.ttls[vifi] < 255)
 			return c;
 
 	return NULL;
@@ -869,29 +894,51 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
 static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
 					     __be32 mcastgrp, int vifi)
 {
-	int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY));
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = htonl(INADDR_ANY)
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc_cache *c, *proxy;
 
 	if (mcastgrp == htonl(INADDR_ANY))
 		goto skip;
 
-	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
-		if (c->mfc_origin == htonl(INADDR_ANY) &&
-		    c->mfc_mcastgrp == mcastgrp) {
-			if (c->mfc_un.res.ttls[vifi] < 255)
-				return c;
-
-			/* It's ok if the vifi is part of the static tree */
-			proxy = ipmr_cache_find_any_parent(mrt,
-							   c->mfc_parent);
-			if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
-				return c;
-		}
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
+		if (c->mfc_un.res.ttls[vifi] < 255)
+			return c;
+
+		/* It's ok if the vifi is part of the static tree */
+		proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent);
+		if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+			return c;
+	}
 
 skip:
 	return ipmr_cache_find_any_parent(mrt, vifi);
 }
 
+/* Look for a (S,G,iif) entry if parent != -1 */
+static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
+						__be32 origin, __be32 mcastgrp,
+						int parent)
+{
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = origin,
+	};
+	struct rhlist_head *tmp, *list;
+	struct mfc_cache *c;
+
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		if (parent == -1 || parent == c->mfc_parent)
+			return c;
+
+	return NULL;
+}
+
 /* Allocate a multicast cache entry */
 static struct mfc_cache *ipmr_cache_alloc(void)
 {
@@ -1028,10 +1075,10 @@ static int ipmr_cache_report(struct mr_table *mrt,
 static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 				 struct sk_buff *skb)
 {
+	const struct iphdr *iph = ip_hdr(skb);
+	struct mfc_cache *c;
 	bool found = false;
 	int err;
-	struct mfc_cache *c;
-	const struct iphdr *iph = ip_hdr(skb);
 
 	spin_lock_bh(&mfc_unres_lock);
 	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
@@ -1095,46 +1142,39 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 
 static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
 {
-	int line;
-	struct mfc_cache *c, *next;
+	struct mfc_cache *c;
 
-	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+	/* The entries are added/deleted only under RTNL */
+	rcu_read_lock();
+	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+				   mfc->mfcc_mcastgrp.s_addr, parent);
+	rcu_read_unlock();
+	if (!c)
+		return -ENOENT;
+	rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+	list_del_rcu(&c->list);
+	mroute_netlink_event(mrt, c, RTM_DELROUTE);
+	ipmr_cache_free(c);
 
-	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
-		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
-		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
-		    (parent == -1 || parent == c->mfc_parent)) {
-			list_del_rcu(&c->list);
-			mroute_netlink_event(mrt, c, RTM_DELROUTE);
-			ipmr_cache_free(c);
-			return 0;
-		}
-	}
-	return -ENOENT;
+	return 0;
 }
 
 static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 			struct mfcctl *mfc, int mrtsock, int parent)
 {
-	bool found = false;
-	int line;
 	struct mfc_cache *uc, *c;
+	bool found;
+	int ret;
 
 	if (mfc->mfcc_parent >= MAXVIFS)
 		return -ENFILE;
 
-	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
-
-	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
-		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
-		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
-		    (parent == -1 || parent == c->mfc_parent)) {
-			found = true;
-			break;
-		}
-	}
-
-	if (found) {
+	/* The entries are added/deleted only under RTNL */
+	rcu_read_lock();
+	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+				   mfc->mfcc_mcastgrp.s_addr, parent);
+	rcu_read_unlock();
+	if (c) {
 		write_lock_bh(&mrt_lock);
 		c->mfc_parent = mfc->mfcc_parent;
 		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
@@ -1160,8 +1200,14 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 	if (!mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
-	list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
-
+	ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
+				  ipmr_rht_params);
+	if (ret) {
+		pr_err("ipmr: rhtable insert error %d\n", ret);
+		ipmr_cache_free(c);
+		return ret;
+	}
+	list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
 	/* Check to see if we resolved a queued list. If so we
 	 * need to send on the frames and tidy up.
 	 */
@@ -1191,9 +1237,9 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 /* Close the multicast socket, and clear the vif tables etc */
 static void mroute_clean_tables(struct mr_table *mrt, bool all)
 {
-	int i;
+	struct mfc_cache *c, *tmp;
 	LIST_HEAD(list);
-	struct mfc_cache *c, *next;
+	int i;
 
 	/* Shut down all active vif entries */
 	for (i = 0; i < mrt->maxvif; i++) {
@@ -1204,19 +1250,18 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 	unregister_netdevice_many(&list);
 
 	/* Wipe the cache */
-	for (i = 0; i < MFC_LINES; i++) {
-		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
-			if (!all && (c->mfc_flags & MFC_STATIC))
-				continue;
-			list_del_rcu(&c->list);
-			mroute_netlink_event(mrt, c, RTM_DELROUTE);
-			ipmr_cache_free(c);
-		}
+	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+		if (!all && (c->mfc_flags & MFC_STATIC))
+			continue;
+		rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+		list_del_rcu(&c->list);
+		mroute_netlink_event(mrt, c, RTM_DELROUTE);
+		ipmr_cache_free(c);
 	}
 
 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
 		spin_lock_bh(&mfc_unres_lock);
-		list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
 			list_del(&c->list);
 			mroute_netlink_event(mrt, c, RTM_DELROUTE);
 			ipmr_destroy_unres(mrt, c);
@@ -1791,9 +1836,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 			  struct sk_buff *skb, struct mfc_cache *cache,
 			  int local)
 {
+	int true_vifi = ipmr_find_vif(mrt, skb->dev);
 	int psend = -1;
 	int vif, ct;
-	int true_vifi = ipmr_find_vif(mrt, skb->dev);
 
 	vif = cache->mfc_parent;
 	cache->mfc_un.res.pkt++;
@@ -2091,8 +2136,10 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	int ct;
 
 	/* If cache is unresolved, don't try to parse IIF and OIF */
-	if (c->mfc_parent >= MAXVIFS)
+	if (c->mfc_parent >= MAXVIFS) {
+		rtm->rtm_flags |= RTNH_F_UNRESOLVED;
 		return -ENOENT;
+	}
 
 	if (VIF_EXISTS(mrt, c->mfc_parent) &&
 	    nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
@@ -2134,7 +2181,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
-		   struct rtmsg *rtm, int nowait, u32 portid)
+		   struct rtmsg *rtm, u32 portid)
 {
 	struct mfc_cache *cache;
 	struct mr_table *mrt;
@@ -2158,11 +2205,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		struct net_device *dev;
 		int vif = -1;
 
-		if (nowait) {
-			rcu_read_unlock();
-			return -EAGAIN;
-		}
-
 		dev = skb->dev;
 		read_lock(&mrt_lock);
 		if (dev)
@@ -2296,34 +2338,30 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	struct mr_table *mrt;
 	struct mfc_cache *mfc;
 	unsigned int t = 0, s_t;
-	unsigned int h = 0, s_h;
 	unsigned int e = 0, s_e;
 
 	s_t = cb->args[0];
-	s_h = cb->args[1];
-	s_e = cb->args[2];
+	s_e = cb->args[1];
 
 	rcu_read_lock();
 	ipmr_for_each_table(mrt, net) {
 		if (t < s_t)
 			goto next_table;
-		if (t > s_t)
-			s_h = 0;
-		for (h = s_h; h < MFC_LINES; h++) {
-			list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
-				if (e < s_e)
-					goto next_entry;
-				if (ipmr_fill_mroute(mrt, skb,
-						     NETLINK_CB(cb->skb).portid,
-						     cb->nlh->nlmsg_seq,
-						     mfc, RTM_NEWROUTE,
-						     NLM_F_MULTI) < 0)
-					goto done;
+		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+			if (e < s_e)
+				goto next_entry;
+			if (ipmr_fill_mroute(mrt, skb,
+					     NETLINK_CB(cb->skb).portid,
+					     cb->nlh->nlmsg_seq,
+					     mfc, RTM_NEWROUTE,
+					     NLM_F_MULTI) < 0)
+				goto done;
 next_entry:
-				e++;
-			}
-			e = s_e = 0;
+			e++;
 		}
+		e = 0;
+		s_e = 0;
+
 		spin_lock_bh(&mfc_unres_lock);
 		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
 			if (e < s_e)
@@ -2340,16 +2378,15 @@ next_entry2:
 			e++;
 		}
 		spin_unlock_bh(&mfc_unres_lock);
-		e = s_e = 0;
-		s_h = 0;
+		e = 0;
+		s_e = 0;
 next_table:
 		t++;
 	}
 done:
 	rcu_read_unlock();
 
-	cb->args[2] = e;
-	cb->args[1] = h;
+	cb->args[1] = e;
 	cb->args[0] = t;
 
 	return skb->len;
@@ -2559,7 +2596,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 		const char *name =  vif->dev ? vif->dev->name : "none";
 
 		seq_printf(seq,
-			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
+			   "%2zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
 			   vif - mrt->vif_table,
 			   name, vif->bytes_in, vif->pkt_in,
 			   vif->bytes_out, vif->pkt_out,
@@ -2593,10 +2630,8 @@ struct ipmr_mfc_iter {
 	struct seq_net_private p;
 	struct mr_table *mrt;
 	struct list_head *cache;
-	int ct;
 };
 
-
 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 					  struct ipmr_mfc_iter *it, loff_t pos)
 {
@@ -2604,12 +2639,10 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 	struct mfc_cache *mfc;
 
 	rcu_read_lock();
-	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
-		it->cache = &mrt->mfc_cache_array[it->ct];
-		list_for_each_entry_rcu(mfc, it->cache, list)
-			if (pos-- == 0)
-				return mfc;
-	}
+	it->cache = &mrt->mfc_cache_list;
+	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+		if (pos-- == 0)
+			return mfc;
 	rcu_read_unlock();
 
 	spin_lock_bh(&mfc_unres_lock);
@@ -2636,17 +2669,16 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 
 	it->mrt = mrt;
 	it->cache = NULL;
-	it->ct = 0;
 	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
 		: SEQ_START_TOKEN;
 }
 
 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct mfc_cache *mfc = v;
 	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct mr_table *mrt = it->mrt;
+	struct mfc_cache *mfc = v;
 
 	++*pos;
 
@@ -2659,19 +2691,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (it->cache == &mrt->mfc_unres_queue)
 		goto end_of_list;
 
-	BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
-
-	while (++it->ct < MFC_LINES) {
-		it->cache = &mrt->mfc_cache_array[it->ct];
-		if (list_empty(it->cache))
-			continue;
-		return list_first_entry(it->cache, struct mfc_cache, list);
-	}
-
 	/* exhausted cache_array, show unresolved */
 	rcu_read_unlock();
 	it->cache = &mrt->mfc_unres_queue;
-	it->ct = 0;
 
 	spin_lock_bh(&mfc_unres_lock);
 	if (!list_empty(it->cache))
@@ -2691,7 +2713,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 
 	if (it->cache == &mrt->mfc_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
-	else if (it->cache == &mrt->mfc_cache_array[it->ct])
+	else if (it->cache == &mrt->mfc_cache_list)
 		rcu_read_unlock();
 }
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index a467e1236c43..6241a81fd7f5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -677,11 +677,6 @@ static int copy_entries_to_user(unsigned int total_size,
 		return PTR_ERR(counters);
 
 	loc_cpu_entry = private->entries;
-	/* ... then copy entire thing ... */
-	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
-		ret = -EFAULT;
-		goto free_counters;
-	}
 
 	/* FIXME: use iterator macros --RR */
 	/* ... then go back and fix counters and names */
@@ -689,6 +684,10 @@ static int copy_entries_to_user(unsigned int total_size,
 		const struct xt_entry_target *t;
 
 		e = (struct arpt_entry *)(loc_cpu_entry + off);
+		if (copy_to_user(userptr + off, e, sizeof(*e))) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
 		if (copy_to_user(userptr + off
 				 + offsetof(struct arpt_entry, counters),
 				 &counters[num],
@@ -698,11 +697,7 @@ static int copy_entries_to_user(unsigned int total_size,
 		}
 
 		t = arpt_get_target_c(e);
-		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct xt_entry_target,
-					    u.user.name),
-				 t->u.kernel.target->name,
-				 strlen(t->u.kernel.target->name)+1) != 0) {
+		if (xt_target_to_user(t, userptr + off + e->target_offset)) {
 			ret = -EFAULT;
 			goto free_counters;
 		}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 91656a1d8fbd..384b85713e06 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -826,10 +826,6 @@ copy_entries_to_user(unsigned int total_size,
 		return PTR_ERR(counters);
 
 	loc_cpu_entry = private->entries;
-	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
-		ret = -EFAULT;
-		goto free_counters;
-	}
 
 	/* FIXME: use iterator macros --RR */
 	/* ... then go back and fix counters and names */
@@ -839,6 +835,10 @@ copy_entries_to_user(unsigned int total_size,
 		const struct xt_entry_target *t;
 
 		e = (struct ipt_entry *)(loc_cpu_entry + off);
+		if (copy_to_user(userptr + off, e, sizeof(*e))) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
 		if (copy_to_user(userptr + off
 				 + offsetof(struct ipt_entry, counters),
 				 &counters[num],
@@ -852,23 +852,14 @@ copy_entries_to_user(unsigned int total_size,
 		     i += m->u.match_size) {
 			m = (void *)e + i;
 
-			if (copy_to_user(userptr + off + i
-					 + offsetof(struct xt_entry_match,
-						    u.user.name),
-					 m->u.kernel.match->name,
-					 strlen(m->u.kernel.match->name)+1)
-			    != 0) {
+			if (xt_match_to_user(m, userptr + off + i)) {
 				ret = -EFAULT;
 				goto free_counters;
 			}
 		}
 
 		t = ipt_get_target_c(e);
-		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct xt_entry_target,
-					    u.user.name),
-				 t->u.kernel.target->name,
-				 strlen(t->u.kernel.target->name)+1) != 0) {
+		if (xt_target_to_user(t, userptr + off + e->target_offset)) {
 			ret = -EFAULT;
 			goto free_counters;
 		}
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0a783cd73faf..52f26459efc3 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -485,6 +485,7 @@ static struct xt_target clusterip_tg_reg __read_mostly = {
 	.checkentry	= clusterip_tg_check,
 	.destroy	= clusterip_tg_destroy,
 	.targetsize	= sizeof(struct ipt_clusterip_tgt_info),
+	.usersize	= offsetof(struct ipt_clusterip_tgt_info, config),
 #ifdef CONFIG_COMPAT
 	.compatsize	= sizeof(struct compat_ipt_clusterip_tgt_info),
 #endif /* CONFIG_COMPAT */
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 30c0de53e254..3240a2614e82 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -57,8 +57,7 @@ synproxy_send_tcp(struct net *net,
 		goto free_nskb;
 
 	if (nfct) {
-		nskb->nfct = nfct;
-		nskb->nfctinfo = ctinfo;
+		nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
 		nf_conntrack_get(nfct);
 	}
 
@@ -107,8 +106,8 @@ synproxy_send_client_synack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static void
@@ -230,8 +229,8 @@ synproxy_send_client_ack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index fcfd071f4705..bc1486f2c064 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -235,7 +235,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	}
 
 	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
-		pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n",
+		pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
 			 *len, sizeof(struct sockaddr_in));
 		return -EINVAL;
 	}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index d075b3cf2400..73c591d8a9a8 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -128,16 +128,16 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
 /* Returns conntrack if it dealt with ICMP, and filled in skb fields */
 static int
 icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
-		 enum ip_conntrack_info *ctinfo,
 		 unsigned int hooknum)
 {
 	struct nf_conntrack_tuple innertuple, origtuple;
 	const struct nf_conntrack_l4proto *innerproto;
 	const struct nf_conntrack_tuple_hash *h;
 	const struct nf_conntrack_zone *zone;
+	enum ip_conntrack_info ctinfo;
 	struct nf_conntrack_zone tmp;
 
-	NF_CT_ASSERT(skb->nfct == NULL);
+	NF_CT_ASSERT(!skb_nfct(skb));
 	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
 
 	/* Are they talking about one of our connections? */
@@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 		return -NF_ACCEPT;
 	}
 
-	*ctinfo = IP_CT_RELATED;
+	ctinfo = IP_CT_RELATED;
 
 	h = nf_conntrack_find_get(net, zone, &innertuple);
 	if (!h) {
@@ -169,11 +169,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	}
 
 	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
-		*ctinfo += IP_CT_IS_REPLY;
+		ctinfo += IP_CT_IS_REPLY;
 
 	/* Update skb to refer to this connection */
-	skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
-	skb->nfctinfo = *ctinfo;
+	nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
 	return NF_ACCEPT;
 }
 
@@ -181,7 +180,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 static int
 icmp_error(struct net *net, struct nf_conn *tmpl,
 	   struct sk_buff *skb, unsigned int dataoff,
-	   enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+	   u8 pf, unsigned int hooknum)
 {
 	const struct icmphdr *icmph;
 	struct icmphdr _ih;
@@ -225,7 +224,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
 	    icmph->type != ICMP_REDIRECT)
 		return NF_ACCEPT;
 
-	return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
+	return icmp_error_message(net, tmpl, skb, hooknum);
 }
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 49bd6a54404f..346bf7ccac08 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
 {
 	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	if (skb->nfct) {
+	if (skb_nfct(skb)) {
 		enum ip_conntrack_info ctinfo;
 		const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 
@@ -75,7 +75,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
 #if !IS_ENABLED(CONFIG_NF_NAT)
 	/* Previously seen (loopback)?  Ignore.  Do this before
 	   fragment check. */
-	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+	if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
 		return NF_ACCEPT;
 #endif
 #endif
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index cf986e1c7bbd..f0dbff05fc28 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -68,10 +68,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	/* Avoid counting cloned packets towards the original connection. */
-	nf_conntrack_put(skb->nfct);
-	skb->nfct     = &nf_ct_untracked_get()->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
+	nf_reset(skb);
+	nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
+	nf_conntrack_get(skb_nfct(skb));
 #endif
 	/*
 	 * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index b24795e2ee6d..2f3895ddc275 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -69,7 +69,7 @@ static void dump_arp_packet(struct nf_log_buf *m,
 
 	ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
 	if (ap == NULL) {
-		nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
+		nf_log_buf_add(m, " INCOMPLETE [%zu bytes]",
 			       skb->len - sizeof(_arph));
 		return;
 	}
@@ -87,7 +87,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
 	struct nf_log_buf *m;
 
 	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
+	if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
 		return;
 
 	m = nf_log_buf_open();
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 856648966f4c..c83a9963269b 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -319,7 +319,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
 	struct nf_log_buf *m;
 
 	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
+	if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
 		return;
 
 	m = nf_log_buf_open();
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 86cca610f4c2..2af6244b83e2 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -433,9 +433,9 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		goto out;
 	}
 
-	pr_debug("after bind(): num = %d, dif = %d\n",
-		 (int)isk->inet_num,
-		 (int)sk->sk_bound_dev_if);
+	pr_debug("after bind(): num = %hu, dif = %d\n",
+		 isk->inet_num,
+		 sk->sk_bound_dev_if);
 
 	err = 0;
 	if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
@@ -642,6 +642,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
 {
 	struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
 
+	if (!skb)
+		return 0;
 	pfh->wcheck = csum_partial((char *)&pfh->icmph,
 		sizeof(struct icmphdr), pfh->wcheck);
 	pfh->icmph.checksum = csum_fold(pfh->wcheck);
@@ -848,7 +850,8 @@ out:
 	return err;
 
 do_confirm:
-	dst_confirm(&rt->dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 7143ca1a6af9..69cf49e8356d 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -57,15 +57,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	unsigned int frag_mem;
 	int orphans, sockets;
 
-	local_bh_disable();
 	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
 	sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
-	local_bh_enable();
 
 	socket_seq_show(seq);
 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
 		   sock_prot_inuse_get(net, &tcp_prot), orphans,
-		   atomic_read(&tcp_death_row.tw_count), sockets,
+		   atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
 		   proto_memory_allocated(&tcp_prot));
 	seq_printf(seq, "UDP: inuse %d mem %ld\n",
 		   sock_prot_inuse_get(net, &udp_prot),
@@ -264,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
 	SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
 	SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+	SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP),
 	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
 	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
 	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4e49e5cb001c..8119e1f66e03 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -383,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 
 	sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
 
+	if (flags & MSG_CONFIRM)
+		skb_set_dst_pending_confirm(skb, 1);
+
 	skb->transport_header = skb->network_header;
 	err = -EFAULT;
 	if (memcpy_from_msg(iph, msg, length))
@@ -666,7 +669,8 @@ out:
 	return len;
 
 do_confirm:
-	dst_confirm(&rt->dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 709ffe67d1de..8471dd116771 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -154,6 +154,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 					   struct sk_buff *skb,
 					   const void *daddr);
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
@@ -168,6 +169,7 @@ static struct dst_ops ipv4_dst_ops = {
 	.redirect =		ip_do_redirect,
 	.local_out =		__ip_local_out,
 	.neigh_lookup =		ipv4_neigh_lookup,
+	.confirm_neigh =	ipv4_confirm_neigh,
 };
 
 #define ECN_OR_COST(class)	TC_PRIO_##class
@@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	return neigh_create(&arp_tbl, pkey, dev);
 }
 
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+	struct net_device *dev = dst->dev;
+	const __be32 *pkey = daddr;
+	const struct rtable *rt;
+
+	rt = (const struct rtable *)dst;
+	if (rt->rt_gateway)
+		pkey = (const __be32 *)&rt->rt_gateway;
+	else if (!daddr ||
+		 (rt->rt_flags &
+		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
+		return;
+
+	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
+}
+
 #define IP_IDENTS_SZ 2048u
 
 static atomic_t *ip_idents __read_mostly;
@@ -1758,7 +1777,6 @@ standard_hash:
 
 static int ip_mkroute_input(struct sk_buff *skb,
 			    struct fib_result *res,
-			    const struct flowi4 *fl4,
 			    struct in_device *in_dev,
 			    __be32 daddr, __be32 saddr, u32 tos)
 {
@@ -1858,6 +1876,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	fl4.flowi4_flags = 0;
 	fl4.daddr = daddr;
 	fl4.saddr = saddr;
+	fl4.flowi4_uid = sock_net_uid(net, NULL);
 	err = fib_lookup(net, &fl4, &res, 0);
 	if (err != 0) {
 		if (!IN_DEV_FORWARD(in_dev))
@@ -1883,7 +1902,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res.type != RTN_UNICAST)
 		goto martian_destination;
 
-	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
+	err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos);
 out:	return err;
 
 brd_input:
@@ -1990,6 +2009,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 {
 	int res;
 
+	tos &= IPTOS_RT_MASK;
 	rcu_read_lock();
 
 	/* Multicast recognition logic is moved from route cache to here.
@@ -2454,7 +2474,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
-			u32 seq, int event, int nowait, unsigned int flags)
+			u32 seq, int event)
 {
 	struct rtable *rt = skb_rtable(skb);
 	struct rtmsg *r;
@@ -2463,7 +2483,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 	u32 error;
 	u32 metrics[RTAX_MAX];
 
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0);
 	if (!nlh)
 		return -EMSGSIZE;
 
@@ -2541,18 +2561,12 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
 			int err = ipmr_get_route(net, skb,
 						 fl4->saddr, fl4->daddr,
-						 r, nowait, portid);
+						 r, portid);
 
 			if (err <= 0) {
-				if (!nowait) {
-					if (err == 0)
-						return 0;
-					goto nla_put_failure;
-				} else {
-					if (err == -EMSGSIZE)
-						goto nla_put_failure;
-					error = err;
-				}
+				if (err == 0)
+					return 0;
+				goto nla_put_failure;
 			}
 		} else
 #endif
@@ -2638,9 +2652,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 		skb->protocol	= htons(ETH_P_IP);
 		skb->dev	= dev;
 		skb->mark	= mark;
-		local_bh_disable();
 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
-		local_bh_enable();
 
 		rt = skb_rtable(skb);
 		if (err == 0 && rt->dst.error)
@@ -2665,7 +2677,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 
 	err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
-			   RTM_NEWROUTE, 0, 0);
+			   RTM_NEWROUTE);
 	if (err < 0)
 		goto errout_free;
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 3e88467d70ee..496b97e17aaf 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -13,13 +13,13 @@
 #include <linux/tcp.h>
 #include <linux/slab.h>
 #include <linux/random.h>
-#include <linux/cryptohash.h>
+#include <linux/siphash.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <net/tcp.h>
 #include <net/route.h>
 
-static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+static siphash_key_t syncookie_secret[2] __read_mostly;
 
 #define COOKIEBITS 24	/* Upper bits store count */
 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -48,24 +48,13 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
 #define TSBITS	6
 #define TSMASK	(((__u32)1 << TSBITS) - 1)
 
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
-
 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 		       u32 count, int c)
 {
-	__u32 *tmp;
-
 	net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
-
-	tmp  = this_cpu_ptr(ipv4_cookie_scratch);
-	memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
-	tmp[0] = (__force u32)saddr;
-	tmp[1] = (__force u32)daddr;
-	tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
-	tmp[3] = count;
-	sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
-
-	return tmp[17];
+	return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+			    (__force u32)sport << 16 | (__force u32)dport,
+			    count, &syncookie_secret[c]);
 }
 
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b2fa498b15d1..d6880a6149ee 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -35,6 +35,8 @@ static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
 static int tcp_adv_win_scale_min = -31;
 static int tcp_adv_win_scale_max = 31;
+static int ip_privileged_port_min;
+static int ip_privileged_port_max = 65535;
 static int ip_ttl_min = 1;
 static int ip_ttl_max = 255;
 static int tcp_syn_retries_min = 1;
@@ -79,7 +81,12 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
 	if (write && ret == 0) {
-		if (range[1] < range[0])
+		/* Ensure that the upper limit is not smaller than the lower,
+		 * and that the lower does not encroach upon the privileged
+		 * port limit.
+		 */
+		if ((range[1] < range[0]) ||
+		    (range[0] < net->ipv4.sysctl_ip_prot_sock))
 			ret = -EINVAL;
 		else
 			set_local_port_range(net, range);
@@ -88,6 +95,40 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
 	return ret;
 }
 
+/* Validate changes from /proc interface. */
+static int ipv4_privileged_ports(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+	    ipv4.sysctl_ip_prot_sock);
+	int ret;
+	int pports;
+	int range[2];
+	struct ctl_table tmp = {
+		.data = &pports,
+		.maxlen = sizeof(pports),
+		.mode = table->mode,
+		.extra1 = &ip_privileged_port_min,
+		.extra2 = &ip_privileged_port_max,
+	};
+
+	pports = net->ipv4.sysctl_ip_prot_sock;
+
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		inet_get_local_port_range(net, &range[0], &range[1]);
+		/* Ensure that the local port range doesn't overlap with the
+		 * privileged port range.
+		 */
+		if (range[0] < pports)
+			ret = -EINVAL;
+		else
+			net->ipv4.sysctl_ip_prot_sock = pports;
+	}
+
+	return ret;
+}
 
 static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
 {
@@ -290,13 +331,6 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.procname	= "tcp_max_tw_buckets",
-		.data		= &tcp_death_row.sysctl_max_tw_buckets,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
-	{
 		.procname	= "tcp_fastopen",
 		.data		= &sysctl_tcp_fastopen,
 		.maxlen		= sizeof(int),
@@ -310,13 +344,6 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_tcp_fastopen_key,
 	},
 	{
-		.procname	= "tcp_tw_recycle",
-		.data		= &tcp_death_row.sysctl_tw_recycle,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
-	{
 		.procname	= "tcp_abort_on_overflow",
 		.data		= &sysctl_tcp_abort_on_overflow,
 		.maxlen		= sizeof(int),
@@ -338,13 +365,6 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.procname	= "tcp_max_syn_backlog",
-		.data		= &sysctl_max_syn_backlog,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
-	{
 		.procname	= "inet_peer_threshold",
 		.data		= &inet_peer_threshold,
 		.maxlen		= sizeof(int),
@@ -558,13 +578,6 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler   = proc_dointvec
 	},
 	{
-		.procname       = "tcp_thin_dupack",
-		.data           = &sysctl_tcp_thin_dupack,
-		.maxlen         = sizeof(int),
-		.mode           = 0644,
-		.proc_handler   = proc_dointvec
-	},
-	{
 		.procname	= "tcp_early_retrans",
 		.data		= &sysctl_tcp_early_retrans,
 		.maxlen		= sizeof(int),
@@ -960,6 +973,27 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_max_tw_buckets",
+		.data		= &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_tw_recycle",
+		.data		= &init_net.ipv4.tcp_death_row.sysctl_tw_recycle,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_max_syn_backlog",
+		.data		= &init_net.ipv4.sysctl_max_syn_backlog,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	{
 		.procname	= "fib_multipath_use_neigh",
@@ -971,6 +1005,24 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra2		= &one,
 	},
 #endif
+	{
+		.procname	= "ip_unprivileged_port_start",
+		.maxlen		= sizeof(int),
+		.data		= &init_net.ipv4.sysctl_ip_prot_sock,
+		.mode		= 0644,
+		.proc_handler	= ipv4_privileged_ports,
+	},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	{
+		.procname	= "udp_l3mdev_accept",
+		.data		= &init_net.ipv4.sysctl_udp_l3mdev_accept,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4a044964da66..da385ae997a3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk)
 	tp->mss_cache = TCP_MSS_DEFAULT;
 
 	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
-	tcp_enable_early_retrans(tp);
 	tcp_assign_congestion_control(sk);
 
 	tp->tsoffset = 0;
@@ -421,15 +420,13 @@ void tcp_init_sock(struct sock *sk)
 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
-	local_bh_disable();
 	sk_sockets_allocated_inc(sk);
-	local_bh_enable();
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
 static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
 {
-	if (tsflags) {
+	if (tsflags && skb) {
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
@@ -536,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 		if (tp->urg_data & TCP_URG_VALID)
 			mask |= POLLPRI;
+	} else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+		/* Active TCP fastopen socket with defer_connect
+		 * Return POLLOUT so application can call write()
+		 * in order for kernel to generate SYN+data
+		 */
+		mask |= POLLOUT | POLLWRNORM;
 	}
 	/* This barrier is coupled with smp_wmb() in tcp_reset() */
 	smp_rmb();
@@ -770,6 +773,12 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 				ret = -EAGAIN;
 				break;
 			}
+			/* if __tcp_splice_read() got nothing while we have
+			 * an skb in receive queue, we do not want to loop.
+			 * This might happen with URG data.
+			 */
+			if (!skb_queue_empty(&sk->sk_receive_queue))
+				break;
 			sk_wait_data(sk, &timeo, NULL);
 			if (signal_pending(current)) {
 				ret = sock_intr_errno(timeo);
@@ -958,10 +967,8 @@ new_segment:
 		copied += copy;
 		offset += copy;
 		size -= copy;
-		if (!size) {
-			tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
+		if (!size)
 			goto out;
-		}
 
 		if (skb->len < size_goal || (flags & MSG_OOB))
 			continue;
@@ -987,8 +994,11 @@ wait_for_memory:
 	}
 
 out:
-	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
-		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+	if (copied) {
+		tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+		if (!(flags & MSG_SENDPAGE_NOTLAST))
+			tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+	}
 	return copied;
 
 do_error:
@@ -1073,6 +1083,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 				int *copied, size_t size)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
 	int err, flags;
 
 	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
@@ -1087,9 +1098,19 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 	tp->fastopen_req->data = msg;
 	tp->fastopen_req->size = size;
 
+	if (inet->defer_connect) {
+		err = tcp_connect(sk);
+		/* Same failure procedure as in tcp_v4/6_connect */
+		if (err) {
+			tcp_set_state(sk, TCP_CLOSE);
+			inet->inet_dport = 0;
+			sk->sk_route_caps = 0;
+		}
+	}
 	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
 	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
-				    msg->msg_namelen, flags);
+				    msg->msg_namelen, flags, 1);
+	inet->defer_connect = 0;
 	*copied = tp->fastopen_req->copied;
 	tcp_free_fastopen_req(tp);
 	return err;
@@ -1109,7 +1130,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	lock_sock(sk);
 
 	flags = msg->msg_flags;
-	if (flags & MSG_FASTOPEN) {
+	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
 		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
 		if (err == -EINPROGRESS && copied_syn > 0)
 			goto out;
@@ -1267,7 +1288,7 @@ new_segment:
 			} else {
 				skb_fill_page_desc(skb, i, pfrag->page,
 						   pfrag->offset, copy);
-				get_page(pfrag->page);
+				page_ref_inc(pfrag->page);
 			}
 			pfrag->offset += copy;
 		}
@@ -1281,7 +1302,6 @@ new_segment:
 
 		copied += copy;
 		if (!msg_data_left(msg)) {
-			tcp_tx_timestamp(sk, sockc.tsflags, skb);
 			if (unlikely(flags & MSG_EOR))
 				TCP_SKB_CB(skb)->eor = 1;
 			goto out;
@@ -1312,8 +1332,10 @@ wait_for_memory:
 	}
 
 out:
-	if (copied)
+	if (copied) {
+		tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
 		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+	}
 out_nopush:
 	release_sock(sk);
 	return copied + copied_syn;
@@ -2473,11 +2495,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_THIN_DUPACK:
 		if (val < 0 || val > 1)
 			err = -EINVAL;
-		else {
-			tp->thin_dupack = val;
-			if (tp->thin_dupack)
-				tcp_disable_early_retrans(tp);
-		}
 		break;
 
 	case TCP_REPAIR:
@@ -2662,6 +2679,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EINVAL;
 		}
 		break;
+	case TCP_FASTOPEN_CONNECT:
+		if (val > 1 || val < 0) {
+			err = -EINVAL;
+		} else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+			if (sk->sk_state == TCP_CLOSE)
+				tp->fastopen_connect = val;
+			else
+				err = -EINVAL;
+		} else {
+			err = -EOPNOTSUPP;
+		}
+		break;
 	case TCP_TIMESTAMP:
 		if (!tp->repair)
 			err = -EPERM;
@@ -2764,6 +2793,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_sacked = sk->sk_max_ack_backlog;
 		return;
 	}
+
+	slow = lock_sock_fast(sk);
+
 	info->tcpi_ca_state = icsk->icsk_ca_state;
 	info->tcpi_retransmits = icsk->icsk_retransmits;
 	info->tcpi_probes = icsk->icsk_probes_out;
@@ -2814,15 +2846,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_total_retrans = tp->total_retrans;
 
-	slow = lock_sock_fast(sk);
-
 	info->tcpi_bytes_acked = tp->bytes_acked;
 	info->tcpi_bytes_received = tp->bytes_received;
 	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
 	tcp_get_info_chrono_stats(tp, info);
 
-	unlock_sock_fast(sk, slow);
-
 	info->tcpi_segs_out = tp->segs_out;
 	info->tcpi_segs_in = tp->segs_in;
 
@@ -2838,6 +2866,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		do_div(rate64, intv);
 		info->tcpi_delivery_rate = rate64;
 	}
+	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
@@ -2847,7 +2876,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	struct sk_buff *stats;
 	struct tcp_info info;
 
-	stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+	stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
 	if (!stats)
 		return NULL;
 
@@ -2858,6 +2887,10 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 			  info.tcpi_rwnd_limited, TCP_NLA_PAD);
 	nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
 			  info.tcpi_sndbuf_limited, TCP_NLA_PAD);
+	nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
+			  tp->data_segs_out, TCP_NLA_PAD);
+	nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
+			  tp->total_retrans, TCP_NLA_PAD);
 	return stats;
 }
 
@@ -2967,8 +3000,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_THIN_LINEAR_TIMEOUTS:
 		val = tp->thin_lto;
 		break;
+
 	case TCP_THIN_DUPACK:
-		val = tp->thin_dupack;
+		val = 0;
 		break;
 
 	case TCP_REPAIR:
@@ -3021,6 +3055,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = icsk->icsk_accept_queue.fastopenq.max_qlen;
 		break;
 
+	case TCP_FASTOPEN_CONNECT:
+		val = tp->fastopen_connect;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
@@ -3334,6 +3372,7 @@ void __init tcp_init(void)
 
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+	inet_hashinfo_init(&tcp_hashinfo);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
@@ -3377,10 +3416,7 @@ void __init tcp_init(void)
 
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
-
-	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
 	sysctl_tcp_max_orphans = cnt / 2;
-	sysctl_max_syn_backlog = max(128, cnt / 256);
 
 	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
@@ -3399,6 +3435,7 @@ void __init tcp_init(void)
 	pr_info("Hash tables configured (established %u bind %u)\n",
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
+	tcp_v4_init();
 	tcp_metrics_init();
 	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
 	tcp_tasklet_init();
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index dd2560c83a85..8ea4e9787f82 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -326,3 +326,57 @@ fastopen:
 	*foc = valid_foc;
 	return NULL;
 }
+
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+			       struct tcp_fastopen_cookie *cookie)
+{
+	unsigned long last_syn_loss = 0;
+	int syn_loss = 0;
+
+	tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
+
+	/* Recurring FO SYN losses: no cookie or data in SYN */
+	if (syn_loss > 1 &&
+	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+		cookie->len = -1;
+		return false;
+	}
+	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+		cookie->len = -1;
+		return true;
+	}
+	return cookie->len > 0;
+}
+
+/* This function checks if we want to defer sending SYN until the first
+ * write().  We defer under the following conditions:
+ * 1. fastopen_connect sockopt is set
+ * 2. we have a valid cookie
+ * Return value: return true if we want to defer until application writes data
+ *               return false if we want to send out SYN immediately
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+	struct tcp_fastopen_cookie cookie = { .len = 0 };
+	struct tcp_sock *tp = tcp_sk(sk);
+	u16 mss;
+
+	if (tp->fastopen_connect && !tp->fastopen_req) {
+		if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+			inet_sk(sk)->defer_connect = 1;
+			return true;
+		}
+
+		/* Alloc fastopen_req in order for FO option to be included
+		 * in SYN
+		 */
+		tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
+					   sk->sk_allocation);
+		if (tp->fastopen_req)
+			tp->fastopen_req->cookie = cookie;
+		else
+			*err = -ENOBUFS;
+	}
+	return false;
+}
+EXPORT_SYMBOL(tcp_fastopen_defer_connect);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 41dcbd568cbe..2c0ff327b6df 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,7 +79,7 @@
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
-int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly;
 int sysctl_tcp_max_reordering __read_mostly = 300;
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
@@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
-
-int sysctl_tcp_thin_dupack __read_mostly;
-
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
@@ -904,8 +901,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 		tcp_disable_fack(tp);
 	}
 
-	if (metric > 0)
-		tcp_disable_early_retrans(tp);
 	tp->rack.reord = 1;
 }
 
@@ -916,10 +911,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 	    before(TCP_SKB_CB(skb)->seq,
 		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
 		tp->retransmit_skb_hint = skb;
-
-	if (!tp->lost_out ||
-	    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
-		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 }
 
 /* Sum the number of packets on the wire we have marked as lost.
@@ -1135,6 +1126,7 @@ struct tcp_sacktag_state {
 	 */
 	struct skb_mstamp first_sackt;
 	struct skb_mstamp last_sackt;
+	struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
 	struct rate_sample *rate;
 	int	flag;
 };
@@ -1217,7 +1209,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		return sacked;
 
 	if (!(sacked & TCPCB_SACKED_ACKED)) {
-		tcp_rack_advance(tp, xmit_time, sacked);
+		tcp_rack_advance(tp, sacked, end_seq,
+				 xmit_time, &state->ack_time);
 
 		if (sacked & TCPCB_SACKED_RETRANS) {
 			/* If the segment is not tagged as lost,
@@ -1937,7 +1930,6 @@ void tcp_enter_loss(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
 	struct sk_buff *skb;
-	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
 	bool is_reneg;			/* is receiver reneging on SACKs? */
 	bool mark_lost;
 
@@ -1982,7 +1974,6 @@ void tcp_enter_loss(struct sock *sk)
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
-			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 		}
 	}
 	tcp_verify_left_out(tp);
@@ -1998,13 +1989,15 @@ void tcp_enter_loss(struct sock *sk)
 	tp->high_seq = tp->snd_nxt;
 	tcp_ecn_queue_cwr(tp);
 
-	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
-	 * loss recovery is underway except recurring timeout(s) on
-	 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+	/* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
+	 * if a previous recovery is underway, otherwise it may incorrectly
+	 * call a timeout spurious if some previously retransmitted packets
+	 * are s/acked (sec 3.2). We do not apply that retriction since
+	 * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
+	 * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
+	 * on PTMU discovery to avoid sending new data.
 	 */
-	tp->frto = sysctl_tcp_frto &&
-		   (new_recovery || icsk->icsk_retransmits) &&
-		   !inet_csk(sk)->icsk_mtup.probe_size;
+	tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size;
 }
 
 /* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2056,30 +2049,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
 	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
 }
 
-static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned long delay;
-
-	/* Delay early retransmit and entering fast recovery for
-	 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
-	 * available, or RTO is scheduled to fire first.
-	 */
-	if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
-	    (flag & FLAG_ECE) || !tp->srtt_us)
-		return false;
-
-	delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
-		    msecs_to_jiffies(2));
-
-	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
-		return false;
-
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
-				  TCP_RTO_MAX);
-	return true;
-}
-
 /* Linux NewReno/SACK/FACK/ECN state machine.
  * --------------------------------------
  *
@@ -2127,10 +2096,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
  *	F.e. after RTO, when all the queue is considered as lost,
  *	lost_out = packets_out and in_flight = retrans_out.
  *
- *		Essentially, we have now two algorithms counting
+ *		Essentially, we have now a few algorithms detecting
  *		lost packets.
  *
- *		FACK: It is the simplest heuristics. As soon as we decided
+ *		If the receiver supports SACK:
+ *
+ *		RFC6675/3517: It is the conventional algorithm. A packet is
+ *		considered lost if the number of higher sequence packets
+ *		SACKed is greater than or equal the DUPACK thoreshold
+ *		(reordering). This is implemented in tcp_mark_head_lost and
+ *		tcp_update_scoreboard.
+ *
+ *		RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ *		(2017-) that checks timing instead of counting DUPACKs.
+ *		Essentially a packet is considered lost if it's not S/ACKed
+ *		after RTT + reordering_window, where both metrics are
+ *		dynamically measured and adjusted. This is implemented in
+ *		tcp_rack_mark_lost.
+ *
+ *		FACK (Disabled by default. Subsumbed by RACK):
+ *		It is the simplest heuristics. As soon as we decided
  *		that something is lost, we decide that _all_ not SACKed
  *		packets until the most forward SACK are lost. I.e.
  *		lost_out = fackets_out - sacked_out and left_out = fackets_out.
@@ -2139,16 +2124,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
  *		takes place. We use FACK by default until reordering
  *		is suspected on the path to this destination.
  *
- *		NewReno: when Recovery is entered, we assume that one segment
+ *		If the receiver does not support SACK:
+ *
+ *		NewReno (RFC6582): in Recovery we assume that one segment
  *		is lost (classic Reno). While we are in Recovery and
  *		a partial ACK arrives, we assume that one more packet
  *		is lost (NewReno). This heuristics are the same in NewReno
  *		and SACK.
  *
- *  Imagine, that's all! Forget about all this shamanism about CWND inflation
- *  deflation etc. CWND is real congestion window, never inflated, changes
- *  only according to classic VJ rules.
- *
  * Really tricky (and requiring careful tuning) part of algorithm
  * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
  * The first determines the moment _when_ we should reduce CWND and,
@@ -2176,8 +2159,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
 static bool tcp_time_to_recover(struct sock *sk, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	__u32 packets_out;
-	int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 
 	/* Trick#1: The loss is proven. */
 	if (tp->lost_out)
@@ -2187,39 +2168,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 	if (tcp_dupack_heuristics(tp) > tp->reordering)
 		return true;
 
-	/* Trick#4: It is still not OK... But will it be useful to delay
-	 * recovery more?
-	 */
-	packets_out = tp->packets_out;
-	if (packets_out <= tp->reordering &&
-	    tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
-	    !tcp_may_send_now(sk)) {
-		/* We have nothing to send. This connection is limited
-		 * either by receiver window or by application.
-		 */
-		return true;
-	}
-
-	/* If a thin stream is detected, retransmit after first
-	 * received dupack. Employ only if SACK is supported in order
-	 * to avoid possible corner-case series of spurious retransmissions
-	 * Use only if there are no unsent data.
-	 */
-	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
-	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
-	    tcp_is_sack(tp) && !tcp_send_head(sk))
-		return true;
-
-	/* Trick#6: TCP early retransmit, per RFC5827.  To avoid spurious
-	 * retransmissions due to small network reorderings, we implement
-	 * Mitigation A.3 in the RFC and delay the retransmission for a short
-	 * interval if appropriate.
-	 */
-	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
-	    (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
-	    !tcp_may_send_now(sk))
-		return !tcp_pause_early_retransmit(sk, flag);
-
 	return false;
 }
 
@@ -2521,8 +2469,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
 	tcp_ecn_queue_cwr(tp);
 }
 
-static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
-			       int flag)
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int sndcnt = 0;
@@ -2690,7 +2637,7 @@ void tcp_simple_retransmit(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_simple_retransmit);
 
-static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int mib_idx;
@@ -2726,14 +2673,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
 	    tcp_try_undo_loss(sk, false))
 		return;
 
-	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
-		/* Step 3.b. A timeout is spurious if not all data are
-		 * lost, i.e., never-retransmitted data are (s)acked.
-		 */
-		if ((flag & FLAG_ORIG_SACK_ACKED) &&
-		    tcp_try_undo_loss(sk, true))
-			return;
+	/* The ACK (s)acks some never-retransmitted data meaning not all
+	 * the data packets before the timeout were lost. Therefore we
+	 * undo the congestion window and state. This is essentially
+	 * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
+	 * a retransmitted skb is permantly marked, we can apply such an
+	 * operation even if F-RTO was not used.
+	 */
+	if ((flag & FLAG_ORIG_SACK_ACKED) &&
+	    tcp_try_undo_loss(sk, tp->undo_marker))
+		return;
 
+	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
 		if (after(tp->snd_nxt, tp->high_seq)) {
 			if (flag & FLAG_DATA_SACKED || is_dupack)
 				tp->frto = 0; /* Step 3.a. loss was real */
@@ -2800,6 +2751,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
 	return false;
 }
 
+static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
+				   const struct skb_mstamp *ack_time)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Use RACK to detect loss */
+	if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+		u32 prior_retrans = tp->retrans_out;
+
+		tcp_rack_mark_lost(sk, ack_time);
+		if (prior_retrans > tp->retrans_out)
+			*ack_flag |= FLAG_LOST_RETRANS;
+	}
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2813,7 +2779,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
  * tcp_xmit_retransmit_queue().
  */
 static void tcp_fastretrans_alert(struct sock *sk, const int acked,
-				  bool is_dupack, int *ack_flag, int *rexmit)
+				  bool is_dupack, int *ack_flag, int *rexmit,
+				  const struct skb_mstamp *ack_time)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2864,13 +2831,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		}
 	}
 
-	/* Use RACK to detect loss */
-	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
-	    tcp_rack_mark_lost(sk)) {
-		flag |= FLAG_LOST_RETRANS;
-		*ack_flag |= FLAG_LOST_RETRANS;
-	}
-
 	/* E. Process state. */
 	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
@@ -2888,11 +2848,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 			tcp_try_keep_open(sk);
 			return;
 		}
+		tcp_rack_identify_loss(sk, ack_flag, ack_time);
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, is_dupack, rexmit);
-		if (icsk->icsk_ca_state != TCP_CA_Open &&
-		    !(flag & FLAG_LOST_RETRANS))
+		tcp_rack_identify_loss(sk, ack_flag, ack_time);
+		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
+		      (*ack_flag & FLAG_LOST_RETRANS)))
 			return;
 		/* Change state if cwnd is undone or retransmits are lost */
 	default:
@@ -2906,6 +2868,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk);
 
+		tcp_rack_identify_loss(sk, ack_flag, ack_time);
 		if (!tcp_time_to_recover(sk, flag)) {
 			tcp_try_to_open(sk, flag);
 			return;
@@ -3024,7 +2987,7 @@ void tcp_rearm_rto(struct sock *sk)
 	} else {
 		u32 rto = inet_csk(sk)->icsk_rto;
 		/* Offset the time elapsed after installing regular RTO */
-		if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+		if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 		    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 			struct sk_buff *skb = tcp_write_queue_head(sk);
 			const u32 rto_time_stamp =
@@ -3041,24 +3004,6 @@ void tcp_rearm_rto(struct sock *sk)
 	}
 }
 
-/* This function is called when the delayed ER timer fires. TCP enters
- * fast recovery and performs fast-retransmit.
- */
-void tcp_resume_early_retransmit(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tcp_rearm_rto(sk);
-
-	/* Stop if ER is disabled after the delayed ER timer is scheduled */
-	if (!tp->do_early_retrans)
-		return;
-
-	tcp_enter_recovery(sk, false);
-	tcp_update_scoreboard(sk, 1);
-	tcp_xmit_retransmit_queue(sk);
-}
-
 /* If we get here, the whole TSO packet has not been acked. */
 static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
@@ -3101,11 +3046,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			       u32 prior_snd_una, int *acked,
-			       struct tcp_sacktag_state *sack,
-			       struct skb_mstamp *now)
+			       struct tcp_sacktag_state *sack)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct skb_mstamp first_ackt, last_ackt;
+	struct skb_mstamp *now = &sack->ack_time;
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 prior_sacked = tp->sacked_out;
 	u32 reord = tp->packets_out;
@@ -3165,7 +3110,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		} else if (tcp_is_sack(tp)) {
 			tp->delivered += acked_pcount;
 			if (!tcp_skb_spurious_retrans(tp, skb))
-				tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+				tcp_rack_advance(tp, sacked, scb->end_seq,
+						 &skb->skb_mstamp,
+						 &sack->ack_time);
 		}
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
@@ -3595,7 +3542,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	u32 lost = tp->lost;
 	int acked = 0; /* Number of packets newly acked */
 	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
-	struct skb_mstamp now;
 
 	sack_state.first_sackt.v64 = 0;
 	sack_state.rate = &rs;
@@ -3621,10 +3567,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
-	skb_mstamp_get(&now);
+	skb_mstamp_get(&sack_state.ack_time);
 
-	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+	if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
 
 	if (after(ack, prior_snd_una)) {
@@ -3689,34 +3634,34 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
-				    &sack_state, &now);
+				    &sack_state);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 	}
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
-	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
-		struct dst_entry *dst = __sk_dst_get(sk);
-		if (dst)
-			dst_confirm(dst);
-	}
+	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
+		sk_dst_confirm(sk);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
-	tcp_rate_gen(sk, delivered, lost, &now, &rs);
-	tcp_cong_control(sk, ack, delivered, flag, &rs);
+	tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
+		     sack_state.rate);
+	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
 	tcp_xmit_recovery(sk, rexmit);
 	return 1;
 
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
 	if (flag & FLAG_DSACKING_ACK)
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 	/* If this ack opens up a zero window, clear backoff.  It was
 	 * being used to time the probes, and is probably far higher than
 	 * it needs to be for normal retransmission.
@@ -3737,9 +3682,11 @@ old_ack:
 	 * If data was DSACKed, see if we can undo a cwnd reduction.
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
+		skb_mstamp_get(&sack_state.ack_time);
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 						&sack_state);
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 		tcp_xmit_recovery(sk, rexmit);
 	}
 
@@ -4557,6 +4504,7 @@ add_sack:
 end:
 	if (skb) {
 		tcp_grow_window(sk, skb);
+		skb_condense(skb);
 		skb_set_owner_r(skb, sk);
 	}
 }
@@ -5249,6 +5197,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 	return err;
 }
 
+/* Accept RST for rcv_nxt - 1 after a FIN.
+ * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
+ * FIN is sent followed by a RST packet. The RST is sent with the same
+ * sequence number as the FIN, and thus according to RFC 5961 a challenge
+ * ACK should be sent. However, Mac OSX rate limits replies to challenge
+ * ACKs on the closed socket. In addition middleboxes can drop either the
+ * challenge ACK or a subsequent RST.
+ */
+static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
+			(1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
+					       TCPF_CLOSING));
+}
+
 /* Does PAWS and seqno based validation of an incoming segment, flags will
  * play significant role here.
  */
@@ -5287,20 +5252,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 						  LINUX_MIB_TCPACKSKIPPEDSEQ,
 						  &tp->last_oow_ack_time))
 				tcp_send_dupack(sk, skb);
+		} else if (tcp_reset_check(sk, skb)) {
+			tcp_reset(sk);
 		}
 		goto discard;
 	}
 
 	/* Step 2: check RST bit */
 	if (th->rst) {
-		/* RFC 5961 3.2 (extend to match against SACK too if available):
-		 * If seq num matches RCV.NXT or the right-most SACK block,
+		/* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
+		 * FIN and SACK too if available):
+		 * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
+		 * the right-most SACK block,
 		 * then
 		 *     RESET the connection
 		 * else
 		 *     Send a challenge ACK
 		 */
-		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
+		    tcp_reset_check(sk, skb)) {
 			rst_seq_match = true;
 		} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
 			struct tcp_sack_block *sp = &tp->selective_acks[0];
@@ -6022,7 +5992,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		break;
 
 	case TCP_FIN_WAIT1: {
-		struct dst_entry *dst;
 		int tmo;
 
 		/* If we enter the TCP_FIN_WAIT1 state and we are a
@@ -6049,9 +6018,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		tcp_set_state(sk, TCP_FIN_WAIT2);
 		sk->sk_shutdown |= SEND_SHUTDOWN;
 
-		dst = __sk_dst_get(sk);
-		if (dst)
-			dst_confirm(dst);
+		sk_dst_confirm(sk);
 
 		if (!sock_flag(sk, SOCK_DEAD)) {
 			/* Wake up lingering close() */
@@ -6363,7 +6330,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		 * timewait bucket, so that all the necessary checks
 		 * are made in the function processing timewait state.
 		 */
-		if (tcp_death_row.sysctl_tw_recycle) {
+		if (net->ipv4.tcp_death_row.sysctl_tw_recycle) {
 			bool strict;
 
 			dst = af_ops->route_req(sk, &fl, req, &strict);
@@ -6377,8 +6344,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		}
 		/* Kill the following clause, if you dislike this way. */
 		else if (!net->ipv4.sysctl_tcp_syncookies &&
-			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
-			  (sysctl_max_syn_backlog >> 2)) &&
+			 (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+			  (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
 			 !tcp_peer_is_proven(req, dst, false,
 					     tmp_opt.saw_tstamp)) {
 			/* Without syncookies last quarter of
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fe9da4fb96bf..9a89b8deafae 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -145,7 +145,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	struct flowi4 *fl4;
 	struct rtable *rt;
 	int err;
+	u32 seq;
 	struct ip_options_rcu *inet_opt;
+	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
 	if (addr_len < sizeof(struct sockaddr_in))
 		return -EINVAL;
@@ -196,7 +198,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 			tp->write_seq	   = 0;
 	}
 
-	if (tcp_death_row.sysctl_tw_recycle &&
+	if (tcp_death_row->sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 		tcp_fetch_timewait_stamp(sk, &rt->dst);
 
@@ -215,7 +217,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	 * complete initialization after this.
 	 */
 	tcp_set_state(sk, TCP_SYN_SENT);
-	err = inet_hash_connect(&tcp_death_row, sk);
+	err = inet_hash_connect(tcp_death_row, sk);
 	if (err)
 		goto failure;
 
@@ -231,19 +233,27 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	/* OK, now commit destination to socket.  */
 	sk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(sk, &rt->dst);
+	rt = NULL;
 
-	if (!tp->write_seq && likely(!tp->repair))
-		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
-							   inet->inet_daddr,
-							   inet->inet_sport,
-							   usin->sin_port,
-							   &tp->tsoffset);
+	if (likely(!tp->repair)) {
+		seq = secure_tcp_sequence_number(inet->inet_saddr,
+						 inet->inet_daddr,
+						 inet->inet_sport,
+						 usin->sin_port,
+						 &tp->tsoffset);
+		if (!tp->write_seq)
+			tp->write_seq = seq;
+	}
 
 	inet->inet_id = tp->write_seq ^ jiffies;
 
+	if (tcp_fastopen_defer_connect(sk, &err))
+		return err;
+	if (err)
+		goto failure;
+
 	err = tcp_connect(sk);
 
-	rt = NULL;
 	if (err)
 		goto failure;
 
@@ -1318,10 +1328,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 	tcp_ca_openreq_child(newsk, dst);
 
 	tcp_sync_mss(newsk, dst_mtu(dst));
-	newtp->advmss = dst_metric_advmss(dst);
-	if (tcp_sk(sk)->rx_opt.user_mss &&
-	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
-		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(newsk);
 
@@ -1555,8 +1562,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 	 * It has been noticed pure SACK packets were sometimes dropped
 	 * (if cooked by drivers without copybreak feature).
 	 */
-	if (!skb->data_len)
-		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+	skb_condense(skb);
 
 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
 		bh_unlock_sock(sk);
@@ -1816,7 +1822,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
 	.getsockopt	   = ip_getsockopt,
 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in),
-	.bind_conflict	   = inet_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ip_setsockopt,
 	.compat_getsockopt = compat_ip_getsockopt,
@@ -1887,9 +1892,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	tcp_free_fastopen_req(tp);
 	tcp_saved_syn_free(tp);
 
-	local_bh_disable();
 	sk_sockets_allocated_dec(sk);
-	local_bh_enable();
 }
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
 
@@ -2228,7 +2231,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 	int state;
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active	= 1;
 		timer_expires	= icsk->icsk_timeout;
@@ -2375,6 +2378,7 @@ struct proto tcp_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
@@ -2418,7 +2422,7 @@ static void __net_exit tcp_sk_exit(struct net *net)
 
 static int __net_init tcp_sk_init(struct net *net)
 {
-	int res, cpu;
+	int res, cpu, cnt;
 
 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
 	if (!net->ipv4.tcp_sk)
@@ -2457,6 +2461,13 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 	net->ipv4.sysctl_tcp_tw_reuse = 0;
 
+	cnt = tcp_hashinfo.ehash_mask + 1;
+	net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
+	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
+	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
+
+	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
+
 	return 0;
 fail:
 	tcp_sk_exit(net);
@@ -2466,7 +2477,7 @@ fail:
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
 {
-	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
+	inet_twsk_purge(&tcp_hashinfo, AF_INET);
 }
 
 static struct pernet_operations __net_initdata tcp_sk_ops = {
@@ -2477,7 +2488,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
 
 void __init tcp_v4_init(void)
 {
-	inet_hashinfo_init(&tcp_hashinfo);
 	if (register_pernet_subsys(&tcp_sk_ops))
 		panic("Failed to create the TCP control socket.\n");
 }
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index ba8f02d0f283..0f46e5fe31ad 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk)
 	u32 val;
 	int m;
 
+	sk_dst_confirm(sk);
 	if (sysctl_tcp_nometrics_save || !dst)
 		return;
 
-	if (dst->flags & DST_HOST)
-		dst_confirm(dst);
-
 	rcu_read_lock();
 	if (icsk->icsk_backoff || !tp->srtt_us) {
 		/* This session failed to estimate rtt. Why?
@@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk)
 	struct tcp_metrics_block *tm;
 	u32 val, crtt = 0; /* cached RTT scaled by 8 */
 
+	sk_dst_confirm(sk);
 	if (!dst)
 		goto reset;
 
-	dst_confirm(dst);
-
 	rcu_read_lock();
 	tm = tcp_get_metrics(sk, dst, true);
 	if (!tm) {
@@ -522,7 +519,6 @@ void tcp_init_metrics(struct sock *sk)
 	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
 	if (val && tp->reordering != val) {
 		tcp_disable_fack(tp);
-		tcp_disable_early_retrans(tp);
 		tp->reordering = val;
 	}
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 28ce5ee831f5..7e16243cdb58 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -29,12 +29,6 @@
 
 int sysctl_tcp_abort_on_overflow __read_mostly;
 
-struct inet_timewait_death_row tcp_death_row = {
-	.sysctl_max_tw_buckets = NR_FILE * 2,
-	.hashinfo	= &tcp_hashinfo,
-};
-EXPORT_SYMBOL_GPL(tcp_death_row);
-
 static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 {
 	if (seq == s_win)
@@ -100,13 +94,15 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 	struct tcp_options_received tmp_opt;
 	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 	bool paws_reject = false;
+	struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row;
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
 		tcp_parse_options(skb, &tmp_opt, 0, NULL);
 
 		if (tmp_opt.saw_tstamp) {
-			tmp_opt.rcv_tsecr	-= tcptw->tw_ts_offset;
+			if (tmp_opt.rcv_tsecr)
+				tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
 			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
 			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
@@ -153,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
 		}
 
-		if (tcp_death_row.sysctl_tw_recycle &&
+		if (tcp_death_row->sysctl_tw_recycle &&
 		    tcptw->tw_ts_recent_stamp &&
 		    tcp_tw_remember_stamp(tw))
 			inet_twsk_reschedule(tw, tw->tw_timeout);
@@ -264,11 +260,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_timewait_sock *tw;
 	bool recycle_ok = false;
+	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
-	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
+	if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = tcp_remember_stamp(sk);
 
-	tw = inet_twsk_alloc(sk, &tcp_death_row, state);
+	tw = inet_twsk_alloc(sk, tcp_death_row, state);
 
 	if (tw) {
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
@@ -364,15 +361,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	const struct tcp_sock *tp = tcp_sk(sk_listener);
-	u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
 	int full_space = tcp_full_space(sk_listener);
-	int mss = dst_metric_advmss(dst);
 	u32 window_clamp;
 	__u8 rcv_wscale;
+	int mss;
 
-	if (user_mss && user_mss < mss)
-		mss = user_mss;
-
+	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 	window_clamp = READ_ONCE(tp->window_clamp);
 	/* Set this up on the first call only */
 	req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
@@ -472,7 +466,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->sacked_out = 0;
 		newtp->fackets_out = 0;
 		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-		tcp_enable_early_retrans(newtp);
 		newtp->tlp_high_seq = 0;
 		newtp->lsndtime = treq->snt_synack.stamp_jiffies;
 		newsk->sk_txhash = treq->txhash;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8ce50dc3ab8c..22548b5f05cb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -76,16 +76,15 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 
 	tp->packets_out += tcp_skb_pcount(skb);
-	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
-	}
 
 	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
 		      tcp_skb_pcount(skb));
 }
 
-/* SND.NXT, if window was not shrunk.
+/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
+ * window scaling factor due to loss of precision.
  * If window has been shrunk, what should we make? It is not clear at all.
  * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
  * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
@@ -95,7 +94,9 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
-	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
+	if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
+	    (tp->rx_opt.wscale_ok &&
+	     ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
 		return tp->snd_nxt;
 	else
 		return tcp_wnd_end(tp);
@@ -966,6 +967,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	 */
 	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
 
+	/* If we had to use memory reserve to allocate this skb,
+	 * this might cause drops if packet is looped back :
+	 * Other socket might not have SOCK_MEMALLOC.
+	 * Packets not looped back do not care about pfmemalloc.
+	 */
+	skb->pfmemalloc = 0;
+
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
 
@@ -975,6 +983,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	skb_set_hash_from_sk(skb, sk);
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
+	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+
 	/* Build TCP header and checksum it. */
 	th = (struct tcphdr *)skb->data;
 	th->source		= inet->inet_sport;
@@ -2289,8 +2299,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	u32 timeout, tlp_time_stamp, rto_time_stamp;
 	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
 
-	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
-		return false;
 	/* No consecutive loss probes. */
 	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
 		tcp_rearm_rto(sk);
@@ -2309,8 +2317,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
 	 * in Open state, that are either limited by cwnd or application.
 	 */
-	if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
-	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+	if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
+	    !tp->packets_out || !tcp_is_sack(tp) ||
+	    icsk->icsk_ca_state != TCP_CA_Open)
 		return false;
 
 	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
@@ -2776,6 +2785,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
 		tcp_ecn_clear_syn(sk, skb);
 
+	/* Update global and local TCP statistics. */
+	segs = tcp_skb_pcount(skb);
+	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+	tp->total_retrans += segs;
+
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
 	 * beyond what csum_start can cover.
@@ -2793,14 +2809,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	}
 
 	if (likely(!err)) {
-		segs = tcp_skb_pcount(skb);
-
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
-		/* Update global TCP statistics. */
-		TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
-		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
-			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-		tp->total_retrans += segs;
+	} else if (err != -EBUSY) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
 	return err;
 }
@@ -2823,8 +2834,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		if (!tp->retrans_stamp)
 			tp->retrans_stamp = tcp_skb_timestamp(skb);
 
-	} else if (err != -EBUSY) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
 
 	if (tp->undo_retrans < 0)
@@ -2833,36 +2842,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	return err;
 }
 
-/* Check if we forward retransmits are possible in the current
- * window/congestion state.
- */
-static bool tcp_can_forward_retransmit(struct sock *sk)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	const struct tcp_sock *tp = tcp_sk(sk);
-
-	/* Forward retransmissions are possible only during Recovery. */
-	if (icsk->icsk_ca_state != TCP_CA_Recovery)
-		return false;
-
-	/* No forward retransmissions in Reno are possible. */
-	if (tcp_is_reno(tp))
-		return false;
-
-	/* Yeah, we have to make difficult choice between forward transmission
-	 * and retransmission... Both ways have their merits...
-	 *
-	 * For now we do not retransmit anything, while we have some new
-	 * segments to send. In the other cases, follow rule 3 for
-	 * NextSeg() specified in RFC3517.
-	 */
-
-	if (tcp_may_send_now(sk))
-		return false;
-
-	return true;
-}
-
 /* This gets called after a retransmit timeout, and the initially
  * retransmitted data is acknowledged.  It tries to continue
  * resending the rest of the retransmit queue, until either
@@ -2877,24 +2856,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	struct sk_buff *hole = NULL;
-	u32 max_segs, last_lost;
+	u32 max_segs;
 	int mib_idx;
-	int fwd_rexmitting = 0;
 
 	if (!tp->packets_out)
 		return;
 
-	if (!tp->lost_out)
-		tp->retransmit_high = tp->snd_una;
-
 	if (tp->retransmit_skb_hint) {
 		skb = tp->retransmit_skb_hint;
-		last_lost = TCP_SKB_CB(skb)->end_seq;
-		if (after(last_lost, tp->retransmit_high))
-			last_lost = tp->retransmit_high;
 	} else {
 		skb = tcp_write_queue_head(sk);
-		last_lost = tp->snd_una;
 	}
 
 	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2917,31 +2888,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		 */
 		segs = min_t(int, segs, max_segs);
 
-		if (fwd_rexmitting) {
-begin_fwd:
-			if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-				break;
-			mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
-
-		} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
-			tp->retransmit_high = last_lost;
-			if (!tcp_can_forward_retransmit(sk))
-				break;
-			/* Backtrack if necessary to non-L'ed skb */
-			if (hole) {
-				skb = hole;
-				hole = NULL;
-			}
-			fwd_rexmitting = 1;
-			goto begin_fwd;
-
+		if (tp->retrans_out >= tp->lost_out) {
+			break;
 		} else if (!(sacked & TCPCB_LOST)) {
 			if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
 				hole = skb;
 			continue;
 
 		} else {
-			last_lost = TCP_SKB_CB(skb)->end_seq;
 			if (icsk->icsk_ca_state != TCP_CA_Loss)
 				mib_idx = LINUX_MIB_TCPFASTRETRANS;
 			else
@@ -2962,7 +2916,8 @@ begin_fwd:
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += tcp_skb_pcount(skb);
 
-		if (skb == tcp_write_queue_head(sk))
+		if (skb == tcp_write_queue_head(sk) &&
+		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 						  inet_csk(sk)->icsk_rto,
 						  TCP_RTO_MAX);
@@ -3119,7 +3074,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	struct sk_buff *skb;
 	int tcp_header_size;
 	struct tcphdr *th;
-	u16 user_mss;
 	int mss;
 
 	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
@@ -3149,10 +3103,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	}
 	skb_dst_set(skb, dst);
 
-	mss = dst_metric_advmss(dst);
-	user_mss = READ_ONCE(tp->rx_opt.user_mss);
-	if (user_mss && user_mss < mss)
-		mss = user_mss;
+	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
@@ -3258,9 +3209,7 @@ static void tcp_connect_init(struct sock *sk)
 
 	if (!tp->window_clamp)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
-	tp->advmss = dst_metric_advmss(dst);
-	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
-		tp->advmss = tp->rx_opt.user_mss;
+	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(sk);
 
@@ -3326,31 +3275,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int syn_loss = 0, space, err = 0;
-	unsigned long last_syn_loss = 0;
+	int space, err = 0;
 	struct sk_buff *syn_data;
 
 	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
-	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
-			       &syn_loss, &last_syn_loss);
-	/* Recurring FO SYN losses: revert to regular handshake temporarily */
-	if (syn_loss > 1 &&
-	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
-		fo->cookie.len = -1;
-		goto fallback;
-	}
-
-	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
-		fo->cookie.len = -1;
-	else if (fo->cookie.len <= 0)
+	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
 		goto fallback;
 
 	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
 	 * user-MSS. Reserve maximum option space for middleboxes that add
 	 * private TCP options. The cost is reduced data space in SYN :(
 	 */
-	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
-		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+
 	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
 		MAX_TCP_OPTION_SPACE;
 
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index e36df4fcfeba..4ecb38ae8504 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,9 +1,32 @@
 #include <linux/tcp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
 
-/* Marks a packet lost, if some packet sent later has been (s)acked.
+static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_skb_mark_lost_uncond_verify(tp, skb);
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+		/* Account for retransmits that are lost again */
+		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+		tp->retrans_out -= tcp_skb_pcount(skb);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
+	}
+}
+
+static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
+				const struct skb_mstamp *t2,
+				u32 seq1, u32 seq2)
+{
+	return skb_mstamp_after(t1, t2) ||
+	       (t1->v64 == t2->v64 && after(seq1, seq2));
+}
+
+/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
+ *
+ * Marks a packet lost, if some packet sent later has been (s)acked.
  * The underlying idea is similar to the traditional dupthresh and FACK
  * but they look at different metrics:
  *
@@ -16,31 +39,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
  * is being more resilient to reordering by simply allowing some
  * "settling delay", instead of tweaking the dupthresh.
  *
- * The current version is only used after recovery starts but can be
- * easily extended to detect the first loss.
+ * When tcp_rack_detect_loss() detects some packets are lost and we
+ * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
+ * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
+ * make us enter the CA_Recovery state.
  */
-int tcp_rack_mark_lost(struct sock *sk)
+static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
+				 u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	u32 reo_wnd, prior_retrans = tp->retrans_out;
-
-	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
-		return 0;
-
-	/* Reset the advanced flag to avoid unnecessary queue scanning */
-	tp->rack.advanced = 0;
+	u32 reo_wnd;
 
+	*reo_timeout = 0;
 	/* To be more reordering resilient, allow min_rtt/4 settling delay
 	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
 	 * RTT because reordering is often a path property and less related
 	 * to queuing or delayed ACKs.
-	 *
-	 * TODO: measure and adapt to the observed reordering delay, and
-	 * use a timer to retransmit like the delayed early retransmit.
 	 */
 	reo_wnd = 1000;
-	if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
 		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
 
 	tcp_for_write_queue(skb, sk) {
@@ -54,20 +72,29 @@ int tcp_rack_mark_lost(struct sock *sk)
 		    scb->sacked & TCPCB_SACKED_ACKED)
 			continue;
 
-		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+		if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp,
+					tp->rack.end_seq, scb->end_seq)) {
+			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
+			 * A packet is lost if its elapsed time is beyond
+			 * the recent RTT plus the reordering window.
+			 */
+			u32 elapsed = skb_mstamp_us_delta(now,
+							  &skb->skb_mstamp);
+			s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
 
-			if (skb_mstamp_us_delta(&tp->rack.mstamp,
-						&skb->skb_mstamp) <= reo_wnd)
+			if (remaining < 0) {
+				tcp_rack_mark_skb_lost(sk, skb);
 				continue;
-
-			/* skb is lost if packet sent later is sacked */
-			tcp_skb_mark_lost_uncond_verify(tp, skb);
-			if (scb->sacked & TCPCB_SACKED_RETRANS) {
-				scb->sacked &= ~TCPCB_SACKED_RETRANS;
-				tp->retrans_out -= tcp_skb_pcount(skb);
-				NET_INC_STATS(sock_net(sk),
-					      LINUX_MIB_TCPLOSTRETRANSMIT);
 			}
+
+			/* Skip ones marked lost but not yet retransmitted */
+			if ((scb->sacked & TCPCB_LOST) &&
+			    !(scb->sacked & TCPCB_SACKED_RETRANS))
+				continue;
+
+			/* Record maximum wait time (+1 to avoid 0) */
+			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
+
 		} else if (!(scb->sacked & TCPCB_RETRANS)) {
 			/* Original data are sent sequentially so stop early
 			 * b/c the rest are all sent after rack_sent
@@ -75,20 +102,43 @@ int tcp_rack_mark_lost(struct sock *sk)
 			break;
 		}
 	}
-	return prior_retrans - tp->retrans_out;
 }
 
-/* Record the most recently (re)sent time among the (s)acked packets */
-void tcp_rack_advance(struct tcp_sock *tp,
-		      const struct skb_mstamp *xmit_time, u8 sacked)
+void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 timeout;
+
+	if (!tp->rack.advanced)
+		return;
+
+	/* Reset the advanced flag to avoid unnecessary queue scanning */
+	tp->rack.advanced = 0;
+	tcp_rack_detect_loss(sk, now, &timeout);
+	if (timeout) {
+		timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
+					  timeout, inet_csk(sk)->icsk_rto);
+	}
+}
+
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
+		      const struct skb_mstamp *xmit_time,
+		      const struct skb_mstamp *ack_time)
 {
+	u32 rtt_us;
+
 	if (tp->rack.mstamp.v64 &&
-	    !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+	    !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp,
+				 end_seq, tp->rack.end_seq))
 		return;
 
+	rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
 	if (sacked & TCPCB_RETRANS) {
-		struct skb_mstamp now;
-
 		/* If the sacked packet was retransmitted, it's ambiguous
 		 * whether the retransmission or the original (or the prior
 		 * retransmission) was sacked.
@@ -99,11 +149,35 @@ void tcp_rack_advance(struct tcp_sock *tp,
 		 * so it's at least one RTT (i.e., retransmission is at least
 		 * an RTT later).
 		 */
-		skb_mstamp_get(&now);
-		if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+		if (rtt_us < tcp_min_rtt(tp))
 			return;
 	}
-
+	tp->rack.rtt_us = rtt_us;
 	tp->rack.mstamp = *xmit_time;
+	tp->rack.end_seq = end_seq;
 	tp->rack.advanced = 1;
 }
+
+/* We have waited long enough to accommodate reordering. Mark the expired
+ * packets lost and retransmit them.
+ */
+void tcp_rack_reo_timeout(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct skb_mstamp now;
+	u32 timeout, prior_inflight;
+
+	skb_mstamp_get(&now);
+	prior_inflight = tcp_packets_in_flight(tp);
+	tcp_rack_detect_loss(sk, &now, &timeout);
+	if (prior_inflight != tcp_packets_in_flight(tp)) {
+		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
+			tcp_enter_recovery(sk, false);
+			if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+				tcp_cwnd_reduction(sk, 1, 0);
+		}
+		tcp_xmit_retransmit_queue(sk);
+	}
+	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
+		tcp_rearm_rto(sk);
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3705075f42c3..40d893556e67 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -563,8 +563,8 @@ void tcp_write_timer_handler(struct sock *sk)
 	event = icsk->icsk_pending;
 
 	switch (event) {
-	case ICSK_TIME_EARLY_RETRANS:
-		tcp_resume_early_retransmit(sk);
+	case ICSK_TIME_REO_TIMEOUT:
+		tcp_rack_reo_timeout(sk);
 		break;
 	case ICSK_TIME_LOSS_PROBE:
 		tcp_send_loss_probe(sk);
@@ -617,6 +617,7 @@ void tcp_set_keepalive(struct sock *sk, int val)
 	else if (!val)
 		inet_csk_delete_keepalive_timer(sk);
 }
+EXPORT_SYMBOL_GPL(tcp_set_keepalive);
 
 
 static void tcp_keepalive_timer (unsigned long data)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1307a7c2e544..ea6e4cff9faf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -134,14 +134,21 @@ EXPORT_SYMBOL(udp_memory_allocated);
 #define MAX_UDP_PORTS 65536
 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
+/* IPCB reference means this can not be used from early demux */
+static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+	if (!net->ipv4.sysctl_udp_l3mdev_accept &&
+	    skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
+		return true;
+#endif
+	return false;
+}
+
 static int udp_lib_lport_inuse(struct net *net, __u16 num,
 			       const struct udp_hslot *hslot,
 			       unsigned long *bitmap,
-			       struct sock *sk,
-			       int (*saddr_comp)(const struct sock *sk1,
-						 const struct sock *sk2,
-						 bool match_wildcard),
-			       unsigned int log)
+			       struct sock *sk, unsigned int log)
 {
 	struct sock *sk2;
 	kuid_t uid = sock_i_uid(sk);
@@ -153,13 +160,18 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
-		    (!sk2->sk_reuseport || !sk->sk_reuseport ||
-		     rcu_access_pointer(sk->sk_reuseport_cb) ||
-		     !uid_eq(uid, sock_i_uid(sk2))) &&
-		    saddr_comp(sk, sk2, true)) {
-			if (!bitmap)
-				return 1;
-			__set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
+		    inet_rcv_saddr_equal(sk, sk2, true)) {
+			if (sk2->sk_reuseport && sk->sk_reuseport &&
+			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
+			    uid_eq(uid, sock_i_uid(sk2))) {
+				if (!bitmap)
+					return 0;
+			} else {
+				if (!bitmap)
+					return 1;
+				__set_bit(udp_sk(sk2)->udp_port_hash >> log,
+					  bitmap);
+			}
 		}
 	}
 	return 0;
@@ -171,10 +183,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
  */
 static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 				struct udp_hslot *hslot2,
-				struct sock *sk,
-				int (*saddr_comp)(const struct sock *sk1,
-						  const struct sock *sk2,
-						  bool match_wildcard))
+				struct sock *sk)
 {
 	struct sock *sk2;
 	kuid_t uid = sock_i_uid(sk);
@@ -188,11 +197,14 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
-		    (!sk2->sk_reuseport || !sk->sk_reuseport ||
-		     rcu_access_pointer(sk->sk_reuseport_cb) ||
-		     !uid_eq(uid, sock_i_uid(sk2))) &&
-		    saddr_comp(sk, sk2, true)) {
-			res = 1;
+		    inet_rcv_saddr_equal(sk, sk2, true)) {
+			if (sk2->sk_reuseport && sk->sk_reuseport &&
+			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
+			    uid_eq(uid, sock_i_uid(sk2))) {
+				res = 0;
+			} else {
+				res = 1;
+			}
 			break;
 		}
 	}
@@ -200,10 +212,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 	return res;
 }
 
-static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
-				  int (*saddr_same)(const struct sock *sk1,
-						    const struct sock *sk2,
-						    bool match_wildcard))
+static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
 {
 	struct net *net = sock_net(sk);
 	kuid_t uid = sock_i_uid(sk);
@@ -217,7 +226,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
 		    (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
 		    (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
-		    (*saddr_same)(sk, sk2, false)) {
+		    inet_rcv_saddr_equal(sk, sk2, false)) {
 			return reuseport_add_sock(sk, sk2);
 		}
 	}
@@ -233,14 +242,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
  *
  *  @sk:          socket struct in question
  *  @snum:        port number to look up
- *  @saddr_comp:  AF-dependent comparison of bound local IP addresses
  *  @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
  *                   with NULL address
  */
 int udp_lib_get_port(struct sock *sk, unsigned short snum,
-		     int (*saddr_comp)(const struct sock *sk1,
-				       const struct sock *sk2,
-				       bool match_wildcard),
 		     unsigned int hash2_nulladdr)
 {
 	struct udp_hslot *hslot, *hslot2;
@@ -269,7 +274,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 			bitmap_zero(bitmap, PORTS_PER_CHAIN);
 			spin_lock_bh(&hslot->lock);
 			udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
-					    saddr_comp, udptable->log);
+					    udptable->log);
 
 			snum = first;
 			/*
@@ -285,6 +290,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 				snum += rand;
 			} while (snum != first);
 			spin_unlock_bh(&hslot->lock);
+			cond_resched();
 		} while (++first != last);
 		goto fail;
 	} else {
@@ -301,12 +307,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 			if (hslot->count < hslot2->count)
 				goto scan_primary_hash;
 
-			exist = udp_lib_lport_inuse2(net, snum, hslot2,
-						     sk, saddr_comp);
+			exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
 			if (!exist && (hash2_nulladdr != slot2)) {
 				hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
 				exist = udp_lib_lport_inuse2(net, snum, hslot2,
-							     sk, saddr_comp);
+							     sk);
 			}
 			if (exist)
 				goto fail_unlock;
@@ -314,8 +319,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 				goto found;
 		}
 scan_primary_hash:
-		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
-					saddr_comp, 0))
+		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
 			goto fail_unlock;
 	}
 found:
@@ -324,7 +328,7 @@ found:
 	udp_sk(sk)->udp_portaddr_hash ^= snum;
 	if (sk_unhashed(sk)) {
 		if (sk->sk_reuseport &&
-		    udp_reuseport_add_sock(sk, hslot, saddr_comp)) {
+		    udp_reuseport_add_sock(sk, hslot)) {
 			inet_sk(sk)->inet_num = 0;
 			udp_sk(sk)->udp_port_hash = 0;
 			udp_sk(sk)->udp_portaddr_hash ^= snum;
@@ -356,24 +360,6 @@ fail:
 }
 EXPORT_SYMBOL(udp_lib_get_port);
 
-/* match_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
- * match_wildcard == false: addresses must be exactly the same, i.e.
- *                          0.0.0.0 only equals to 0.0.0.0
- */
-int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
-			 bool match_wildcard)
-{
-	struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
-
-	if (!ipv6_only_sock(sk2)) {
-		if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)
-			return 1;
-		if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr)
-			return match_wildcard;
-	}
-	return 0;
-}
-
 static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
 			      unsigned int port)
 {
@@ -389,12 +375,13 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
 
 	/* precompute partial secondary hash */
 	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
-	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
+	return udp_lib_get_port(sk, snum, hash2_nulladdr);
 }
 
 static int compute_score(struct sock *sk, struct net *net,
 			 __be32 saddr, __be16 sport,
-			 __be32 daddr, unsigned short hnum, int dif)
+			 __be32 daddr, unsigned short hnum, int dif,
+			 bool exact_dif)
 {
 	int score;
 	struct inet_sock *inet;
@@ -425,7 +412,7 @@ static int compute_score(struct sock *sk, struct net *net,
 		score += 4;
 	}
 
-	if (sk->sk_bound_dev_if) {
+	if (sk->sk_bound_dev_if || exact_dif) {
 		if (sk->sk_bound_dev_if != dif)
 			return -1;
 		score += 4;
@@ -450,7 +437,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
 /* called with rcu_read_lock() */
 static struct sock *udp4_lib_lookup2(struct net *net,
 		__be32 saddr, __be16 sport,
-		__be32 daddr, unsigned int hnum, int dif,
+		__be32 daddr, unsigned int hnum, int dif, bool exact_dif,
 		struct udp_hslot *hslot2,
 		struct sk_buff *skb)
 {
@@ -462,7 +449,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 	badness = 0;
 	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 		score = compute_score(sk, net, saddr, sport,
-				      daddr, hnum, dif);
+				      daddr, hnum, dif, exact_dif);
 		if (score > badness) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
@@ -497,6 +484,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+	bool exact_dif = udp_lib_exact_dif_match(net, skb);
 	int score, badness, matches = 0, reuseport = 0;
 	u32 hash = 0;
 
@@ -509,7 +497,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 
 		result = udp4_lib_lookup2(net, saddr, sport,
 					  daddr, hnum, dif,
-					  hslot2, skb);
+					  exact_dif, hslot2, skb);
 		if (!result) {
 			unsigned int old_slot2 = slot2;
 			hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
@@ -524,7 +512,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 
 			result = udp4_lib_lookup2(net, saddr, sport,
 						  daddr, hnum, dif,
-						  hslot2, skb);
+						  exact_dif, hslot2, skb);
 		}
 		return result;
 	}
@@ -533,7 +521,7 @@ begin:
 	badness = 0;
 	sk_for_each_rcu(sk, &hslot->head) {
 		score = compute_score(sk, net, saddr, sport,
-				      daddr, hnum, dif);
+				      daddr, hnum, dif, exact_dif);
 		if (score > badness) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
@@ -1113,7 +1101,8 @@ out:
 	return err;
 
 do_confirm:
-	dst_confirm(&rt->dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4->daddr);
 	if (!(msg->msg_flags&MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
@@ -1501,7 +1490,7 @@ try_again:
 	return err;
 
 csum_copy_err:
-	if (!__sk_queue_drop_skb(sk, skb, flags)) {
+	if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
 		UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
 		UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
 	}
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 62e1e72db461..1fc684111ce6 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -40,6 +40,7 @@ drop:
 
 int xfrm4_transport_finish(struct sk_buff *skb, int async)
 {
+	struct xfrm_offload *xo = xfrm_offload(skb);
 	struct iphdr *iph = ip_hdr(skb);
 
 	iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
@@ -53,6 +54,11 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
 	iph->tot_len = htons(skb->len);
 	ip_send_check(iph);
 
+	if (xo && (xo->flags & XFRM_GRO)) {
+		skb_mac_header_rebuild(skb);
+		return 0;
+	}
+
 	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
 		dev_net(skb->dev), NULL, skb, skb->dev, NULL,
 		xfrm4_rcv_encap_finish);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index fd840c7d75ea..4acc0508c5eb 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -43,6 +43,7 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
 static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int ihl = skb->data - skb_transport_header(skb);
+	struct xfrm_offload *xo = xfrm_offload(skb);
 
 	if (skb->transport_header != skb->network_header) {
 		memmove(skb_transport_header(skb),
@@ -50,7 +51,8 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 		skb->network_header = skb->transport_header;
 	}
 	ip_hdr(skb)->tot_len = htons(skb->len + ihl);
-	skb_reset_transport_header(skb);
+	if (!xo || !(xo->flags & XFRM_GRO))
+		skb_reset_transport_header(skb);
 	return 0;
 }
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6a7ff6957535..71b4ecc195c7 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
 #include <net/ip.h>
 #include <net/l3mdev.h>
 
-static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
-
 static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
 					    int tos, int oif,
 					    const xfrm_address_t *saddr,
@@ -219,7 +217,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
 {
 	struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
 
-	xfrm4_policy_afinfo.garbage_collect(net);
+	xfrm_garbage_collect_deferred(net);
 	return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
 }
 
@@ -271,8 +269,7 @@ static struct dst_ops xfrm4_dst_ops_template = {
 	.gc_thresh =		INT_MAX,
 };
 
-static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
-	.family = 		AF_INET,
+static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.dst_ops =		&xfrm4_dst_ops_template,
 	.dst_lookup =		xfrm4_dst_lookup,
 	.get_saddr =		xfrm4_get_saddr,
@@ -376,7 +373,7 @@ static struct pernet_operations __net_initdata xfrm4_net_ops = {
 
 static void __init xfrm4_policy_init(void)
 {
-	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET);
 }
 
 void __init xfrm4_init(void)
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index dccefa9d84cf..8dd0e6ab8606 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -188,9 +188,8 @@ static const struct net_protocol ipcomp4_protocol = {
 	.netns_ok	=	1,
 };
 
-static struct xfrm_input_afinfo xfrm4_input_afinfo = {
+static const struct xfrm_input_afinfo xfrm4_input_afinfo = {
 	.family		=	AF_INET,
-	.owner		=	THIS_MODULE,
 	.callback	=	xfrm4_rcv_cb,
 };
 
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 542074c00c78..d6660a8c0ea5 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -90,11 +90,3 @@ void __init xfrm4_state_init(void)
 {
 	xfrm_state_register_afinfo(&xfrm4_state_afinfo);
 }
-
-#if 0
-void __exit xfrm4_state_fini(void)
-{
-	xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
-}
-#endif  /*  0  */
-
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ec1267e2bd1f..e2afe677a9d9 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -75,6 +75,19 @@ config INET6_ESP
 
 	  If unsure, say Y.
 
+config INET6_ESP_OFFLOAD
+	tristate "IPv6: ESP transformation offload"
+	depends on INET6_ESP
+	select XFRM_OFFLOAD
+	default n
+	---help---
+	  Support for ESP transformation offload. This makes sense
+	  only if this system really does IPsec and want to do it
+	  with high throughput. A typical desktop system does not
+	  need it, even if it does IPsec.
+
+	  If unsure, say N.
+
 config INET6_IPCOMP
 	tristate "IPv6: IPComp transformation"
 	select INET6_XFRM_TUNNEL
@@ -208,6 +221,7 @@ config IPV6_TUNNEL
 	tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)"
 	select INET6_TUNNEL
 	select DST_CACHE
+	select GRO_CELLS
 	---help---
 	  Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
 	  RFC 2473.
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index a9e9fec387ce..217e9ff0e24b 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -30,6 +30,7 @@ ipv6-objs += $(ipv6-y)
 
 obj-$(CONFIG_INET6_AH) += ah6.o
 obj-$(CONFIG_INET6_ESP) += esp6.o
+obj-$(CONFIG_INET6_ESP_OFFLOAD) += esp6_offload.o
 obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
 obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
 obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f60e88e56255..3a2025f5bf2c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -243,6 +243,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.seg6_require_hmac	= 0,
 #endif
 	.enhanced_dad           = 1,
+	.addr_gen_mode		= IN6_ADDR_GEN_MODE_EUI64,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -294,6 +295,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.seg6_require_hmac	= 0,
 #endif
 	.enhanced_dad           = 1,
+	.addr_gen_mode		= IN6_ADDR_GEN_MODE_EUI64,
 };
 
 /* Check if a valid qdisc is available */
@@ -386,9 +388,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
 
 	if (ndev->cnf.stable_secret.initialized)
-		ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+		ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
 	else
-		ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64;
+		ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode;
 
 	ndev->cnf.mtu6 = dev->mtu;
 	ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -2144,12 +2146,14 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
 	case ARPHRD_SIT:
 		return addrconf_ifid_sit(eui, dev);
 	case ARPHRD_IPGRE:
+	case ARPHRD_TUNNEL:
 		return addrconf_ifid_gre(eui, dev);
 	case ARPHRD_6LOWPAN:
 		return addrconf_ifid_eui64(eui, dev);
 	case ARPHRD_IEEE1394:
 		return addrconf_ifid_ieee1394(eui, dev);
 	case ARPHRD_TUNNEL6:
+	case ARPHRD_IP6GRE:
 		return addrconf_ifid_ip6tnl(eui, dev);
 	}
 	return -1;
@@ -2387,8 +2391,8 @@ static void manage_tempaddrs(struct inet6_dev *idev,
 
 static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
 {
-	return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
-	       idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
+	return idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
+	       idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
 }
 
 int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
@@ -3152,7 +3156,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
 
 	ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
 
-	switch (idev->addr_gen_mode) {
+	switch (idev->cnf.addr_gen_mode) {
 	case IN6_ADDR_GEN_MODE_RANDOM:
 		ipv6_gen_mode_random_init(idev);
 		/* fallthrough */
@@ -3193,6 +3197,9 @@ static void addrconf_dev_config(struct net_device *dev)
 	    (dev->type != ARPHRD_IEEE1394) &&
 	    (dev->type != ARPHRD_TUNNEL6) &&
 	    (dev->type != ARPHRD_6LOWPAN) &&
+	    (dev->type != ARPHRD_IP6GRE) &&
+	    (dev->type != ARPHRD_IPGRE) &&
+	    (dev->type != ARPHRD_TUNNEL) &&
 	    (dev->type != ARPHRD_NONE)) {
 		/* Alas, we support only Ethernet autoconfiguration. */
 		return;
@@ -3204,8 +3211,8 @@ static void addrconf_dev_config(struct net_device *dev)
 
 	/* this device type has no EUI support */
 	if (dev->type == ARPHRD_NONE &&
-	    idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
-		idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
+	    idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
+		idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
 
 	addrconf_addr_gen(idev, false);
 }
@@ -3386,9 +3393,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			}
 
 			if (idev) {
-				if (idev->if_flags & IF_READY)
-					/* device is already configured. */
+				if (idev->if_flags & IF_READY) {
+					/* device is already configured -
+					 * but resend MLD reports, we might
+					 * have roamed and need to update
+					 * multicast snooping switches
+					 */
+					ipv6_mc_up(idev);
 					break;
+				}
 				idev->if_flags |= IF_READY;
 			}
 
@@ -4009,6 +4022,12 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id)
 
 	if (bump_id)
 		rt_genid_bump_ipv6(dev_net(dev));
+
+	/* Make sure that a new temporary address will be created
+	 * before this temporary address becomes deprecated.
+	 */
+	if (ifp->flags & IFA_F_TEMPORARY)
+		addrconf_verify_rtnl();
 }
 
 static void addrconf_dad_run(struct inet6_dev *idev)
@@ -4888,6 +4907,13 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
 	struct net *net = dev_net(ifa->idev->dev);
 	int err = -ENOBUFS;
 
+	/* Don't send DELADDR notification for TENTATIVE address,
+	 * since NEWADDR notification is sent only after removing
+	 * TENTATIVE flag.
+	 */
+	if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR)
+		return;
+
 	skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
@@ -4975,6 +5001,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
 #endif
 	array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
+	array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5086,7 +5113,7 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
 	if (!nla)
 		goto nla_put_failure;
 
-	if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode))
+	if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode))
 		goto nla_put_failure;
 
 	read_lock_bh(&idev->lock);
@@ -5204,6 +5231,26 @@ static int inet6_validate_link_af(const struct net_device *dev,
 	return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy);
 }
 
+static int check_addr_gen_mode(int mode)
+{
+	if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
+	    mode != IN6_ADDR_GEN_MODE_NONE &&
+	    mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+	    mode != IN6_ADDR_GEN_MODE_RANDOM)
+		return -EINVAL;
+	return 1;
+}
+
+static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
+				int mode)
+{
+	if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+	    !idev->cnf.stable_secret.initialized &&
+	    !net->ipv6.devconf_dflt->stable_secret.initialized)
+		return -EINVAL;
+	return 1;
+}
+
 static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
 {
 	int err = -EINVAL;
@@ -5225,18 +5272,11 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
 	if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
 		u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
 
-		if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
-		    mode != IN6_ADDR_GEN_MODE_NONE &&
-		    mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
-		    mode != IN6_ADDR_GEN_MODE_RANDOM)
-			return -EINVAL;
-
-		if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
-		    !idev->cnf.stable_secret.initialized &&
-		    !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized)
+		if (check_addr_gen_mode(mode) < 0 ||
+		    check_stable_privacy(idev, dev_net(dev), mode) < 0)
 			return -EINVAL;
 
-		idev->addr_gen_mode = mode;
+		idev->cnf.addr_gen_mode = mode;
 		err = 0;
 	}
 
@@ -5643,6 +5683,47 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
 	return ret;
 }
 
+static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
+					 void __user *buffer, size_t *lenp,
+					 loff_t *ppos)
+{
+	int ret = 0;
+	int new_val;
+	struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
+	struct net *net = (struct net *)ctl->extra2;
+
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+	if (write) {
+		new_val = *((int *)ctl->data);
+
+		if (check_addr_gen_mode(new_val) < 0)
+			return -EINVAL;
+
+		/* request for default */
+		if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) {
+			ipv6_devconf_dflt.addr_gen_mode = new_val;
+
+		/* request for individual net device */
+		} else {
+			if (!idev)
+				return ret;
+
+			if (check_stable_privacy(idev, net, new_val) < 0)
+				return -EINVAL;
+
+			if (idev->cnf.addr_gen_mode != new_val) {
+				idev->cnf.addr_gen_mode = new_val;
+				rtnl_lock();
+				addrconf_dev_config(idev->dev);
+				rtnl_unlock();
+			}
+		}
+	}
+
+	return ret;
+}
+
 static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
 					 void __user *buffer, size_t *lenp,
 					 loff_t *ppos)
@@ -5693,14 +5774,14 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
 			struct inet6_dev *idev = __in6_dev_get(dev);
 
 			if (idev) {
-				idev->addr_gen_mode =
+				idev->cnf.addr_gen_mode =
 					IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
 			}
 		}
 	} else {
 		struct inet6_dev *idev = ctl->extra1;
 
-		idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+		idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
 	}
 
 out:
@@ -6088,6 +6169,13 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.proc_handler   = proc_dointvec,
 	},
 	{
+		.procname		= "addr_gen_mode",
+		.data			= &ipv6_devconf.addr_gen_mode,
+		.maxlen			= sizeof(int),
+		.mode			= 0644,
+		.proc_handler	= addrconf_sysctl_addr_gen_mode,
+	},
+	{
 		/* sentinel */
 	}
 };
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index aa42123bc301..04db40620ea6 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -302,7 +302,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		return -EINVAL;
 
 	snum = ntohs(addr->sin6_port);
-	if (snum && snum < PROT_SOCK && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+	if (snum && snum < inet_prot_sock(net) &&
+	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
 		return -EACCES;
 
 	lock_sock(sk);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 189eb10b742d..dda6035e3b84 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -474,6 +474,9 @@ static void ah6_input_done(struct crypto_async_request *base, int err)
 	int hdr_len = skb_network_header_len(skb);
 	int ah_hlen = (ah->hdrlen + 2) << 2;
 
+	if (err)
+		goto out;
+
 	work_iph = AH_SKB_CB(skb)->tmp;
 	auth_data = ah_tmp_auth(work_iph, hdr_len);
 	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index a3eaafd87100..eec27f87efac 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -167,18 +167,22 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (np->sndflow)
 		fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
 
-	addr_type = ipv6_addr_type(&usin->sin6_addr);
-
-	if (addr_type == IPV6_ADDR_ANY) {
+	if (ipv6_addr_any(&usin->sin6_addr)) {
 		/*
 		 *	connect to self
 		 */
-		usin->sin6_addr.s6_addr[15] = 0x01;
+		if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+			ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+					       &usin->sin6_addr);
+		else
+			usin->sin6_addr = in6addr_loopback;
 	}
 
+	addr_type = ipv6_addr_type(&usin->sin6_addr);
+
 	daddr = &usin->sin6_addr;
 
-	if (addr_type == IPV6_ADDR_MAPPED) {
+	if (addr_type & IPV6_ADDR_MAPPED) {
 		struct sockaddr_in sin;
 
 		if (__ipv6_only_sock(sk)) {
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index cbcdd5db31f4..ff54faa75631 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -44,6 +44,8 @@
 #include <net/protocol.h>
 #include <linux/icmpv6.h>
 
+#include <linux/highmem.h>
+
 struct esp_skb_cb {
 	struct xfrm_skb_cb xfrm;
 	void *tmp;
@@ -114,11 +116,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
 			     __alignof__(struct scatterlist));
 }
 
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+{
+	__be32 *seqhi;
+	struct crypto_aead *aead = x->data;
+	int seqhilen = 0;
+	u8 *iv;
+	struct aead_request *req;
+	struct scatterlist *sg;
+
+	if (x->props.flags & XFRM_STATE_ESN)
+		seqhilen += sizeof(__be32);
+
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_req(aead, iv);
+
+	/* Unref skb_frag_pages in the src scatterlist if necessary.
+	 * Skip the first sg which comes from skb->data.
+	 */
+	if (req->src != req->dst)
+		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+			put_page(sg_page(sg));
+}
+
 static void esp_output_done(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
+	void *tmp;
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
 
-	kfree(ESP_SKB_CB(skb)->tmp);
+	tmp = ESP_SKB_CB(skb)->tmp;
+	esp_ssg_unref(x, tmp);
+	kfree(tmp);
 	xfrm_output_resume(skb, err);
 }
 
@@ -138,6 +169,27 @@ static void esp_output_restore_header(struct sk_buff *skb)
 	esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
 }
 
+static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
+					     struct ip_esp_hdr *esph,
+					     __be32 *seqhi)
+{
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accomodate the high bits.  We will move it back after
+	 * encryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
+		*seqhi = esph->spi;
+		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+	}
+
+	esph->spi = x->id.spi;
+
+	return esph;
+}
+
 static void esp_output_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -146,14 +198,31 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
 	esp_output_done(base, err);
 }
 
+static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
+{
+	/* Fill padding... */
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
+	do {
+		int i;
+		for (i = 0; i < plen - 2; i++)
+			tail[i] = i + 1;
+	} while (0);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = proto;
+}
+
 static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int err;
 	struct ip_esp_hdr *esph;
 	struct crypto_aead *aead;
 	struct aead_request *req;
-	struct scatterlist *sg;
+	struct scatterlist *sg, *dsg;
 	struct sk_buff *trailer;
+	struct page *page;
 	void *tmp;
 	int blksize;
 	int clen;
@@ -164,10 +233,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	int nfrags;
 	int assoclen;
 	int seqhilen;
+	int tailen;
 	u8 *iv;
 	u8 *tail;
+	u8 *vaddr;
 	__be32 *seqhi;
 	__be64 seqno;
+	__u8 proto = *skb_mac_header(skb);
 
 	/* skb is pure payload to encrypt */
 	aead = x->data;
@@ -186,11 +258,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
 	clen = ALIGN(skb->len + 2 + tfclen, blksize);
 	plen = clen - skb->len - tfclen;
-
-	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
-	if (err < 0)
-		goto error;
-	nfrags = err;
+	tailen = tfclen + plen + alen;
 
 	assoclen = sizeof(*esph);
 	seqhilen = 0;
@@ -200,59 +268,152 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += seqhilen;
 	}
 
-	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
-	if (!tmp) {
-		err = -ENOMEM;
-		goto error;
+	*skb_mac_header(skb) = IPPROTO_ESP;
+	esph = ip_esp_hdr(skb);
+
+	if (!skb_cloned(skb)) {
+		if (tailen <= skb_availroom(skb)) {
+			nfrags = 1;
+			trailer = skb;
+			tail = skb_tail_pointer(trailer);
+
+			goto skip_cow;
+		} else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+			   && !skb_has_frag_list(skb)) {
+			int allocsize;
+			struct sock *sk = skb->sk;
+			struct page_frag *pfrag = &x->xfrag;
+
+			allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+			spin_lock_bh(&x->lock);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				goto cow;
+			}
+
+			page = pfrag->page;
+			get_page(page);
+
+			vaddr = kmap_atomic(page);
+
+			tail = vaddr + pfrag->offset;
+
+			esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+			kunmap_atomic(vaddr);
+
+			nfrags = skb_shinfo(skb)->nr_frags;
+
+			__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+					     tailen);
+			skb_shinfo(skb)->nr_frags = ++nfrags;
+
+			pfrag->offset = pfrag->offset + allocsize;
+			nfrags++;
+
+			skb->len += tailen;
+			skb->data_len += tailen;
+			skb->truesize += tailen;
+			if (sk)
+				atomic_add(tailen, &sk->sk_wmem_alloc);
+
+			skb_push(skb, -skb_network_offset(skb));
+
+			esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+			esph->spi = x->id.spi;
+
+			tmp = esp_alloc_tmp(aead, nfrags + 2, seqhilen);
+			if (!tmp) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+			seqhi = esp_tmp_seqhi(tmp);
+			iv = esp_tmp_iv(aead, tmp, seqhilen);
+			req = esp_tmp_req(aead, iv);
+			sg = esp_req_sg(aead, req);
+			dsg = &sg[nfrags];
+
+			esph = esp_output_set_esn(skb, esph, seqhi);
+
+			sg_init_table(sg, nfrags);
+			skb_to_sgvec(skb, sg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+
+			skb_shinfo(skb)->nr_frags = 1;
+
+			page = pfrag->page;
+			get_page(page);
+			/* replace page frags in skb with new page */
+			__skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+			pfrag->offset = pfrag->offset + allocsize;
+
+			sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+			skb_to_sgvec(skb, dsg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			spin_unlock_bh(&x->lock);
+
+			goto skip_cow2;
+		}
 	}
 
-	seqhi = esp_tmp_seqhi(tmp);
-	iv = esp_tmp_iv(aead, tmp, seqhilen);
-	req = esp_tmp_req(aead, iv);
-	sg = esp_req_sg(aead, req);
+cow:
+	err = skb_cow_data(skb, tailen, &trailer);
+	if (err < 0)
+		goto error;
+	nfrags = err;
 
-	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
-	if (tfclen) {
-		memset(tail, 0, tfclen);
-		tail += tfclen;
-	}
-	do {
-		int i;
-		for (i = 0; i < plen - 2; i++)
-			tail[i] = i + 1;
-	} while (0);
-	tail[plen - 2] = plen - 2;
-	tail[plen - 1] = *skb_mac_header(skb);
-	pskb_put(skb, trailer, clen - skb->len + alen);
+	esph = ip_esp_hdr(skb);
 
+skip_cow:
+	esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+	pskb_put(skb, trailer, clen - skb->len + alen);
 	skb_push(skb, -skb_network_offset(skb));
-	esph = ip_esp_hdr(skb);
-	*skb_mac_header(skb) = IPPROTO_ESP;
 
 	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+	esph->spi = x->id.spi;
 
-	aead_request_set_callback(req, 0, esp_output_done, skb);
-
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * encryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
-		*seqhi = esph->spi;
-		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
-		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
+	if (!tmp) {
+		err = -ENOMEM;
+		goto error;
 	}
 
-	esph->spi = x->id.spi;
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_req(aead, iv);
+	sg = esp_req_sg(aead, req);
+	dsg = sg;
+
+	esph = esp_output_set_esn(skb, esph, seqhi);
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg,
 		     (unsigned char *)esph - skb->data,
 		     assoclen + ivlen + clen + alen);
 
-	aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+skip_cow2:
+	if ((x->props.flags & XFRM_STATE_ESN))
+		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+	else
+		aead_request_set_callback(req, 0, esp_output_done, skb);
+
+	aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
 	aead_request_set_ad(req, assoclen);
 
 	seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -278,6 +439,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 			esp_output_restore_header(skb);
 	}
 
+	if (sg != dsg)
+		esp_ssg_unref(x, tmp);
 	kfree(tmp);
 
 error:
@@ -343,6 +506,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
 	__skb_pull(skb, 4);
 }
 
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accomodate the high bits.  We will move it back after
+	 * decryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		esph = (void *)skb_push(skb, 4);
+		*seqhi = esph->spi;
+		esph->spi = esph->seq_no;
+		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+	}
+}
+
 static void esp_input_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -378,14 +558,6 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 		goto out;
 	}
 
-	nfrags = skb_cow_data(skb, 0, &trailer);
-	if (nfrags < 0) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = -ENOMEM;
-
 	assoclen = sizeof(*esph);
 	seqhilen = 0;
 
@@ -394,6 +566,27 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += seqhilen;
 	}
 
+	if (!skb_cloned(skb)) {
+		if (!skb_is_nonlinear(skb)) {
+			nfrags = 1;
+
+			goto skip_cow;
+		} else if (!skb_has_frag_list(skb)) {
+			nfrags = skb_shinfo(skb)->nr_frags;
+			nfrags++;
+
+			goto skip_cow;
+		}
+	}
+
+	nfrags = skb_cow_data(skb, 0, &trailer);
+	if (nfrags < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+skip_cow:
+	ret = -ENOMEM;
 	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
 	if (!tmp)
 		goto out;
@@ -404,26 +597,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 	req = esp_tmp_req(aead, iv);
 	sg = esp_req_sg(aead, req);
 
-	skb->ip_summed = CHECKSUM_NONE;
+	esp_input_set_header(skb, seqhi);
 
-	esph = (struct ip_esp_hdr *)skb->data;
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	aead_request_set_callback(req, 0, esp_input_done, skb);
+	skb->ip_summed = CHECKSUM_NONE;
 
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * decryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		esph = (void *)skb_push(skb, 4);
-		*seqhi = esph->spi;
-		esph->spi = esph->seq_no;
-		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+	if ((x->props.flags & XFRM_STATE_ESN))
 		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
-	}
-
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	else
+		aead_request_set_callback(req, 0, esp_input_done, skb);
 
 	aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
 	aead_request_set_ad(req, assoclen);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
new file mode 100644
index 000000000000..d914eb93204a
--- /dev/null
+++ b/net/ipv6/esp6_offload.c
@@ -0,0 +1,108 @@
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <linux/icmpv6.h>
+
+static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
+					 struct sk_buff *skb)
+{
+	int offset = skb_gro_offset(skb);
+	struct xfrm_offload *xo;
+	struct xfrm_state *x;
+	__be32 seq;
+	__be32 spi;
+	int err;
+
+	skb_pull(skb, offset);
+
+	if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
+		goto out;
+
+	err = secpath_set(skb);
+	if (err)
+		goto out;
+
+	if (skb->sp->len == XFRM_MAX_DEPTH)
+		goto out;
+
+	x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
+			      (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
+			      spi, IPPROTO_ESP, AF_INET6);
+	if (!x)
+		goto out;
+
+	skb->sp->xvec[skb->sp->len++] = x;
+	skb->sp->olen++;
+
+	xo = xfrm_offload(skb);
+	if (!xo) {
+		xfrm_state_put(x);
+		goto out;
+	}
+	xo->flags |= XFRM_GRO;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+	XFRM_SPI_SKB_CB(skb)->seq = seq;
+
+	/* We don't need to handle errors from xfrm_input, it does all
+	 * the error handling and frees the resources on error. */
+	xfrm_input(skb, IPPROTO_ESP, spi, -2);
+
+	return ERR_PTR(-EINPROGRESS);
+out:
+	skb_push(skb, offset);
+	NAPI_GRO_CB(skb)->same_flow = 0;
+	NAPI_GRO_CB(skb)->flush = 1;
+
+	return NULL;
+}
+
+static const struct net_offload esp6_offload = {
+	.callbacks = {
+		.gro_receive = esp6_gro_receive,
+	},
+};
+
+static int __init esp6_offload_init(void)
+{
+	return inet6_add_offload(&esp6_offload, IPPROTO_ESP);
+}
+
+static void __exit esp6_offload_exit(void)
+{
+	inet6_del_offload(&esp6_offload, IPPROTO_ESP);
+}
+
+module_init(esp6_offload_init);
+module_exit(esp6_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index e4198502fd98..275cac628a95 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -327,7 +327,6 @@ static int ipv6_srh_rcv(struct sk_buff *skb)
 	struct ipv6_sr_hdr *hdr;
 	struct inet6_dev *idev;
 	struct in6_addr *addr;
-	bool cleanup = false;
 	int accept_seg6;
 
 	hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
@@ -351,11 +350,7 @@ static int ipv6_srh_rcv(struct sk_buff *skb)
 #endif
 
 looped_back:
-	if (hdr->segments_left > 0) {
-		if (hdr->nexthdr != NEXTHDR_IPV6 && hdr->segments_left == 1 &&
-		    sr_has_cleanup(hdr))
-			cleanup = true;
-	} else {
+	if (hdr->segments_left == 0) {
 		if (hdr->nexthdr == NEXTHDR_IPV6) {
 			int offset = (hdr->hdrlen + 1) << 3;
 
@@ -418,21 +413,6 @@ looped_back:
 
 	ipv6_hdr(skb)->daddr = *addr;
 
-	if (cleanup) {
-		int srhlen = (hdr->hdrlen + 1) << 3;
-		int nh = hdr->nexthdr;
-
-		skb_pull_rcsum(skb, sizeof(struct ipv6hdr) + srhlen);
-		memmove(skb_network_header(skb) + srhlen,
-			skb_network_header(skb),
-			(unsigned char *)hdr - skb_network_header(skb));
-		skb->network_header += srhlen;
-		ipv6_hdr(skb)->nexthdr = nh;
-		ipv6_hdr(skb)->payload_len = htons(skb->len -
-						   sizeof(struct ipv6hdr));
-		skb_push_rcsum(skb, sizeof(struct ipv6hdr));
-	}
-
 	skb_dst_drop(skb);
 
 	ip6_route_input(skb);
@@ -453,13 +433,8 @@ looped_back:
 		}
 		ipv6_hdr(skb)->hop_limit--;
 
-		/* be sure that srh is still present before reinjecting */
-		if (!cleanup) {
-			skb_pull(skb, sizeof(struct ipv6hdr));
-			goto looped_back;
-		}
-		skb_set_transport_header(skb, sizeof(struct ipv6hdr));
-		IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+		skb_pull(skb, sizeof(struct ipv6hdr));
+		goto looped_back;
 	}
 
 	dst_input(skb);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 3036f665e6c8..230b5aac9f03 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -110,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = {
 	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
 };
 
+/* Called with BH disabled */
 static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
 {
 	struct sock *sk;
 
-	local_bh_disable();
-
 	sk = icmpv6_sk(net);
 	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
 		/* This can happen if the output path (f.e. SIT or
 		 * ip6ip6 tunnel) signals dst_link_failure() for an
 		 * outgoing ICMP6 packet.
 		 */
-		local_bh_enable();
 		return NULL;
 	}
 	return sk;
@@ -130,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
 
 static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
 {
-	spin_unlock_bh(&sk->sk_lock.slock);
+	spin_unlock(&sk->sk_lock.slock);
 }
 
 /*
@@ -168,6 +166,30 @@ static bool is_ineligible(const struct sk_buff *skb)
 	return false;
 }
 
+static bool icmpv6_mask_allow(int type)
+{
+	/* Informational messages are not limited. */
+	if (type & ICMPV6_INFOMSG_MASK)
+		return true;
+
+	/* Do not limit pmtu discovery, it would break it. */
+	if (type == ICMPV6_PKT_TOOBIG)
+		return true;
+
+	return false;
+}
+
+static bool icmpv6_global_allow(int type)
+{
+	if (icmpv6_mask_allow(type))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  * Check the ICMP output rate limit
  */
@@ -178,12 +200,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	struct dst_entry *dst;
 	bool res = false;
 
-	/* Informational messages are not limited. */
-	if (type & ICMPV6_INFOMSG_MASK)
-		return true;
-
-	/* Do not limit pmtu discovery, it would break it. */
-	if (type == ICMPV6_PKT_TOOBIG)
+	if (icmpv6_mask_allow(type))
 		return true;
 
 	/*
@@ -200,20 +217,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	} else {
 		struct rt6_info *rt = (struct rt6_info *)dst;
 		int tmo = net->ipv6.sysctl.icmpv6_time;
+		struct inet_peer *peer;
 
 		/* Give more bandwidth to wider prefixes. */
 		if (rt->rt6i_dst.plen < 128)
 			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
 
-		if (icmp_global_allow()) {
-			struct inet_peer *peer;
-
-			peer = inet_getpeer_v6(net->ipv6.peers,
-					       &fl6->daddr, 1);
-			res = inet_peer_xrlim_allow(peer, tmo);
-			if (peer)
-				inet_putpeer(peer);
-		}
+		peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
+		res = inet_peer_xrlim_allow(peer, tmo);
+		if (peer)
+			inet_putpeer(peer);
 	}
 	dst_release(dst);
 	return res;
@@ -474,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 		return;
 	}
 
+	/* Needed by both icmp_global_allow and icmpv6_xmit_lock */
+	local_bh_disable();
+
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
+	if (!icmpv6_global_allow(type))
+		goto out_bh_enable;
+
 	mip6_addr_swap(skb);
 
 	memset(&fl6, 0, sizeof(fl6));
@@ -492,7 +512,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
-		return;
+		goto out_bh_enable;
+
 	sk->sk_mark = mark;
 	np = inet6_sk(sk);
 
@@ -552,6 +573,8 @@ out_dst_release:
 	dst_release(dst);
 out:
 	icmpv6_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 }
 
 /* Slightly more convenient version of icmp6_send.
@@ -665,9 +688,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	fl6.flowi6_uid = sock_net_uid(net, NULL);
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
 
+	local_bh_disable();
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
-		return;
+		goto out_bh_enable;
 	sk->sk_mark = mark;
 	np = inet6_sk(sk);
 
@@ -709,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	dst_release(dst);
 out:
 	icmpv6_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 }
 
 void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 13b5e85fe0d5..ce1aae4a7fc8 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -115,7 +115,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
 	[ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
 };
 
-static int ila_build_state(struct net_device *dev, struct nlattr *nla,
+static int ila_build_state(struct nlattr *nla,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts)
 {
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 75c308239243..9a31d13bf180 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -28,46 +28,6 @@
 #include <net/inet6_connection_sock.h>
 #include <net/sock_reuseport.h>
 
-int inet6_csk_bind_conflict(const struct sock *sk,
-			    const struct inet_bind_bucket *tb, bool relax,
-			    bool reuseport_ok)
-{
-	const struct sock *sk2;
-	bool reuse = !!sk->sk_reuse;
-	bool reuseport = !!sk->sk_reuseport && reuseport_ok;
-	kuid_t uid = sock_i_uid((struct sock *)sk);
-
-	/* We must walk the whole port owner list in this case. -DaveM */
-	/*
-	 * See comment in inet_csk_bind_conflict about sock lookup
-	 * vs net namespaces issues.
-	 */
-	sk_for_each_bound(sk2, &tb->owners) {
-		if (sk != sk2 &&
-		    (!sk->sk_bound_dev_if ||
-		     !sk2->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
-			if ((!reuse || !sk2->sk_reuse ||
-			     sk2->sk_state == TCP_LISTEN) &&
-			    (!reuseport || !sk2->sk_reuseport ||
-			     rcu_access_pointer(sk->sk_reuseport_cb) ||
-			     (sk2->sk_state != TCP_TIME_WAIT &&
-			      !uid_eq(uid,
-				      sock_i_uid((struct sock *)sk2))))) {
-				if (ipv6_rcv_saddr_equal(sk, sk2, true))
-					break;
-			}
-			if (!relax && reuse && sk2->sk_reuse &&
-			    sk2->sk_state != TCP_LISTEN &&
-			    ipv6_rcv_saddr_equal(sk, sk2, true))
-				break;
-		}
-	}
-
-	return sk2 != NULL;
-}
-EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
-
 struct dst_entry *inet6_csk_route_req(const struct sock *sk,
 				      struct flowi6 *fl6,
 				      const struct request_sock *req,
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 02761c9fe43e..d0900918a19e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -268,54 +268,10 @@ int inet6_hash(struct sock *sk)
 
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
-		err = __inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
+		err = __inet_hash(sk, NULL);
 		local_bh_enable();
 	}
 
 	return err;
 }
 EXPORT_SYMBOL_GPL(inet6_hash);
-
-/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
- *                          only, and any IPv4 addresses if not IPv6 only
- * match_wildcard == false: addresses must be exactly the same, i.e.
- *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
- *                          and 0.0.0.0 equals to 0.0.0.0 only
- */
-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
-			 bool match_wildcard)
-{
-	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
-	int sk2_ipv6only = inet_v6_ipv6only(sk2);
-	int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
-	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
-
-	/* if both are mapped, treat as IPv4 */
-	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
-		if (!sk2_ipv6only) {
-			if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
-				return 1;
-			if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
-				return match_wildcard;
-		}
-		return 0;
-	}
-
-	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
-		return 1;
-
-	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
-	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
-		return 1;
-
-	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
-	    !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
-		return 1;
-
-	if (sk2_rcv_saddr6 &&
-	    ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
-		return 1;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ef5485204522..e4266746e4a2 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -318,6 +318,16 @@ static int fib6_dump_node(struct fib6_walker *w)
 			w->leaf = rt;
 			return 1;
 		}
+
+		/* Multipath routes are dumped in one route with the
+		 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
+		 * last sibling of this route (no need to dump the
+		 * sibling routes again)
+		 */
+		if (rt->rt6i_nsiblings)
+			rt = list_last_entry(&rt->rt6i_siblings,
+					     struct rt6_info,
+					     rt6i_siblings);
 	}
 	w->leaf = NULL;
 	return 0;
@@ -746,6 +756,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 	u16 nlflags = NLM_F_EXCL;
 	int err;
 
+	if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
+		nlflags |= NLM_F_APPEND;
+
 	ins = &fn->leaf;
 
 	for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
@@ -868,7 +881,8 @@ add:
 		*ins = rt;
 		rt->rt6i_node = fn;
 		atomic_inc(&rt->rt6i_ref);
-		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+		if (!info->skip_notify)
+			inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
 		info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
 
 		if (!(fn->fn_flags & RTN_RTINFO)) {
@@ -894,7 +908,8 @@ add:
 		rt->rt6i_node = fn;
 		rt->dst.rt6_next = iter->dst.rt6_next;
 		atomic_inc(&rt->rt6i_ref);
-		inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
+		if (!info->skip_notify)
+			inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
 		if (!(fn->fn_flags & RTN_RTINFO)) {
 			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
 			fn->fn_flags |= RTN_RTINFO;
@@ -1439,7 +1454,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 
 	fib6_purge_rt(rt, fn, net);
 
-	inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
+	if (!info->skip_notify)
+		inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
 	rt6_release(rt);
 }
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 558631860d91..6fcb7cb49bb2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -367,35 +367,37 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
 
 
 static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-		u8 type, u8 code, int offset, __be32 info)
+		       u8 type, u8 code, int offset, __be32 info)
 {
-	const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data;
-	__be16 *p = (__be16 *)(skb->data + offset);
-	int grehlen = offset + 4;
+	const struct gre_base_hdr *greh;
+	const struct ipv6hdr *ipv6h;
+	int grehlen = sizeof(*greh);
 	struct ip6_tnl *t;
+	int key_off = 0;
 	__be16 flags;
+	__be32 key;
 
-	flags = p[0];
-	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
-		if (flags&(GRE_VERSION|GRE_ROUTING))
-			return;
-		if (flags&GRE_KEY) {
-			grehlen += 4;
-			if (flags&GRE_CSUM)
-				grehlen += 4;
-		}
+	if (!pskb_may_pull(skb, offset + grehlen))
+		return;
+	greh = (const struct gre_base_hdr *)(skb->data + offset);
+	flags = greh->flags;
+	if (flags & (GRE_VERSION | GRE_ROUTING))
+		return;
+	if (flags & GRE_CSUM)
+		grehlen += 4;
+	if (flags & GRE_KEY) {
+		key_off = grehlen + offset;
+		grehlen += 4;
 	}
 
-	/* If only 8 bytes returned, keyed message will be dropped here */
-	if (!pskb_may_pull(skb, grehlen))
+	if (!pskb_may_pull(skb, offset + grehlen))
 		return;
 	ipv6h = (const struct ipv6hdr *)skb->data;
-	p = (__be16 *)(skb->data + offset);
+	greh = (const struct gre_base_hdr *)(skb->data + offset);
+	key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
 
 	t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
-				flags & GRE_KEY ?
-				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
-				p[1]);
+				 key, greh->protocol);
 	if (!t)
 		return;
 
@@ -484,11 +486,6 @@ drop:
 	return 0;
 }
 
-struct ipv6_tel_txoption {
-	struct ipv6_txoptions ops;
-	__u8 dst_opt[8];
-};
-
 static int gre_handle_offloads(struct sk_buff *skb, bool csum)
 {
 	return iptunnel_handle_offloads(skb,
@@ -1001,6 +998,9 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
 	dev->flags |= IFF_NOARP;
 	dev->addr_len = sizeof(struct in6_addr);
 	netif_keep_dst(dev);
+	/* This perm addr will be used as interface identifier by IPv6 */
+	dev->addr_assign_type = NET_ADDR_RANDOM;
+	eth_random_addr(dev->perm_addr);
 }
 
 static int ip6gre_tunnel_init_common(struct net_device *dev)
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index fc7b4017ba24..0838e6d01d2e 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -253,7 +253,7 @@ out_unlock:
 	rcu_read_unlock();
 
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b6a94ff0bbd0..528b3c1f3fde 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -119,7 +119,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 	if (!IS_ERR(neigh)) {
-		ret = dst_neigh_output(dst, neigh, skb);
+		sock_confirm_neigh(skb, neigh);
+		ret = neigh_output(neigh, skb);
 		rcu_read_unlock_bh();
 		return ret;
 	}
@@ -1021,6 +1022,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 		}
 	}
 #endif
+	if (ipv6_addr_v4mapped(&fl6->saddr) &&
+	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
+		err = -EAFNOSUPPORT;
+		goto out_err_release;
+	}
 
 	return 0;
 
@@ -1144,6 +1150,9 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 		skb->protocol = htons(ETH_P_IPV6);
 		skb->csum = 0;
 
+		if (flags & MSG_CONFIRM)
+			skb_set_dst_pending_confirm(skb, 1);
+
 		__skb_queue_tail(queue, skb);
 	} else if (skb_is_gso(skb)) {
 		goto append;
@@ -1516,6 +1525,9 @@ alloc_new_skb:
 			exthdrlen = 0;
 			dst_exthdrlen = 0;
 
+			if ((flags & MSG_CONFIRM) && !skb_prev)
+				skb_set_dst_pending_confirm(skb, 1);
+
 			/*
 			 * Put the packet on the pending queue
 			 */
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index d82042c8d8fd..644ba59fbd9d 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -49,6 +49,7 @@
 #include <net/xfrm.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <linux/etherdevice.h>
 
 #define IP6_VTI_HASH_SIZE_SHIFT  5
 #define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT)
@@ -692,6 +693,10 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
 	u->link = p->link;
 	u->i_key = p->i_key;
 	u->o_key = p->o_key;
+	if (u->i_key)
+		u->i_flags |= GRE_KEY;
+	if (u->o_key)
+		u->o_flags |= GRE_KEY;
 	u->proto = p->proto;
 
 	memcpy(u->name, p->name, sizeof(u->name));
@@ -842,6 +847,9 @@ static void vti6_dev_setup(struct net_device *dev)
 	dev->flags |= IFF_NOARP;
 	dev->addr_len = sizeof(struct in6_addr);
 	netif_keep_dst(dev);
+	/* This perm addr will be used as interface identifier by IPv6 */
+	dev->addr_assign_type = NET_ADDR_RANDOM;
+	eth_random_addr(dev->perm_addr);
 }
 
 /**
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 604d8953c775..6ba6c900ebcf 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1666,6 +1666,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 	struct net *net = sock_net(sk);
 	struct mr6_table *mrt;
 
+	if (sk->sk_type != SOCK_RAW ||
+	    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+		return -EOPNOTSUPP;
+
 	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
 	if (!mrt)
 		return -ENOENT;
@@ -1677,9 +1681,6 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 
 	switch (optname) {
 	case MRT6_INIT:
-		if (sk->sk_type != SOCK_RAW ||
-		    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
-			return -EOPNOTSUPP;
 		if (optlen < sizeof(int))
 			return -EINVAL;
 
@@ -1815,6 +1816,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
 	struct net *net = sock_net(sk);
 	struct mr6_table *mrt;
 
+	if (sk->sk_type != SOCK_RAW ||
+	    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+		return -EOPNOTSUPP;
+
 	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
 	if (!mrt)
 		return -ENOENT;
@@ -2243,8 +2248,10 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
 	int ct;
 
 	/* If cache is unresolved, don't try to parse IIF and OIF */
-	if (c->mf6c_parent >= MAXMIFS)
+	if (c->mf6c_parent >= MAXMIFS) {
+		rtm->rtm_flags |= RTNH_F_UNRESOLVED;
 		return -ENOENT;
+	}
 
 	if (MIF_EXISTS(mrt, c->mf6c_parent) &&
 	    nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0)
@@ -2286,7 +2293,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
 }
 
 int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
-		    int nowait, u32 portid)
+		    u32 portid)
 {
 	int err;
 	struct mr6_table *mrt;
@@ -2313,11 +2320,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		struct net_device *dev;
 		int vif;
 
-		if (nowait) {
-			read_unlock(&mrt_lock);
-			return -EAGAIN;
-		}
-
 		dev = skb->dev;
 		if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
 			read_unlock(&mrt_lock);
@@ -2355,7 +2357,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		return err;
 	}
 
-	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		cache->mfc_flags |= MFC_NOTIFY;
 
 	err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index ee97c44e2aa0..a531ba032b85 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -595,16 +595,24 @@ done:
 
 		if (val) {
 			struct net_device *dev;
+			int midx;
 
-			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val)
-				goto e_inval;
+			rcu_read_lock();
 
-			dev = dev_get_by_index(net, val);
+			dev = dev_get_by_index_rcu(net, val);
 			if (!dev) {
+				rcu_read_unlock();
 				retv = -ENODEV;
 				break;
 			}
-			dev_put(dev);
+			midx = l3mdev_master_ifindex_rcu(dev);
+
+			rcu_read_unlock();
+
+			if (sk->sk_bound_dev_if &&
+			    sk->sk_bound_dev_if != val &&
+			    (!midx || midx != sk->sk_bound_dev_if))
+				goto e_inval;
 		}
 		np->mcast_oif = val;
 		retv = 0;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 7139fffd61b6..1bdc703cb966 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -779,6 +779,7 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
 				psf->sf_crcount = im->mca_crcount;
 		}
 		in6_dev_put(pmc->idev);
+		kfree(pmc);
 	}
 	spin_unlock_bh(&im->mca_lock);
 }
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 25a022d41a70..1e15c54fd5e2 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -855,10 +855,6 @@ copy_entries_to_user(unsigned int total_size,
 		return PTR_ERR(counters);
 
 	loc_cpu_entry = private->entries;
-	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
-		ret = -EFAULT;
-		goto free_counters;
-	}
 
 	/* FIXME: use iterator macros --RR */
 	/* ... then go back and fix counters and names */
@@ -868,6 +864,10 @@ copy_entries_to_user(unsigned int total_size,
 		const struct xt_entry_target *t;
 
 		e = (struct ip6t_entry *)(loc_cpu_entry + off);
+		if (copy_to_user(userptr + off, e, sizeof(*e))) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
 		if (copy_to_user(userptr + off
 				 + offsetof(struct ip6t_entry, counters),
 				 &counters[num],
@@ -881,23 +881,14 @@ copy_entries_to_user(unsigned int total_size,
 		     i += m->u.match_size) {
 			m = (void *)e + i;
 
-			if (copy_to_user(userptr + off + i
-					 + offsetof(struct xt_entry_match,
-						    u.user.name),
-					 m->u.kernel.match->name,
-					 strlen(m->u.kernel.match->name)+1)
-			    != 0) {
+			if (xt_match_to_user(m, userptr + off + i)) {
 				ret = -EFAULT;
 				goto free_counters;
 			}
 		}
 
 		t = ip6t_get_target_c(e);
-		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct xt_entry_target,
-					    u.user.name),
-				 t->u.kernel.target->name,
-				 strlen(t->u.kernel.target->name)+1) != 0) {
+		if (xt_target_to_user(t, userptr + off + e->target_offset)) {
 			ret = -EFAULT;
 			goto free_counters;
 		}
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
index 590f767db5d4..a379d2f79b19 100644
--- a/net/ipv6/netfilter/ip6t_NPT.c
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -112,6 +112,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
 		.table		= "mangle",
 		.target		= ip6t_snpt_tg,
 		.targetsize	= sizeof(struct ip6t_npt_tginfo),
+		.usersize	= offsetof(struct ip6t_npt_tginfo, adjustment),
 		.checkentry	= ip6t_npt_checkentry,
 		.family		= NFPROTO_IPV6,
 		.hooks		= (1 << NF_INET_LOCAL_IN) |
@@ -123,6 +124,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
 		.table		= "mangle",
 		.target		= ip6t_dnpt_tg,
 		.targetsize	= sizeof(struct ip6t_npt_tginfo),
+		.usersize	= offsetof(struct ip6t_npt_tginfo, adjustment),
 		.checkentry	= ip6t_npt_checkentry,
 		.family		= NFPROTO_IPV6,
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 98c8dd38575a..4ef1ddd4bbbd 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -71,8 +71,7 @@ synproxy_send_tcp(struct net *net,
 	skb_dst_set(nskb, dst);
 
 	if (nfct) {
-		nskb->nfct = nfct;
-		nskb->nfctinfo = ctinfo;
+		nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
 		nf_conntrack_get(nfct);
 	}
 
@@ -121,8 +120,8 @@ synproxy_send_client_synack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static void
@@ -244,8 +243,8 @@ synproxy_send_client_ack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static bool
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index f5a61bc3ec2b..d2c2ccbfbe72 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -145,15 +145,15 @@ static int
 icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 		     struct sk_buff *skb,
 		     unsigned int icmp6off,
-		     enum ip_conntrack_info *ctinfo,
 		     unsigned int hooknum)
 {
 	struct nf_conntrack_tuple intuple, origtuple;
 	const struct nf_conntrack_tuple_hash *h;
 	const struct nf_conntrack_l4proto *inproto;
+	enum ip_conntrack_info ctinfo;
 	struct nf_conntrack_zone tmp;
 
-	NF_CT_ASSERT(skb->nfct == NULL);
+	NF_CT_ASSERT(!skb_nfct(skb));
 
 	/* Are they talking about one of our connections? */
 	if (!nf_ct_get_tuplepr(skb,
@@ -176,7 +176,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 		return -NF_ACCEPT;
 	}
 
-	*ctinfo = IP_CT_RELATED;
+	ctinfo = IP_CT_RELATED;
 
 	h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp),
 				  &intuple);
@@ -185,19 +185,18 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 		return -NF_ACCEPT;
 	} else {
 		if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
-			*ctinfo += IP_CT_IS_REPLY;
+			ctinfo += IP_CT_IS_REPLY;
 	}
 
 	/* Update skb to refer to this connection */
-	skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
-	skb->nfctinfo = *ctinfo;
+	nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
 	return NF_ACCEPT;
 }
 
 static int
 icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	     struct sk_buff *skb, unsigned int dataoff,
-	     enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+	     u8 pf, unsigned int hooknum)
 {
 	const struct icmp6hdr *icmp6h;
 	struct icmp6hdr _ih;
@@ -222,9 +221,8 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	type = icmp6h->icmp6_type - 130;
 	if (type >= 0 && type < sizeof(noct_valid_new) &&
 	    noct_valid_new[type]) {
-		skb->nfct = &nf_ct_untracked_get()->ct_general;
-		skb->nfctinfo = IP_CT_NEW;
-		nf_conntrack_get(skb->nfct);
+		nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
+		nf_conntrack_get(skb_nfct(skb));
 		return NF_ACCEPT;
 	}
 
@@ -232,7 +230,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	if (icmp6h->icmp6_type >= 128)
 		return NF_ACCEPT;
 
-	return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum);
+	return icmpv6_error_message(net, tmpl, skb, dataoff, hooknum);
 }
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 8e0bdd058787..ada60d1a991b 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -37,7 +37,7 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
 {
 	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	if (skb->nfct) {
+	if (skb_nfct(skb)) {
 		enum ip_conntrack_info ctinfo;
 		const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 
@@ -61,7 +61,7 @@ static unsigned int ipv6_defrag(void *priv,
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	/* Previously seen (loopback)?	*/
-	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+	if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
 		return NF_ACCEPT;
 #endif
 
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 4a84b5ad9ecb..888ecd106e5f 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -57,10 +57,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
 		return;
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	nf_conntrack_put(skb->nfct);
-	skb->nfct     = &nf_ct_untracked_get()->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
+	nf_reset(skb);
+	nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
+	nf_conntrack_get(skb_nfct(skb));
 #endif
 	if (hooknum == NF_INET_PRE_ROUTING ||
 	    hooknum == NF_INET_LOCAL_IN) {
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 57d86066a13b..97c724224da7 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -64,7 +64,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
 	nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
 
 	/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
-	nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+	nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
 	       ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
 	       (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
 	       ih->hop_limit,
@@ -351,7 +351,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
 	struct nf_log_buf *m;
 
 	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
+	if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
 		return;
 
 	m = nf_log_buf_open();
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index e1f8b34d7a2e..9b522fa90e6d 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -126,12 +126,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		return PTR_ERR(dst);
 	rt = (struct rt6_info *) dst;
 
-	np = inet6_sk(sk);
-	if (!np) {
-		err = -EBADF;
-		goto dst_err_out;
-	}
-
 	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
 		fl6.flowi6_oif = np->mcast_oif;
 	else if (!fl6.flowi6_oif)
@@ -166,7 +160,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	}
 	release_sock(sk);
 
-dst_err_out:
 	dst_release(dst);
 
 	if (err)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ea89073c8247..f174e76e6505 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -654,6 +654,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 
 	skb->ip_summed = CHECKSUM_NONE;
 
+	if (flags & MSG_CONFIRM)
+		skb_set_dst_pending_confirm(skb, 1);
+
 	skb->transport_header = skb->network_header;
 	err = memcpy_from_msg(iph, msg, length);
 	if (err)
@@ -934,7 +937,8 @@ out:
 	txopt_put(opt_to_free);
 	return err < 0 ? err : len;
 do_confirm:
-	dst_confirm(dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(dst, &fl6.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7ea85370c11c..f54f4265b37f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -98,6 +98,12 @@ static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
 					struct sk_buff *skb);
 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
+static size_t rt6_nlmsg_size(struct rt6_info *rt);
+static int rt6_fill_node(struct net *net,
+			 struct sk_buff *skb, struct rt6_info *rt,
+			 struct in6_addr *dst, struct in6_addr *src,
+			 int iif, int type, u32 portid, u32 seq,
+			 unsigned int flags);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
 static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -217,6 +223,21 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
 	return neigh_create(&nd_tbl, daddr, dst->dev);
 }
 
+static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+	struct net_device *dev = dst->dev;
+	struct rt6_info *rt = (struct rt6_info *)dst;
+
+	daddr = choose_neigh_daddr(rt, NULL, daddr);
+	if (!daddr)
+		return;
+	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
+		return;
+	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
+		return;
+	__ipv6_confirm_neigh(dev, daddr);
+}
+
 static struct dst_ops ip6_dst_ops_template = {
 	.family			=	AF_INET6,
 	.gc			=	ip6_dst_gc,
@@ -233,6 +254,7 @@ static struct dst_ops ip6_dst_ops_template = {
 	.redirect		=	rt6_do_redirect,
 	.local_out		=	__ip6_local_out,
 	.neigh_lookup		=	ip6_neigh_lookup,
+	.confirm_neigh		=	ip6_confirm_neigh,
 };
 
 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
@@ -1359,6 +1381,7 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 				 const struct ipv6hdr *iph, u32 mtu)
 {
+	const struct in6_addr *daddr, *saddr;
 	struct rt6_info *rt6 = (struct rt6_info *)dst;
 
 	if (rt6->rt6i_flags & RTF_LOCAL)
@@ -1367,26 +1390,26 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 	if (dst_metric_locked(dst, RTAX_MTU))
 		return;
 
-	dst_confirm(dst);
+	if (iph) {
+		daddr = &iph->daddr;
+		saddr = &iph->saddr;
+	} else if (sk) {
+		daddr = &sk->sk_v6_daddr;
+		saddr = &inet6_sk(sk)->saddr;
+	} else {
+		daddr = NULL;
+		saddr = NULL;
+	}
+	dst_confirm_neigh(dst, daddr);
 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
 	if (mtu >= dst_mtu(dst))
 		return;
 
 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
 		rt6_do_update_pmtu(rt6, mtu);
-	} else {
-		const struct in6_addr *daddr, *saddr;
+	} else if (daddr) {
 		struct rt6_info *nrt6;
 
-		if (iph) {
-			daddr = &iph->daddr;
-			saddr = &iph->saddr;
-		} else if (sk) {
-			daddr = &sk->sk_v6_daddr;
-			saddr = &inet6_sk(sk)->saddr;
-		} else {
-			return;
-		}
 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
 		if (nrt6) {
 			rt6_do_update_pmtu(nrt6, mtu);
@@ -1897,7 +1920,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
 	if (cfg->fc_encap) {
 		struct lwtunnel_state *lwtstate;
 
-		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+		err = lwtunnel_build_state(cfg->fc_encap_type,
 					   cfg->fc_encap, AF_INET6, cfg,
 					   &lwtstate);
 		if (err)
@@ -2143,6 +2166,54 @@ int ip6_del_rt(struct rt6_info *rt)
 	return __ip6_del_rt(rt, &info);
 }
 
+static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
+{
+	struct nl_info *info = &cfg->fc_nlinfo;
+	struct sk_buff *skb = NULL;
+	struct fib6_table *table;
+	int err;
+
+	table = rt->rt6i_table;
+	write_lock_bh(&table->tb6_lock);
+
+	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
+		struct rt6_info *sibling, *next_sibling;
+
+		/* prefer to send a single notification with all hops */
+		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+		if (skb) {
+			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+
+			if (rt6_fill_node(info->nl_net, skb, rt,
+					  NULL, NULL, 0, RTM_DELROUTE,
+					  info->portid, seq, 0) < 0) {
+				kfree_skb(skb);
+				skb = NULL;
+			} else
+				info->skip_notify = 1;
+		}
+
+		list_for_each_entry_safe(sibling, next_sibling,
+					 &rt->rt6i_siblings,
+					 rt6i_siblings) {
+			err = fib6_del(sibling, info);
+			if (err)
+				goto out;
+		}
+	}
+
+	err = fib6_del(rt, info);
+out:
+	write_unlock_bh(&table->tb6_lock);
+	ip6_rt_put(rt);
+
+	if (skb) {
+		rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV6_ROUTE,
+			    info->nlh, gfp_any());
+	}
+	return err;
+}
+
 static int ip6_route_del(struct fib6_config *cfg)
 {
 	struct fib6_table *table;
@@ -2179,7 +2250,11 @@ static int ip6_route_del(struct fib6_config *cfg)
 			dst_hold(&rt->dst);
 			read_unlock_bh(&table->tb6_lock);
 
-			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+			/* if gateway was specified only delete the one hop */
+			if (cfg->fc_flags & RTF_GATEWAY)
+				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+
+			return __ip6_del_rt_siblings(rt, cfg);
 		}
 	}
 	read_unlock_bh(&table->tb6_lock);
@@ -2258,7 +2333,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	 * Look, redirects are sent only in response to data packets,
 	 * so that this nexthop apparently is reachable. --ANK
 	 */
-	dst_confirm(&rt->dst);
+	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
 
 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
 	if (!neigh)
@@ -2634,6 +2709,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 	rt->dst.output = ip6_output;
 	rt->rt6i_idev = idev;
 
+	rt->rt6i_protocol = RTPROT_KERNEL;
 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
 	if (anycast)
 		rt->rt6i_flags |= RTF_ANYCAST;
@@ -2711,13 +2787,16 @@ struct arg_dev_net {
 	struct net *net;
 };
 
+/* called with write lock held for table with rt */
 static int fib6_ifdown(struct rt6_info *rt, void *arg)
 {
 	const struct arg_dev_net *adn = arg;
 	const struct net_device *dev = adn->dev;
 
 	if ((rt->dst.dev == dev || !dev) &&
-	    rt != adn->net->ipv6.ip6_null_entry)
+	    rt != adn->net->ipv6.ip6_null_entry &&
+	    (rt->rt6i_nsiblings == 0 ||
+	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
 		return -1;
 
 	return 0;
@@ -2948,7 +3027,7 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
 	struct rt6_nh *nh;
 
 	list_for_each_entry(nh, rt6_nh_list, next) {
-		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
+		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
 		        nh->r_cfg.fc_ifindex);
 	}
@@ -2987,13 +3066,37 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list,
 	return 0;
 }
 
+static void ip6_route_mpath_notify(struct rt6_info *rt,
+				   struct rt6_info *rt_last,
+				   struct nl_info *info,
+				   __u16 nlflags)
+{
+	/* if this is an APPEND route, then rt points to the first route
+	 * inserted and rt_last points to last route inserted. Userspace
+	 * wants a consistent dump of the route which starts at the first
+	 * nexthop. Since sibling routes are always added at the end of
+	 * the list, find the first sibling of the last route appended
+	 */
+	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
+		rt = list_first_entry(&rt_last->rt6i_siblings,
+				      struct rt6_info,
+				      rt6i_siblings);
+	}
+
+	if (rt)
+		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+}
+
 static int ip6_route_multipath_add(struct fib6_config *cfg)
 {
+	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
+	struct nl_info *info = &cfg->fc_nlinfo;
 	struct fib6_config r_cfg;
 	struct rtnexthop *rtnh;
 	struct rt6_info *rt;
 	struct rt6_nh *err_nh;
 	struct rt6_nh *nh, *nh_safe;
+	__u16 nlflags;
 	int remaining;
 	int attrlen;
 	int err = 1;
@@ -3002,6 +3105,10 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
 	LIST_HEAD(rt6_nh_list);
 
+	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
+	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
+		nlflags |= NLM_F_APPEND;
+
 	remaining = cfg->fc_mp_len;
 	rtnh = (struct rtnexthop *)cfg->fc_mp;
 
@@ -3044,9 +3151,20 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
 		rtnh = rtnh_next(rtnh, &remaining);
 	}
 
+	/* for add and replace send one notification with all nexthops.
+	 * Skip the notification in fib6_add_rt2node and send one with
+	 * the full route when done
+	 */
+	info->skip_notify = 1;
+
 	err_nh = NULL;
 	list_for_each_entry(nh, &rt6_nh_list, next) {
-		err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
+		rt_last = nh->rt6_info;
+		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc);
+		/* save reference to first route for notification */
+		if (!rt_notif && !err)
+			rt_notif = nh->rt6_info;
+
 		/* nh->rt6_info is used or freed at this point, reset to NULL*/
 		nh->rt6_info = NULL;
 		if (err) {
@@ -3068,9 +3186,18 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
 		nhn++;
 	}
 
+	/* success ... tell user about new route */
+	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
 	goto cleanup;
 
 add_errout:
+	/* send notification for routes that were added so that
+	 * the delete notifications sent by ip6_route_del are
+	 * coherent
+	 */
+	if (rt_notif)
+		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
+
 	/* Delete routes that were already added */
 	list_for_each_entry(nh, &rt6_nh_list, next) {
 		if (err_nh == nh)
@@ -3138,8 +3265,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	if (cfg.fc_mp)
 		return ip6_route_multipath_del(&cfg);
-	else
+	else {
+		cfg.fc_delete_all_nh = 1;
 		return ip6_route_del(&cfg);
+	}
 }
 
 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -3157,8 +3286,20 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return ip6_route_add(&cfg);
 }
 
-static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
+static size_t rt6_nlmsg_size(struct rt6_info *rt)
 {
+	int nexthop_len = 0;
+
+	if (rt->rt6i_nsiblings) {
+		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
+			    + NLA_ALIGN(sizeof(struct rtnexthop))
+			    + nla_total_size(16) /* RTA_GATEWAY */
+			    + nla_total_size(4)  /* RTA_OIF */
+			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
+
+		nexthop_len *= rt->rt6i_nsiblings;
+	}
+
 	return NLMSG_ALIGN(sizeof(struct rtmsg))
 	       + nla_total_size(16) /* RTA_SRC */
 	       + nla_total_size(16) /* RTA_DST */
@@ -3172,14 +3313,69 @@ static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
 	       + nla_total_size(sizeof(struct rta_cacheinfo))
 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
 	       + nla_total_size(1) /* RTA_PREF */
-	       + lwtunnel_get_encap_size(rt->dst.lwtstate);
+	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
+	       + nexthop_len;
+}
+
+static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
+			    unsigned int *flags)
+{
+	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
+		*flags |= RTNH_F_LINKDOWN;
+		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+			*flags |= RTNH_F_DEAD;
+	}
+
+	if (rt->rt6i_flags & RTF_GATEWAY) {
+		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
+			goto nla_put_failure;
+	}
+
+	if (rt->dst.dev &&
+	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+		goto nla_put_failure;
+
+	if (rt->dst.lwtstate &&
+	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
+{
+	struct rtnexthop *rtnh;
+	unsigned int flags = 0;
+
+	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+	if (!rtnh)
+		goto nla_put_failure;
+
+	rtnh->rtnh_hops = 0;
+	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
+
+	if (rt6_nexthop_info(skb, rt, &flags) < 0)
+		goto nla_put_failure;
+
+	rtnh->rtnh_flags = flags;
+
+	/* length of rtnetlink header + attributes */
+	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
 }
 
 static int rt6_fill_node(struct net *net,
 			 struct sk_buff *skb, struct rt6_info *rt,
 			 struct in6_addr *dst, struct in6_addr *src,
 			 int iif, int type, u32 portid, u32 seq,
-			 int prefix, int nowait, unsigned int flags)
+			 unsigned int flags)
 {
 	u32 metrics[RTAX_MAX];
 	struct rtmsg *rtm;
@@ -3187,13 +3383,6 @@ static int rt6_fill_node(struct net *net,
 	long expires;
 	u32 table;
 
-	if (prefix) {	/* user wants prefix routes only */
-		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
-			/* success since this is not a prefix route */
-			return 1;
-		}
-	}
-
 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -3233,11 +3422,6 @@ static int rt6_fill_node(struct net *net,
 	else
 		rtm->rtm_type = RTN_UNICAST;
 	rtm->rtm_flags = 0;
-	if (!netif_carrier_ok(rt->dst.dev)) {
-		rtm->rtm_flags |= RTNH_F_LINKDOWN;
-		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
-			rtm->rtm_flags |= RTNH_F_DEAD;
-	}
 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
 	rtm->rtm_protocol = rt->rt6i_protocol;
 	if (rt->rt6i_flags & RTF_DYNAMIC)
@@ -3271,19 +3455,12 @@ static int rt6_fill_node(struct net *net,
 	if (iif) {
 #ifdef CONFIG_IPV6_MROUTE
 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
-			int err = ip6mr_get_route(net, skb, rtm, nowait,
-						  portid);
-
-			if (err <= 0) {
-				if (!nowait) {
-					if (err == 0)
-						return 0;
-					goto nla_put_failure;
-				} else {
-					if (err == -EMSGSIZE)
-						goto nla_put_failure;
-				}
-			}
+			int err = ip6mr_get_route(net, skb, rtm, portid);
+
+			if (err == 0)
+				return 0;
+			if (err < 0)
+				goto nla_put_failure;
 		} else
 #endif
 			if (nla_put_u32(skb, RTA_IIF, iif))
@@ -3308,17 +3485,35 @@ static int rt6_fill_node(struct net *net,
 	if (rtnetlink_put_metrics(skb, metrics) < 0)
 		goto nla_put_failure;
 
-	if (rt->rt6i_flags & RTF_GATEWAY) {
-		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
-			goto nla_put_failure;
-	}
-
-	if (rt->dst.dev &&
-	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
-		goto nla_put_failure;
 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
 		goto nla_put_failure;
 
+	/* For multipath routes, walk the siblings list and add
+	 * each as a nexthop within RTA_MULTIPATH.
+	 */
+	if (rt->rt6i_nsiblings) {
+		struct rt6_info *sibling, *next_sibling;
+		struct nlattr *mp;
+
+		mp = nla_nest_start(skb, RTA_MULTIPATH);
+		if (!mp)
+			goto nla_put_failure;
+
+		if (rt6_add_nexthop(skb, rt) < 0)
+			goto nla_put_failure;
+
+		list_for_each_entry_safe(sibling, next_sibling,
+					 &rt->rt6i_siblings, rt6i_siblings) {
+			if (rt6_add_nexthop(skb, sibling) < 0)
+				goto nla_put_failure;
+		}
+
+		nla_nest_end(skb, mp);
+	} else {
+		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags) < 0)
+			goto nla_put_failure;
+	}
+
 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
 
 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
@@ -3327,8 +3522,6 @@ static int rt6_fill_node(struct net *net,
 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
 		goto nla_put_failure;
 
-	if (lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
-		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
 	return 0;
@@ -3341,18 +3534,26 @@ nla_put_failure:
 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
-	int prefix;
+	struct net *net = arg->net;
+
+	if (rt == net->ipv6.ip6_null_entry)
+		return 0;
 
 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
-		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
-	} else
-		prefix = 0;
 
-	return rt6_fill_node(arg->net,
+		/* user wants prefix routes only */
+		if (rtm->rtm_flags & RTM_F_PREFIX &&
+		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
+			/* success since this is not a prefix route */
+			return 1;
+		}
+	}
+
+	return rt6_fill_node(net,
 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
-		     prefix, 0, NLM_F_MULTI);
+		     NLM_F_MULTI);
 }
 
 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
@@ -3433,17 +3634,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 		goto errout;
 	}
 
-	/* Reserve room for dummy headers, this skb can pass
-	   through good chunk of routing engine.
-	 */
-	skb_reset_mac_header(skb);
-	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
-
 	skb_dst_set(skb, &rt->dst);
 
 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
 			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
-			    nlh->nlmsg_seq, 0, 0, 0);
+			    nlh->nlmsg_seq, 0);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto errout;
@@ -3470,7 +3665,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
 		goto errout;
 
 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
-				event, info->portid, seq, 0, 0, nlm_flags);
+				event, info->portid, seq, nlm_flags);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 03a064803626..f950cb53d5e3 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -45,7 +45,7 @@
 #include <net/seg6_hmac.h>
 #include <linux/random.h>
 
-static char * __percpu *hmac_ring;
+static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring);
 
 static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
 {
@@ -174,7 +174,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
 	 * hash function (RadioGatun) with up to 1216 bits
 	 */
 
-	/* saddr(16) + first_seg(1) + cleanup(1) + keyid(4) + seglist(16n) */
+	/* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */
 	plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16;
 
 	/* this limit allows for 14 segments */
@@ -186,13 +186,13 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
 	 *
 	 * 1. Source IPv6 address (128 bits)
 	 * 2. first_segment value (8 bits)
-	 * 3. cleanup flag (8 bits: highest bit is cleanup value, others are 0)
+	 * 3. Flags (8 bits)
 	 * 4. HMAC Key ID (32 bits)
 	 * 5. All segments in the segments list (n * 128 bits)
 	 */
 
 	local_bh_disable();
-	ring = *this_cpu_ptr(hmac_ring);
+	ring = this_cpu_ptr(hmac_ring);
 	off = ring;
 
 	/* source address */
@@ -202,8 +202,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
 	/* first_segment value */
 	*off++ = hdr->first_segment;
 
-	/* cleanup flag */
-	*off++ = !!(sr_has_cleanup(hdr)) << 7;
+	/* flags */
+	*off++ = hdr->flags;
 
 	/* HMAC Key ID */
 	memcpy(off, &hmackeyid, 4);
@@ -353,27 +353,6 @@ out:
 }
 EXPORT_SYMBOL(seg6_push_hmac);
 
-static int seg6_hmac_init_ring(void)
-{
-	int i;
-
-	hmac_ring = alloc_percpu(char *);
-
-	if (!hmac_ring)
-		return -ENOMEM;
-
-	for_each_possible_cpu(i) {
-		char *ring = kzalloc(SEG6_HMAC_RING_SIZE, GFP_KERNEL);
-
-		if (!ring)
-			return -ENOMEM;
-
-		*per_cpu_ptr(hmac_ring, i) = ring;
-	}
-
-	return 0;
-}
-
 static int seg6_hmac_init_algo(void)
 {
 	struct seg6_hmac_algo *algo;
@@ -410,7 +389,8 @@ static int seg6_hmac_init_algo(void)
 			return -ENOMEM;
 
 		for_each_possible_cpu(cpu) {
-			shash = kzalloc(shsize, GFP_KERNEL);
+			shash = kzalloc_node(shsize, GFP_KERNEL,
+					     cpu_to_node(cpu));
 			if (!shash)
 				return -ENOMEM;
 			*per_cpu_ptr(algo->shashs, cpu) = shash;
@@ -422,16 +402,7 @@ static int seg6_hmac_init_algo(void)
 
 int __init seg6_hmac_init(void)
 {
-	int ret;
-
-	ret = seg6_hmac_init_ring();
-	if (ret < 0)
-		goto out;
-
-	ret = seg6_hmac_init_algo();
-
-out:
-	return ret;
+	return seg6_hmac_init_algo();
 }
 EXPORT_SYMBOL(seg6_hmac_init);
 
@@ -450,13 +421,6 @@ void seg6_hmac_exit(void)
 	struct seg6_hmac_algo *algo = NULL;
 	int i, alg_count, cpu;
 
-	for_each_possible_cpu(i) {
-		char *ring = *per_cpu_ptr(hmac_ring, i);
-
-		kfree(ring);
-	}
-	free_percpu(hmac_ring);
-
 	alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
 	for (i = 0; i < alg_count; i++) {
 		algo = &hmac_algos[i];
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index c46f8cbf5ab5..85582257d3af 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -55,8 +55,8 @@ static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = {
 	[SEG6_IPTUNNEL_SRH]	= { .type = NLA_BINARY },
 };
 
-int nla_put_srh(struct sk_buff *skb, int attrtype,
-		struct seg6_iptunnel_encap *tuninfo)
+static int nla_put_srh(struct sk_buff *skb, int attrtype,
+		       struct seg6_iptunnel_encap *tuninfo)
 {
 	struct seg6_iptunnel_encap *data;
 	struct nlattr *nla;
@@ -235,7 +235,7 @@ static int seg6_do_srh(struct sk_buff *skb)
 	return 0;
 }
 
-int seg6_input(struct sk_buff *skb)
+static int seg6_input(struct sk_buff *skb)
 {
 	int err;
 
@@ -251,7 +251,7 @@ int seg6_input(struct sk_buff *skb)
 	return dst_input(skb);
 }
 
-int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *orig_dst = skb_dst(skb);
 	struct dst_entry *dst = NULL;
@@ -303,7 +303,7 @@ drop:
 	return err;
 }
 
-static int seg6_build_state(struct net_device *dev, struct nlattr *nla,
+static int seg6_build_state(struct nlattr *nla,
 			    unsigned int family, const void *cfg,
 			    struct lwtunnel_state **ts)
 {
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index fad992ad4bc8..99853c6e33a8 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1380,6 +1380,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
 	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
 	if (err) {
 		free_percpu(dev->tstats);
+		dev->tstats = NULL;
 		return err;
 	}
 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index a4d49760bf43..895ff650db43 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -16,7 +16,7 @@
 
 #include <linux/tcp.h>
 #include <linux/random.h>
-#include <linux/cryptohash.h>
+#include <linux/siphash.h>
 #include <linux/kernel.h>
 #include <net/ipv6.h>
 #include <net/tcp.h>
@@ -24,7 +24,7 @@
 #define COOKIEBITS 24	/* Upper bits store count */
 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
 
-static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+static siphash_key_t syncookie6_secret[2] __read_mostly;
 
 /* RFC 2460, Section 8.3:
  * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
@@ -41,30 +41,27 @@ static __u16 const msstab[] = {
 	9000 - 60,
 };
 
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch);
-
-static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
+static u32 cookie_hash(const struct in6_addr *saddr,
+		       const struct in6_addr *daddr,
 		       __be16 sport, __be16 dport, u32 count, int c)
 {
-	__u32 *tmp;
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		u32 count;
+		__be16 sport;
+		__be16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *saddr,
+		.daddr = *daddr,
+		.count = count,
+		.sport = sport,
+		.dport = dport
+	};
 
 	net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret));
-
-	tmp  = this_cpu_ptr(ipv6_cookie_scratch);
-
-	/*
-	 * we have 320 bits of information to hash, copy in the remaining
-	 * 192 bits required for sha_transform, from the syncookie6_secret
-	 * and overwrite the digest with the secret
-	 */
-	memcpy(tmp + 10, syncookie6_secret[c], 44);
-	memcpy(tmp, saddr, 16);
-	memcpy(tmp + 4, daddr, 16);
-	tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
-	tmp[9] = count;
-	sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
-
-	return tmp[17];
+	return siphash(&combined, offsetofend(typeof(combined), dport),
+		       &syncookie6_secret[c]);
 }
 
 static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index cb8929681dc7..60a5295a7de6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -122,7 +122,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	struct flowi6 fl6;
 	struct dst_entry *dst;
 	int addr_type;
+	u32 seq;
 	int err;
+	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
 	if (addr_len < SIN6_LEN_RFC2133)
 		return -EINVAL;
@@ -148,8 +150,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	 *	connect() to INADDR_ANY means loopback (BSD'ism).
 	 */
 
-	if (ipv6_addr_any(&usin->sin6_addr))
-		usin->sin6_addr.s6_addr[15] = 0x1;
+	if (ipv6_addr_any(&usin->sin6_addr)) {
+		if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+			ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+					       &usin->sin6_addr);
+		else
+			usin->sin6_addr = in6addr_loopback;
+	}
 
 	addr_type = ipv6_addr_type(&usin->sin6_addr);
 
@@ -188,7 +195,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	 *	TCP over IPv4
 	 */
 
-	if (addr_type == IPV6_ADDR_MAPPED) {
+	if (addr_type & IPV6_ADDR_MAPPED) {
 		u32 exthdrlen = icsk->icsk_ext_hdr_len;
 		struct sockaddr_in sin;
 
@@ -258,7 +265,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	sk->sk_gso_type = SKB_GSO_TCPV6;
 	ip6_dst_store(sk, dst, NULL, NULL);
 
-	if (tcp_death_row.sysctl_tw_recycle &&
+	if (tcp_death_row->sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp &&
 	    ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr))
 		tcp_fetch_timewait_stamp(sk, dst);
@@ -273,18 +280,26 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	inet->inet_dport = usin->sin6_port;
 
 	tcp_set_state(sk, TCP_SYN_SENT);
-	err = inet6_hash_connect(&tcp_death_row, sk);
+	err = inet6_hash_connect(tcp_death_row, sk);
 	if (err)
 		goto late_failure;
 
 	sk_set_txhash(sk);
 
-	if (!tp->write_seq && likely(!tp->repair))
-		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
-							     sk->sk_v6_daddr.s6_addr32,
-							     inet->inet_sport,
-							     inet->inet_dport,
-							     &tp->tsoffset);
+	if (likely(!tp->repair)) {
+		seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
+						   sk->sk_v6_daddr.s6_addr32,
+						   inet->inet_sport,
+						   inet->inet_dport,
+						   &tp->tsoffset);
+		if (!tp->write_seq)
+			tp->write_seq = seq;
+	}
+
+	if (tcp_fastopen_defer_connect(sk, &err))
+		return err;
+	if (err)
+		goto late_failure;
 
 	err = tcp_connect(sk);
 	if (err)
@@ -294,7 +309,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 late_failure:
 	tcp_set_state(sk, TCP_CLOSE);
-	__sk_dst_reset(sk);
 failure:
 	inet->inet_dport = 0;
 	sk->sk_route_caps = 0;
@@ -991,6 +1005,16 @@ drop:
 	return 0; /* don't send reset */
 }
 
+static void tcp_v6_restore_cb(struct sk_buff *skb)
+{
+	/* We need to move header back to the beginning if xfrm6_policy_check()
+	 * and tcp_v6_fill_cb() are going to be called again.
+	 * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
+	 */
+	memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
+		sizeof(struct inet6_skb_parm));
+}
+
 static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 					 struct request_sock *req,
 					 struct dst_entry *dst,
@@ -1142,10 +1166,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 	tcp_ca_openreq_child(newsk, dst);
 
 	tcp_sync_mss(newsk, dst_mtu(dst));
-	newtp->advmss = dst_metric_advmss(dst);
-	if (tcp_sk(sk)->rx_opt.user_mss &&
-	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
-		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(newsk);
 
@@ -1182,8 +1203,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 						      sk_gfp_mask(sk, GFP_ATOMIC));
 			consume_skb(ireq->pktopts);
 			ireq->pktopts = NULL;
-			if (newnp->pktoptions)
+			if (newnp->pktoptions) {
+				tcp_v6_restore_cb(newnp->pktoptions);
 				skb_set_owner_r(newnp->pktoptions, newsk);
+			}
 		}
 	}
 
@@ -1198,16 +1221,6 @@ out:
 	return NULL;
 }
 
-static void tcp_v6_restore_cb(struct sk_buff *skb)
-{
-	/* We need to move header back to the beginning if xfrm6_policy_check()
-	 * and tcp_v6_fill_cb() are going to be called again.
-	 * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
-	 */
-	memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
-		sizeof(struct inet6_skb_parm));
-}
-
 /* The socket must have it's spinlock held when we get
  * here, unless it is a TCP_LISTEN socket.
  *
@@ -1620,7 +1633,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
-	.bind_conflict	   = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
@@ -1651,7 +1663,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
-	.bind_conflict	   = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
@@ -1744,7 +1755,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	srcp  = ntohs(inet->inet_sport);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active	= 1;
 		timer_expires	= icsk->icsk_timeout;
@@ -1888,6 +1899,7 @@ struct proto tcpv6_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
@@ -1948,7 +1960,7 @@ static void __net_exit tcpv6_net_exit(struct net *net)
 
 static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
 {
-	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6);
+	inet_twsk_purge(&tcp_hashinfo, AF_INET6);
 }
 
 static struct pernet_operations tcpv6_net_ops = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4d5c4eee4b3f..4e4c401e3bc6 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -55,6 +55,16 @@
 #include <trace/events/skb.h>
 #include "udp_impl.h"
 
+static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
+{
+#if defined(CONFIG_NET_L3_MASTER_DEV)
+	if (!net->ipv4.sysctl_udp_l3mdev_accept &&
+	    skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
+		return true;
+#endif
+	return false;
+}
+
 static u32 udp6_ehashfn(const struct net *net,
 			const struct in6_addr *laddr,
 			const u16 lport,
@@ -103,7 +113,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
 
 	/* precompute partial secondary hash */
 	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
-	return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr);
+	return udp_lib_get_port(sk, snum, hash2_nulladdr);
 }
 
 static void udp_v6_rehash(struct sock *sk)
@@ -118,7 +128,7 @@ static void udp_v6_rehash(struct sock *sk)
 static int compute_score(struct sock *sk, struct net *net,
 			 const struct in6_addr *saddr, __be16 sport,
 			 const struct in6_addr *daddr, unsigned short hnum,
-			 int dif)
+			 int dif, bool exact_dif)
 {
 	int score;
 	struct inet_sock *inet;
@@ -149,7 +159,7 @@ static int compute_score(struct sock *sk, struct net *net,
 		score++;
 	}
 
-	if (sk->sk_bound_dev_if) {
+	if (sk->sk_bound_dev_if || exact_dif) {
 		if (sk->sk_bound_dev_if != dif)
 			return -1;
 		score++;
@@ -165,7 +175,7 @@ static int compute_score(struct sock *sk, struct net *net,
 static struct sock *udp6_lib_lookup2(struct net *net,
 		const struct in6_addr *saddr, __be16 sport,
 		const struct in6_addr *daddr, unsigned int hnum, int dif,
-		struct udp_hslot *hslot2,
+		bool exact_dif, struct udp_hslot *hslot2,
 		struct sk_buff *skb)
 {
 	struct sock *sk, *result;
@@ -176,7 +186,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 	badness = -1;
 	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 		score = compute_score(sk, net, saddr, sport,
-				      daddr, hnum, dif);
+				      daddr, hnum, dif, exact_dif);
 		if (score > badness) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
@@ -212,6 +222,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+	bool exact_dif = udp6_lib_exact_dif_match(net, skb);
 	int score, badness, matches = 0, reuseport = 0;
 	u32 hash = 0;
 
@@ -223,7 +234,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 			goto begin;
 
 		result = udp6_lib_lookup2(net, saddr, sport,
-					  daddr, hnum, dif,
+					  daddr, hnum, dif, exact_dif,
 					  hslot2, skb);
 		if (!result) {
 			unsigned int old_slot2 = slot2;
@@ -239,7 +250,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
 
 			result = udp6_lib_lookup2(net, saddr, sport,
 						  daddr, hnum, dif,
-						  hslot2, skb);
+						  exact_dif, hslot2,
+						  skb);
 		}
 		return result;
 	}
@@ -247,7 +259,8 @@ begin:
 	result = NULL;
 	badness = -1;
 	sk_for_each_rcu(sk, &hslot->head) {
-		score = compute_score(sk, net, saddr, sport, daddr, hnum, dif);
+		score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
+				      exact_dif);
 		if (score > badness) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
@@ -441,7 +454,7 @@ try_again:
 	return err;
 
 csum_copy_err:
-	if (!__sk_queue_drop_skb(sk, skb, flags)) {
+	if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
 		if (is_udp4) {
 			UDP_INC_STATS(sock_net(sk),
 				      UDP_MIB_CSUMERRORS, is_udplite);
@@ -1033,6 +1046,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			if (addr_len < SIN6_LEN_RFC2133)
 				return -EINVAL;
 			daddr = &sin6->sin6_addr;
+			if (ipv6_addr_any(daddr) &&
+			    ipv6_addr_v4mapped(&np->saddr))
+				ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+						       daddr);
 			break;
 		case AF_INET:
 			goto do_udp_sendmsg;
@@ -1295,7 +1312,8 @@ out:
 	return err;
 
 do_confirm:
-	dst_confirm(dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(dst, &fl6.daddr);
 	if (!(msg->msg_flags&MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index b5789562aded..08a807b29298 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -33,6 +33,8 @@ EXPORT_SYMBOL(xfrm6_rcv_spi);
 
 int xfrm6_transport_finish(struct sk_buff *skb, int async)
 {
+	struct xfrm_offload *xo = xfrm_offload(skb);
+
 	skb_network_header(skb)[IP6CB(skb)->nhoff] =
 		XFRM_MODE_SKB_CB(skb)->protocol;
 
@@ -44,6 +46,11 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
 	ipv6_hdr(skb)->payload_len = htons(skb->len);
 	__skb_push(skb, skb->data - skb_network_header(skb));
 
+	if (xo && (xo->flags & XFRM_GRO)) {
+		skb_mac_header_rebuild(skb);
+		return -1;
+	}
+
 	NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
 		dev_net(skb->dev), NULL, skb, skb->dev, NULL,
 		ip6_rcv_finish);
@@ -69,18 +76,9 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
 	struct xfrm_state *x = NULL;
 	int i = 0;
 
-	/* Allocate new secpath or COW existing one. */
-	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
-		struct sec_path *sp;
-
-		sp = secpath_dup(skb->sp);
-		if (!sp) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
-			goto drop;
-		}
-		if (skb->sp)
-			secpath_put(skb->sp);
-		skb->sp = sp;
+	if (secpath_set(skb)) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
+		goto drop;
 	}
 
 	if (1 + skb->sp->len == XFRM_MAX_DEPTH) {
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
index 4e344105b3fd..4439ee44c8b0 100644
--- a/net/ipv6/xfrm6_mode_transport.c
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -47,6 +47,7 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
 static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int ihl = skb->data - skb_transport_header(skb);
+	struct xfrm_offload *xo = xfrm_offload(skb);
 
 	if (skb->transport_header != skb->network_header) {
 		memmove(skb_transport_header(skb),
@@ -55,7 +56,8 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 	}
 	ipv6_hdr(skb)->payload_len = htons(skb->len + ihl -
 					   sizeof(struct ipv6hdr));
-	skb_reset_transport_header(skb);
+	if (!xo || !(xo->flags & XFRM_GRO))
+		skb_reset_transport_header(skb);
 	return 0;
 }
 
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index e0f71c01d728..79651bc71bf0 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -25,8 +25,6 @@
 #include <net/mip6.h>
 #endif
 
-static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
-
 static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
 					  const xfrm_address_t *saddr,
 					  const xfrm_address_t *daddr)
@@ -220,7 +218,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
 {
 	struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
 
-	xfrm6_policy_afinfo.garbage_collect(net);
+	xfrm_garbage_collect_deferred(net);
 	return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
 }
 
@@ -291,8 +289,7 @@ static struct dst_ops xfrm6_dst_ops_template = {
 	.gc_thresh =		INT_MAX,
 };
 
-static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
-	.family =		AF_INET6,
+static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 	.dst_ops =		&xfrm6_dst_ops_template,
 	.dst_lookup =		xfrm6_dst_lookup,
 	.get_saddr =		xfrm6_get_saddr,
@@ -305,7 +302,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 
 static int __init xfrm6_policy_init(void)
 {
-	return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo);
+	return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo, AF_INET6);
 }
 
 static void xfrm6_policy_fini(void)
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index 54d13f8dbbae..b2dc8ce49378 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -162,9 +162,8 @@ static const struct inet6_protocol ipcomp6_protocol = {
 	.flags		=	INET6_PROTO_NOPOLICY,
 };
 
-static struct xfrm_input_afinfo xfrm6_input_afinfo = {
+static const struct xfrm_input_afinfo xfrm6_input_afinfo = {
 	.family		=	AF_INET6,
-	.owner		=	THIS_MODULE,
 	.callback	=	xfrm6_rcv_cb,
 };
 
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index 1215693fdd22..35dbf3dc3d28 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -51,7 +51,7 @@ irnet_ctrl_write(irnet_socket *	ap,
   char *	next;		/* Next command to process */
   int		length;		/* Length of current command */
 
-  DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count);
+  DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count);
 
   /* Check for overflow... */
   DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM,
@@ -66,7 +66,7 @@ irnet_ctrl_write(irnet_socket *	ap,
 
   /* Safe terminate the string */
   command[count] = '\0';
-  DEBUG(CTRL_INFO, "Command line received is ``%s'' (%Zd).\n",
+  DEBUG(CTRL_INFO, "Command line received is ``%s'' (%zd).\n",
 	command, count);
 
   /* Check every commands in the command line */
@@ -285,7 +285,7 @@ irnet_ctrl_read(irnet_socket *	ap,
   char		event[75];
   ssize_t	ret = 0;
 
-  DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count);
+  DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count);
 
 #ifdef INITIAL_DISCOVERY
   /* Check if we have read the log */
@@ -328,7 +328,7 @@ irnet_ctrl_read(irnet_socket *	ap,
   if(ret != 0)
     {
       /* No, return the error code */
-      DEXIT(CTRL_TRACE, " - ret %Zd\n", ret);
+      DEXIT(CTRL_TRACE, " - ret %zd\n", ret);
       return ret;
     }
 
@@ -568,7 +568,7 @@ dev_irnet_write(struct file *	file,
 {
   irnet_socket *	ap = file->private_data;
 
-  DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
+  DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n",
 	file, ap, count);
   DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
 
@@ -592,7 +592,7 @@ dev_irnet_read(struct file *	file,
 {
   irnet_socket *	ap = file->private_data;
 
-  DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
+  DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n",
 	file, ap, count);
   DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
 
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
index acbe61c7e683..160dc89335e2 100644
--- a/net/irda/irqueue.c
+++ b/net/irda/irqueue.c
@@ -383,9 +383,6 @@ EXPORT_SYMBOL(hashbin_new);
  *    for deallocating this structure if it's complex. If not the user can
  *    just supply kfree, which should take care of the job.
  */
-#ifdef CONFIG_LOCKDEP
-static int hashbin_lock_depth = 0;
-#endif
 int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
 {
 	irda_queue_t* queue;
@@ -396,22 +393,27 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
 	IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;);
 
 	/* Synchronize */
-	if ( hashbin->hb_type & HB_LOCK ) {
-		spin_lock_irqsave_nested(&hashbin->hb_spinlock, flags,
-					 hashbin_lock_depth++);
-	}
+	if (hashbin->hb_type & HB_LOCK)
+		spin_lock_irqsave(&hashbin->hb_spinlock, flags);
 
 	/*
 	 *  Free the entries in the hashbin, TODO: use hashbin_clear when
 	 *  it has been shown to work
 	 */
 	for (i = 0; i < HASHBIN_SIZE; i ++ ) {
-		queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]);
-		while (queue ) {
-			if (free_func)
-				(*free_func)(queue);
-			queue = dequeue_first(
-				(irda_queue_t**) &hashbin->hb_queue[i]);
+		while (1) {
+			queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]);
+
+			if (!queue)
+				break;
+
+			if (free_func) {
+				if (hashbin->hb_type & HB_LOCK)
+					spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+				free_func(queue);
+				if (hashbin->hb_type & HB_LOCK)
+					spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+			}
 		}
 	}
 
@@ -420,12 +422,8 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
 	hashbin->magic = ~HB_MAGIC;
 
 	/* Release lock */
-	if ( hashbin->hb_type & HB_LOCK) {
+	if (hashbin->hb_type & HB_LOCK)
 		spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
-#ifdef CONFIG_LOCKDEP
-		hashbin_lock_depth--;
-#endif
-	}
 
 	/*
 	 *  Free the hashbin structure
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 7e08a4d3d77d..a646f3481240 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -929,23 +929,25 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 			goto out_error;
 	}
 
-	/* New message, alloc head skb */
-	head = alloc_skb(0, sk->sk_allocation);
-	while (!head) {
-		kcm_push(kcm);
-		err = sk_stream_wait_memory(sk, &timeo);
-		if (err)
-			goto out_error;
-
+	if (msg_data_left(msg)) {
+		/* New message, alloc head skb */
 		head = alloc_skb(0, sk->sk_allocation);
-	}
+		while (!head) {
+			kcm_push(kcm);
+			err = sk_stream_wait_memory(sk, &timeo);
+			if (err)
+				goto out_error;
 
-	skb = head;
+			head = alloc_skb(0, sk->sk_allocation);
+		}
 
-	/* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
-	 * csum_and_copy_from_iter from skb_do_copy_data_nocache.
-	 */
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
+		skb = head;
+
+		/* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
+		 * csum_and_copy_from_iter from skb_do_copy_data_nocache.
+		 */
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
 
 start:
 	while (msg_data_left(msg)) {
@@ -1018,10 +1020,12 @@ wait_for_memory:
 	if (eor) {
 		bool not_busy = skb_queue_empty(&sk->sk_write_queue);
 
-		/* Message complete, queue it on send buffer */
-		__skb_queue_tail(&sk->sk_write_queue, head);
-		kcm->seq_skb = NULL;
-		KCM_STATS_INCR(kcm->stats.tx_msgs);
+		if (head) {
+			/* Message complete, queue it on send buffer */
+			__skb_queue_tail(&sk->sk_write_queue, head);
+			kcm->seq_skb = NULL;
+			KCM_STATS_INCR(kcm->stats.tx_msgs);
+		}
 
 		if (msg->msg_flags & MSG_BATCH) {
 			kcm->tx_wait_more = true;
@@ -1040,8 +1044,10 @@ wait_for_memory:
 	} else {
 		/* Message not complete, save state */
 partial_message:
-		kcm->seq_skb = head;
-		kcm_tx_msg(head)->last_skb = skb;
+		if (head) {
+			kcm->seq_skb = head;
+			kcm_tx_msg(head)->last_skb = skb;
+		}
 	}
 
 	KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 85948c69b236..8adab6335ced 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1058,10 +1058,10 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
 
 	/* Debug */
 	if (session->send_seq)
-		l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes, ns=%u\n",
+		l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes, ns=%u\n",
 			 session->name, data_len, session->ns - 1);
 	else
-		l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes\n",
+		l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes\n",
 			 session->name, data_len);
 
 	if (session->debug & L2TP_MSG_DATA) {
@@ -1317,6 +1317,9 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
 	struct sock *sk = NULL;
 
 	tunnel = container_of(work, struct l2tp_tunnel, del_work);
+
+	l2tp_tunnel_closeall(tunnel);
+
 	sk = l2tp_tunnel_sock_lookup(tunnel);
 	if (!sk)
 		goto out;
@@ -1639,7 +1642,6 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
 int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
 {
 	l2tp_tunnel_inc_refcount(tunnel);
-	l2tp_tunnel_closeall(tunnel);
 	if (false == queue_work(l2tp_wq, &tunnel->del_work)) {
 		l2tp_tunnel_dec_refcount(tunnel);
 		return 1;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 8f560f7140a0..aebf281d09ee 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -263,6 +263,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb,
 int l2tp_nl_register_ops(enum l2tp_pwtype pw_type,
 			 const struct l2tp_nl_cmd_ops *ops);
 void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
+int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 
 /* Session reference counts. Incremented when code obtains a reference
  * to a session.
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index e2c6ae024565..8bf18a5f66e0 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -106,8 +106,8 @@ static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
-static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
-						      struct rtnl_link_stats64 *stats)
+static void l2tp_eth_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct l2tp_eth *priv = netdev_priv(dev);
 
@@ -117,10 +117,8 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
 	stats->rx_bytes   = atomic_long_read(&priv->rx_bytes);
 	stats->rx_packets = atomic_long_read(&priv->rx_packets);
 	stats->rx_errors  = atomic_long_read(&priv->rx_errors);
-	return stats;
 }
 
-
 static const struct net_device_ops l2tp_eth_netdev_ops = {
 	.ndo_init		= l2tp_eth_dev_init,
 	.ndo_uninit		= l2tp_eth_dev_uninit,
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 3d73278b86ca..d25038cfd64e 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -11,6 +11,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <asm/ioctls.h>
 #include <linux/icmp.h>
 #include <linux/module.h>
 #include <linux/skbuff.h>
@@ -53,19 +54,26 @@ static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr,
 	struct sock *sk;
 
 	sk_for_each_bound(sk, &l2tp_ip_bind_table) {
-		struct inet_sock *inet = inet_sk(sk);
-		struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
+		const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
 
-		if (l2tp == NULL)
+		if (!net_eq(sock_net(sk), net))
 			continue;
 
-		if ((l2tp->conn_id == tunnel_id) &&
-		    net_eq(sock_net(sk), net) &&
-		    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
-		    (!inet->inet_daddr || !raddr || inet->inet_daddr == raddr) &&
-		    (!sk->sk_bound_dev_if || !dif ||
-		     sk->sk_bound_dev_if == dif))
-			goto found;
+		if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
+			continue;
+
+		if (inet->inet_rcv_saddr && laddr &&
+		    inet->inet_rcv_saddr != laddr)
+			continue;
+
+		if (inet->inet_daddr && raddr && inet->inet_daddr != raddr)
+			continue;
+
+		if (l2tp->conn_id != tunnel_id)
+			continue;
+
+		goto found;
 	}
 
 	sk = NULL;
@@ -258,7 +266,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (!sock_flag(sk, SOCK_ZAPPED))
 		goto out;
 
-	if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip))
+	if (sk->sk_state != TCP_CLOSE)
 		goto out;
 
 	chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr);
@@ -380,7 +388,7 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb)
 drop:
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
 	kfree_skb(skb);
-	return -1;
+	return 0;
 }
 
 /* Userspace will call sendmsg() on the tunnel socket to send L2TP
@@ -553,6 +561,30 @@ out:
 	return err ? err : copied;
 }
 
+int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	struct sk_buff *skb;
+	int amount;
+
+	switch (cmd) {
+	case SIOCOUTQ:
+		amount = sk_wmem_alloc_get(sk);
+		break;
+	case SIOCINQ:
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		skb = skb_peek(&sk->sk_receive_queue);
+		amount = skb ? skb->len : 0;
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+		break;
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return put_user(amount, (int __user *)arg);
+}
+EXPORT_SYMBOL(l2tp_ioctl);
+
 static struct proto l2tp_ip_prot = {
 	.name		   = "L2TP/IP",
 	.owner		   = THIS_MODULE,
@@ -561,7 +593,7 @@ static struct proto l2tp_ip_prot = {
 	.bind		   = l2tp_ip_bind,
 	.connect	   = l2tp_ip_connect,
 	.disconnect	   = l2tp_ip_disconnect,
-	.ioctl		   = udp_ioctl,
+	.ioctl		   = l2tp_ioctl,
 	.destroy	   = l2tp_ip_destroy_sock,
 	.setsockopt	   = ip_setsockopt,
 	.getsockopt	   = ip_getsockopt,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 331ccf5a7bad..a4abcbc4c09a 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -57,8 +57,8 @@ static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk)
 	return (struct l2tp_ip6_sock *)sk;
 }
 
-static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
-					   struct in6_addr *laddr,
+static struct sock *__l2tp_ip6_bind_lookup(const struct net *net,
+					   const struct in6_addr *laddr,
 					   const struct in6_addr *raddr,
 					   int dif, u32 tunnel_id)
 {
@@ -67,18 +67,26 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
 	sk_for_each_bound(sk, &l2tp_ip6_bind_table) {
 		const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk);
 		const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
-		struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
+		const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
 
-		if (l2tp == NULL)
+		if (!net_eq(sock_net(sk), net))
 			continue;
 
-		if ((l2tp->conn_id == tunnel_id) &&
-		    net_eq(sock_net(sk), net) &&
-		    (!sk_laddr || ipv6_addr_any(sk_laddr) || ipv6_addr_equal(sk_laddr, laddr)) &&
-		    (!raddr || ipv6_addr_any(sk_raddr) || ipv6_addr_equal(sk_raddr, raddr)) &&
-		    (!sk->sk_bound_dev_if || !dif ||
-		     sk->sk_bound_dev_if == dif))
-			goto found;
+		if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
+			continue;
+
+		if (sk_laddr && !ipv6_addr_any(sk_laddr) &&
+		    !ipv6_addr_any(laddr) && !ipv6_addr_equal(sk_laddr, laddr))
+			continue;
+
+		if (!ipv6_addr_any(sk_raddr) && raddr &&
+		    !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr))
+			continue;
+
+		if (l2tp->conn_id != tunnel_id)
+			continue;
+
+		goto found;
 	}
 
 	sk = NULL;
@@ -650,7 +658,8 @@ out:
 	return err < 0 ? err : len;
 
 do_confirm:
-	dst_confirm(dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(dst, &fl6.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
@@ -722,7 +731,7 @@ static struct proto l2tp_ip6_prot = {
 	.bind		   = l2tp_ip6_bind,
 	.connect	   = l2tp_ip6_connect,
 	.disconnect	   = l2tp_ip6_disconnect,
-	.ioctl		   = udp_ioctl,
+	.ioctl		   = l2tp_ioctl,
 	.destroy	   = l2tp_ip6_destroy_sock,
 	.setsockopt	   = ipv6_setsockopt,
 	.getsockopt	   = ipv6_getsockopt,
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 3e821daf9dd4..8bc5a1bd2d45 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -821,7 +821,10 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
 		 * another trick required to cope with how the PROCOM state
 		 * machine works. -acme
 		 */
+		skb_orphan(skb);
+		sock_hold(sk);
 		skb->sk = sk;
+		skb->destructor = sock_efree;
 	}
 	if (!sock_owned_by_user(sk))
 		llc_conn_rcv(sk, skb);
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index d0e1e804ebd7..5404d0d195cc 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -290,7 +290,10 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb,
 
 	ev->type   = LLC_SAP_EV_TYPE_PDU;
 	ev->reason = 0;
+	skb_orphan(skb);
+	sock_hold(sk);
 	skb->sk = sk;
+	skb->destructor = sock_efree;
 	llc_sap_state_process(sap, skb);
 }
 
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 3891cbd2adea..76e30f4797fb 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -6,6 +6,7 @@ config MAC80211
 	select CRYPTO_AES
 	select CRYPTO_CCM
 	select CRYPTO_GCM
+	select CRYPTO_CMAC
 	select CRC32
 	---help---
 	  This option enables the hardware independent IEEE 802.11
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index d0bd5fff5f0a..2fb65588490c 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -22,126 +22,50 @@
 #define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */
 #define AAD_LEN 20
 
+static const u8 zero[CMAC_TLEN_256];
 
-void gf_mulx(u8 *pad)
-{
-	int i, carry;
-
-	carry = pad[0] & 0x80;
-	for (i = 0; i < AES_BLOCK_SIZE - 1; i++)
-		pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7);
-	pad[AES_BLOCK_SIZE - 1] <<= 1;
-	if (carry)
-		pad[AES_BLOCK_SIZE - 1] ^= 0x87;
-}
-
-void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
-		     const u8 *addr[], const size_t *len, u8 *mac,
-		     size_t mac_len)
-{
-	u8 cbc[AES_BLOCK_SIZE], pad[AES_BLOCK_SIZE];
-	const u8 *pos, *end;
-	size_t i, e, left, total_len;
-
-	memset(cbc, 0, AES_BLOCK_SIZE);
-
-	total_len = 0;
-	for (e = 0; e < num_elem; e++)
-		total_len += len[e];
-	left = total_len;
-
-	e = 0;
-	pos = addr[0];
-	end = pos + len[0];
-
-	while (left >= AES_BLOCK_SIZE) {
-		for (i = 0; i < AES_BLOCK_SIZE; i++) {
-			cbc[i] ^= *pos++;
-			if (pos >= end) {
-				e++;
-				pos = addr[e];
-				end = pos + len[e];
-			}
-		}
-		if (left > AES_BLOCK_SIZE)
-			crypto_cipher_encrypt_one(tfm, cbc, cbc);
-		left -= AES_BLOCK_SIZE;
-	}
-
-	memset(pad, 0, AES_BLOCK_SIZE);
-	crypto_cipher_encrypt_one(tfm, pad, pad);
-	gf_mulx(pad);
-
-	if (left || total_len == 0) {
-		for (i = 0; i < left; i++) {
-			cbc[i] ^= *pos++;
-			if (pos >= end) {
-				e++;
-				pos = addr[e];
-				end = pos + len[e];
-			}
-		}
-		cbc[left] ^= 0x80;
-		gf_mulx(pad);
-	}
-
-	for (i = 0; i < AES_BLOCK_SIZE; i++)
-		pad[i] ^= cbc[i];
-	crypto_cipher_encrypt_one(tfm, pad, pad);
-	memcpy(mac, pad, mac_len);
-}
-
-
-void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
+void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
 			const u8 *data, size_t data_len, u8 *mic)
 {
-	const u8 *addr[3];
-	size_t len[3];
-	u8 zero[CMAC_TLEN];
+	SHASH_DESC_ON_STACK(desc, tfm);
+	u8 out[AES_BLOCK_SIZE];
 
-	memset(zero, 0, CMAC_TLEN);
-	addr[0] = aad;
-	len[0] = AAD_LEN;
-	addr[1] = data;
-	len[1] = data_len - CMAC_TLEN;
-	addr[2] = zero;
-	len[2] = CMAC_TLEN;
+	desc->tfm = tfm;
 
-	aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN);
+	crypto_shash_init(desc);
+	crypto_shash_update(desc, aad, AAD_LEN);
+	crypto_shash_update(desc, data, data_len - CMAC_TLEN);
+	crypto_shash_finup(desc, zero, CMAC_TLEN, out);
+
+	memcpy(mic, out, CMAC_TLEN);
 }
 
-void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad,
+void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
 			    const u8 *data, size_t data_len, u8 *mic)
 {
-	const u8 *addr[3];
-	size_t len[3];
-	u8 zero[CMAC_TLEN_256];
+	SHASH_DESC_ON_STACK(desc, tfm);
 
-	memset(zero, 0, CMAC_TLEN_256);
-	addr[0] = aad;
-	len[0] = AAD_LEN;
-	addr[1] = data;
-	len[1] = data_len - CMAC_TLEN_256;
-	addr[2] = zero;
-	len[2] = CMAC_TLEN_256;
+	desc->tfm = tfm;
 
-	aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN_256);
+	crypto_shash_init(desc);
+	crypto_shash_update(desc, aad, AAD_LEN);
+	crypto_shash_update(desc, data, data_len - CMAC_TLEN_256);
+	crypto_shash_finup(desc, zero, CMAC_TLEN_256, mic);
 }
 
-struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[],
-						   size_t key_len)
+struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
+						  size_t key_len)
 {
-	struct crypto_cipher *tfm;
+	struct crypto_shash *tfm;
 
-	tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
 	if (!IS_ERR(tfm))
-		crypto_cipher_setkey(tfm, key, key_len);
+		crypto_shash_setkey(tfm, key, key_len);
 
 	return tfm;
 }
 
-
-void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm)
+void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm)
 {
-	crypto_free_cipher(tfm);
+	crypto_free_shash(tfm);
 }
diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h
index c827e1d5de8b..fef531f42003 100644
--- a/net/mac80211/aes_cmac.h
+++ b/net/mac80211/aes_cmac.h
@@ -10,17 +10,14 @@
 #define AES_CMAC_H
 
 #include <linux/crypto.h>
+#include <crypto/hash.h>
 
-void gf_mulx(u8 *pad);
-void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
-		     const u8 *addr[], const size_t *len, u8 *mac,
-		     size_t mac_len);
-struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[],
-						   size_t key_len);
-void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
+struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
+						  size_t key_len);
+void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
 			const u8 *data, size_t data_len, u8 *mic);
-void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad,
+void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
 			    const u8 *data, size_t data_len, u8 *mic);
-void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm);
+void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm);
 
 #endif /* AES_CMAC_H */
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index e91e503bf992..ac879bb17870 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -208,8 +208,8 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy,
 	if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
 		new_conf.master_pref = conf->master_pref;
 
-	if (changes & CFG80211_NAN_CONF_CHANGED_DUAL)
-		new_conf.dual = conf->dual;
+	if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
+		new_conf.bands = conf->bands;
 
 	ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes);
 	if (!ret)
@@ -3563,6 +3563,17 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif,
 }
 EXPORT_SYMBOL(ieee80211_nan_func_match);
 
+static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy,
+					      struct net_device *dev,
+					      const bool enabled)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	sdata->u.ap.multicast_to_unicast = enabled;
+
+	return 0;
+}
+
 const struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -3653,4 +3664,5 @@ const struct cfg80211_ops mac80211_config_ops = {
 	.nan_change_conf = ieee80211_nan_change_conf,
 	.add_nan_func = ieee80211_add_nan_func,
 	.del_nan_func = ieee80211_del_nan_func,
+	.set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
 };
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index a0d901d8992e..89178b46b32f 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -1267,7 +1267,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 	struct ieee80211_sub_if_data *sdata, *sdata_tmp;
 	struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx;
 	struct ieee80211_chanctx *new_ctx = NULL;
-	int i, err, n_assigned, n_reserved, n_ready;
+	int err, n_assigned, n_reserved, n_ready;
 	int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0;
 
 	lockdep_assert_held(&local->mtx);
@@ -1388,8 +1388,6 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 	 * Update all structures, values and pointers to point to new channel
 	 * context(s).
 	 */
-
-	i = 0;
 	list_for_each_entry(ctx, &local->chanctx_list, list) {
 		if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
 			continue;
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index e02ba42ca827..5fae001f286c 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -243,6 +243,38 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf,
 	return rv;
 }
 
+static ssize_t misc_read(struct file *file, char __user *user_buf,
+			 size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *local = file->private_data;
+	/* Max len of each line is 16 characters, plus 9 for 'pending:\n' */
+	size_t bufsz = IEEE80211_MAX_QUEUES * 16 + 9;
+	char *buf;
+	char *pos, *end;
+	ssize_t rv;
+	int i;
+	int ln;
+
+	buf = kzalloc(bufsz, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	pos = buf;
+	end = buf + bufsz - 1;
+
+	pos += scnprintf(pos, end - pos, "pending:\n");
+
+	for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
+		ln = skb_queue_len(&local->pending[i]);
+		pos += scnprintf(pos, end - pos, "[%i] %d\n",
+				 i, ln);
+	}
+
+	rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
+	kfree(buf);
+	return rv;
+}
+
 static ssize_t queues_read(struct file *file, char __user *user_buf,
 			   size_t count, loff_t *ppos)
 {
@@ -263,6 +295,7 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
 
 DEBUGFS_READONLY_FILE_OPS(hwflags);
 DEBUGFS_READONLY_FILE_OPS(queues);
+DEBUGFS_READONLY_FILE_OPS(misc);
 
 /* statistics stuff */
 
@@ -330,7 +363,9 @@ void debugfs_hw_add(struct ieee80211_local *local)
 
 	DEBUGFS_ADD(total_ps_buffered);
 	DEBUGFS_ADD(wep_iv);
+	DEBUGFS_ADD(rate_ctrl_alg);
 	DEBUGFS_ADD(queues);
+	DEBUGFS_ADD(misc);
 #ifdef CONFIG_PM
 	DEBUGFS_ADD_MODE(reset, 0200);
 #endif
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 1a05f85cb1f0..8f5fff8b2040 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -519,6 +519,8 @@ static ssize_t ieee80211_if_fmt_aqm(
 }
 IEEE80211_IF_FILE_R(aqm);
 
+IEEE80211_IF_FILE(multicast_to_unicast, u.ap.multicast_to_unicast, HEX);
+
 /* IBSS attributes */
 static ssize_t ieee80211_if_fmt_tsf(
 	const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -683,6 +685,7 @@ static void add_ap_files(struct ieee80211_sub_if_data *sdata)
 	DEBUGFS_ADD(dtim_count);
 	DEBUGFS_ADD(num_buffered_multicast);
 	DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
+	DEBUGFS_ADD_MODE(multicast_to_unicast, 0600);
 }
 
 static void add_vlan_files(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index f6003b8c2c33..42601820db20 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -522,6 +522,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
 		return;
 
 	DEBUGFS_ADD(flags);
+	DEBUGFS_ADD(aid);
 	DEBUGFS_ADD(num_ps_buf_frames);
 	DEBUGFS_ADD(last_seq_ctrl);
 	DEBUGFS_ADD(agg_status);
diff --git a/net/mac80211/fils_aead.c b/net/mac80211/fils_aead.c
index ecfdd97758a3..3cfb1e2ab7ac 100644
--- a/net/mac80211/fils_aead.c
+++ b/net/mac80211/fils_aead.c
@@ -9,66 +9,58 @@
 
 #include <crypto/aes.h>
 #include <crypto/algapi.h>
+#include <crypto/hash.h>
 #include <crypto/skcipher.h>
 
 #include "ieee80211_i.h"
 #include "aes_cmac.h"
 #include "fils_aead.h"
 
-static int aes_s2v(struct crypto_cipher *tfm,
+static void gf_mulx(u8 *pad)
+{
+	u64 a = get_unaligned_be64(pad);
+	u64 b = get_unaligned_be64(pad + 8);
+
+	put_unaligned_be64((a << 1) | (b >> 63), pad);
+	put_unaligned_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0), pad + 8);
+}
+
+static int aes_s2v(struct crypto_shash *tfm,
 		   size_t num_elem, const u8 *addr[], size_t len[], u8 *v)
 {
-	u8 d[AES_BLOCK_SIZE], tmp[AES_BLOCK_SIZE];
+	u8 d[AES_BLOCK_SIZE], tmp[AES_BLOCK_SIZE] = {};
+	SHASH_DESC_ON_STACK(desc, tfm);
 	size_t i;
-	const u8 *data[2];
-	size_t data_len[2], data_elems;
+
+	desc->tfm = tfm;
 
 	/* D = AES-CMAC(K, <zero>) */
-	memset(tmp, 0, AES_BLOCK_SIZE);
-	data[0] = tmp;
-	data_len[0] = AES_BLOCK_SIZE;
-	aes_cmac_vector(tfm, 1, data, data_len, d, AES_BLOCK_SIZE);
+	crypto_shash_digest(desc, tmp, AES_BLOCK_SIZE, d);
 
 	for (i = 0; i < num_elem - 1; i++) {
 		/* D = dbl(D) xor AES_CMAC(K, Si) */
 		gf_mulx(d); /* dbl */
-		aes_cmac_vector(tfm, 1, &addr[i], &len[i], tmp,
-				AES_BLOCK_SIZE);
+		crypto_shash_digest(desc, addr[i], len[i], tmp);
 		crypto_xor(d, tmp, AES_BLOCK_SIZE);
 	}
 
+	crypto_shash_init(desc);
+
 	if (len[i] >= AES_BLOCK_SIZE) {
 		/* len(Sn) >= 128 */
-		size_t j;
-		const u8 *pos;
-
 		/* T = Sn xorend D */
-
-		/* Use a temporary buffer to perform xorend on Sn (addr[i]) to
-		 * avoid modifying the const input argument.
-		 */
-		data[0] = addr[i];
-		data_len[0] = len[i] - AES_BLOCK_SIZE;
-		pos = addr[i] + data_len[0];
-		for (j = 0; j < AES_BLOCK_SIZE; j++)
-			tmp[j] = pos[j] ^ d[j];
-		data[1] = tmp;
-		data_len[1] = AES_BLOCK_SIZE;
-		data_elems = 2;
+		crypto_shash_update(desc, addr[i], len[i] - AES_BLOCK_SIZE);
+		crypto_xor(d, addr[i] + len[i] - AES_BLOCK_SIZE,
+			   AES_BLOCK_SIZE);
 	} else {
 		/* len(Sn) < 128 */
 		/* T = dbl(D) xor pad(Sn) */
 		gf_mulx(d); /* dbl */
-		memset(tmp, 0, AES_BLOCK_SIZE);
-		memcpy(tmp, addr[i], len[i]);
-		tmp[len[i]] = 0x80;
-		crypto_xor(d, tmp, AES_BLOCK_SIZE);
-		data[0] = d;
-		data_len[0] = sizeof(d);
-		data_elems = 1;
+		crypto_xor(d, addr[i], len[i]);
+		d[len[i]] ^= 0x80;
 	}
 	/* V = AES-CMAC(K, T) */
-	aes_cmac_vector(tfm, data_elems, data, data_len, v, AES_BLOCK_SIZE);
+	crypto_shash_finup(desc, d, AES_BLOCK_SIZE, v);
 
 	return 0;
 }
@@ -80,7 +72,7 @@ static int aes_siv_encrypt(const u8 *key, size_t key_len,
 			   size_t len[], u8 *out)
 {
 	u8 v[AES_BLOCK_SIZE];
-	struct crypto_cipher *tfm;
+	struct crypto_shash *tfm;
 	struct crypto_skcipher *tfm2;
 	struct skcipher_request *req;
 	int res;
@@ -95,14 +87,14 @@ static int aes_siv_encrypt(const u8 *key, size_t key_len,
 
 	/* S2V */
 
-	tfm = crypto_alloc_cipher("aes", 0, 0);
+	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
 	if (IS_ERR(tfm))
 		return PTR_ERR(tfm);
 	/* K1 for S2V */
-	res = crypto_cipher_setkey(tfm, key, key_len);
+	res = crypto_shash_setkey(tfm, key, key_len);
 	if (!res)
 		res = aes_s2v(tfm, num_elem, addr, len, v);
-	crypto_free_cipher(tfm);
+	crypto_free_shash(tfm);
 	if (res)
 		return res;
 
@@ -124,7 +116,7 @@ static int aes_siv_encrypt(const u8 *key, size_t key_len,
 
 	/* CTR */
 
-	tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, 0);
+	tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(tfm2)) {
 		kfree(tmp);
 		return PTR_ERR(tfm2);
@@ -157,7 +149,7 @@ static int aes_siv_decrypt(const u8 *key, size_t key_len,
 			   size_t num_elem, const u8 *addr[], size_t len[],
 			   u8 *out)
 {
-	struct crypto_cipher *tfm;
+	struct crypto_shash *tfm;
 	struct crypto_skcipher *tfm2;
 	struct skcipher_request *req;
 	struct scatterlist src[1], dst[1];
@@ -183,7 +175,7 @@ static int aes_siv_decrypt(const u8 *key, size_t key_len,
 
 	/* CTR */
 
-	tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, 0);
+	tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(tfm2))
 		return PTR_ERR(tfm2);
 	/* K2 for CTR */
@@ -210,14 +202,14 @@ static int aes_siv_decrypt(const u8 *key, size_t key_len,
 
 	/* S2V */
 
-	tfm = crypto_alloc_cipher("aes", 0, 0);
+	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
 	if (IS_ERR(tfm))
 		return PTR_ERR(tfm);
 	/* K1 for S2V */
-	res = crypto_cipher_setkey(tfm, key, key_len);
+	res = crypto_shash_setkey(tfm, key, key_len);
 	if (!res)
 		res = aes_s2v(tfm, num_elem, addr, len, check);
-	crypto_free_cipher(tfm);
+	crypto_free_shash(tfm);
 	if (res)
 		return res;
 	if (memcmp(check, frame_iv, AES_BLOCK_SIZE) != 0)
@@ -272,7 +264,7 @@ int fils_encrypt_assoc_req(struct sk_buff *skb,
 	crypt_len = skb->data + skb->len - encr;
 	skb_put(skb, AES_BLOCK_SIZE);
 	return aes_siv_encrypt(assoc_data->fils_kek, assoc_data->fils_kek_len,
-			       encr, crypt_len, 1, addr, len, encr);
+			       encr, crypt_len, 5, addr, len, encr);
 }
 
 int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index a31d30713d08..98999d3d5262 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -487,14 +487,14 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata,
 	struct beacon_data *presp, *old_presp;
 	struct cfg80211_bss *cbss;
 	const struct cfg80211_bss_ies *ies;
-	u16 capability = 0;
+	u16 capability = WLAN_CAPABILITY_IBSS;
 	u64 tsf;
 	int ret = 0;
 
 	sdata_assert_lock(sdata);
 
 	if (ifibss->privacy)
-		capability = WLAN_CAPABILITY_PRIVACY;
+		capability |= WLAN_CAPABILITY_PRIVACY;
 
 	cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan,
 				ifibss->bssid, ifibss->ssid,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index b2069fbd60f9..159a1a733725 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -297,6 +297,7 @@ struct ieee80211_if_ap {
 			 driver_smps_mode; /* smps mode request */
 
 	struct work_struct request_smps_work;
+	bool multicast_to_unicast;
 };
 
 struct ieee80211_if_wds {
@@ -624,8 +625,8 @@ struct ieee80211_mesh_sync_ops {
 			     struct ieee80211_rx_status *rx_status);
 
 	/* should be called with beacon_data under RCU read lock */
-	void (*adjust_tbtt)(struct ieee80211_sub_if_data *sdata,
-			    struct beacon_data *beacon);
+	void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata,
+			   struct beacon_data *beacon);
 	/* add other framework functions here */
 };
 
@@ -688,7 +689,6 @@ struct ieee80211_if_mesh {
 	const struct ieee80211_mesh_sync_ops *sync_ops;
 	s64 sync_offset_clockdrift_max;
 	spinlock_t sync_offset_lock;
-	bool adjusting_tbtt;
 	/* mesh power save */
 	enum nl80211_mesh_power_mode nonpeer_pm;
 	int ps_peers_light_sleep;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index d37ae7dc114b..40813dd3301c 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1123,7 +1123,7 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev,
 	return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
 }
 
-static struct rtnl_link_stats64 *
+static void
 ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	int i;
@@ -1148,8 +1148,6 @@ ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->rx_bytes   += rx_bytes;
 		stats->tx_bytes   += tx_bytes;
 	}
-
-	return stats;
 }
 
 static const struct net_device_ops ieee80211_dataif_ops = {
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index 4aa20cef0859..ebdb80b85dc3 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -93,7 +93,7 @@ struct ieee80211_key {
 		} ccmp;
 		struct {
 			u8 rx_pn[IEEE80211_CMAC_PN_LEN];
-			struct crypto_cipher *tfm;
+			struct crypto_shash *tfm;
 			u32 replays; /* dot11RSNAStatsCMACReplays */
 			u32 icverrors; /* dot11RSNAStatsCMACICVErrors */
 		} aes_cmac;
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 42120d965263..6e7b6a07b7d5 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -279,10 +279,6 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
 	/* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */
 	*pos |= ifmsh->ps_peers_deep_sleep ?
 			IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00;
-	*pos++ |= ifmsh->adjusting_tbtt ?
-			IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING : 0x00;
-	*pos++ = 0x00;
-
 	return 0;
 }
 
@@ -339,7 +335,7 @@ int mesh_add_vendor_ies(struct ieee80211_sub_if_data *sdata,
 	/* fast-forward to vendor IEs */
 	offset = ieee80211_ie_split_vendor(ifmsh->ie, ifmsh->ie_len, 0);
 
-	if (offset) {
+	if (offset < ifmsh->ie_len) {
 		len = ifmsh->ie_len - offset;
 		data = ifmsh->ie + offset;
 		if (skb_tailroom(skb) < len)
@@ -685,7 +681,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
 		   2 + /* NULL SSID */
 		   /* Channel Switch Announcement */
 		   2 + sizeof(struct ieee80211_channel_sw_ie) +
-		   /* Mesh Channel Swith Parameters */
+		   /* Mesh Channel Switch Parameters */
 		   2 + sizeof(struct ieee80211_mesh_chansw_params_ie) +
 		   2 + 8 + /* supported rates */
 		   2 + 3; /* DS params */
@@ -850,7 +846,6 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 	ifmsh->mesh_cc_id = 0;	/* Disabled */
 	/* register sync ops from extensible synchronization framework */
 	ifmsh->sync_ops = ieee80211_mesh_sync_ops_get(ifmsh->mesh_sp_id);
-	ifmsh->adjusting_tbtt = false;
 	ifmsh->sync_offset_clockdrift_max = 0;
 	set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
 	ieee80211_mesh_root_setup(ifmsh);
@@ -1349,7 +1344,7 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
 		ieee80211_mesh_rootpath(sdata);
 
 	if (test_and_clear_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags))
-		mesh_sync_adjust_tbtt(sdata);
+		mesh_sync_adjust_tsf(sdata);
 
 	if (test_and_clear_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags))
 		mesh_bss_info_changed(sdata);
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 26b9ccbe1fce..7e5f271e3c30 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -341,7 +341,7 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
 }
 
 void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata);
-void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata);
+void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata);
 void ieee80211s_stop(void);
 #else
 static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 7fcdcf622655..fcba70e57073 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -505,12 +505,14 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr,
 
 	/* Userspace handles station allocation */
 	if (sdata->u.mesh.user_mpm ||
-	    sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED)
-		cfg80211_notify_new_peer_candidate(sdata->dev, addr,
-						   elems->ie_start,
-						   elems->total_len,
-						   GFP_KERNEL);
-	else
+	    sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) {
+		if (mesh_peer_accepts_plinks(elems) &&
+		    mesh_plink_availables(sdata))
+			cfg80211_notify_new_peer_candidate(sdata->dev, addr,
+							   elems->ie_start,
+							   elems->total_len,
+							   GFP_KERNEL);
+	} else
 		sta = __mesh_sta_info_alloc(sdata, addr);
 
 	return sta;
diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c
index faca22cd02b5..a435f094a82e 100644
--- a/net/mac80211/mesh_sync.c
+++ b/net/mac80211/mesh_sync.c
@@ -12,7 +12,7 @@
 #include "mesh.h"
 #include "driver-ops.h"
 
-/* This is not in the standard.  It represents a tolerable tbtt drift below
+/* This is not in the standard.  It represents a tolerable tsf drift below
  * which we do no TSF adjustment.
  */
 #define TOFFSET_MINIMUM_ADJUSTMENT 10
@@ -46,7 +46,7 @@ static bool mesh_peer_tbtt_adjusting(struct ieee802_11_elems *ie)
 			IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0;
 }
 
-void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
+void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
@@ -57,12 +57,12 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
 
 	spin_lock_bh(&ifmsh->sync_offset_lock);
 	if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) {
-		msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting\n",
+		msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting\n",
 			  (long long) ifmsh->sync_offset_clockdrift_max);
 		tsfdelta = -ifmsh->sync_offset_clockdrift_max;
 		ifmsh->sync_offset_clockdrift_max = 0;
 	} else {
-		msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting by %llu\n",
+		msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting by %llu\n",
 			  (long long) ifmsh->sync_offset_clockdrift_max,
 			  (unsigned long long) beacon_int_fraction);
 		tsfdelta = -beacon_int_fraction;
@@ -123,7 +123,6 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
 	 */
 
 	if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) {
-		clear_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN);
 		msync_dbg(sdata, "STA %pM : is adjusting TBTT\n",
 			  sta->sta.addr);
 		goto no_sync;
@@ -168,15 +167,13 @@ no_sync:
 	rcu_read_unlock();
 }
 
-static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata,
+static void mesh_sync_offset_adjust_tsf(struct ieee80211_sub_if_data *sdata,
 					 struct beacon_data *beacon)
 {
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
-	u8 cap;
 
 	WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET);
 	WARN_ON(!rcu_read_lock_held());
-	cap = beacon->meshconf->meshconf_cap;
 
 	spin_lock_bh(&ifmsh->sync_offset_lock);
 
@@ -187,24 +184,16 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata,
 		 * the tsf adjustment to the mesh tasklet
 		 */
 		msync_dbg(sdata,
-			  "TBTT : kicking off TBTT adjustment with clockdrift_max=%lld\n",
+			  "TSF : kicking off TSF adjustment with clockdrift_max=%lld\n",
 			  ifmsh->sync_offset_clockdrift_max);
 		set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags);
-
-		ifmsh->adjusting_tbtt = true;
 	} else {
 		msync_dbg(sdata,
-			  "TBTT : max clockdrift=%lld; too small to adjust\n",
+			  "TSF : max clockdrift=%lld; too small to adjust\n",
 			  (long long)ifmsh->sync_offset_clockdrift_max);
 		ifmsh->sync_offset_clockdrift_max = 0;
-
-		ifmsh->adjusting_tbtt = false;
 	}
 	spin_unlock_bh(&ifmsh->sync_offset_lock);
-
-	beacon->meshconf->meshconf_cap = ifmsh->adjusting_tbtt ?
-			IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING | cap :
-			~IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING & cap;
 }
 
 static const struct sync_method sync_methods[] = {
@@ -212,7 +201,7 @@ static const struct sync_method sync_methods[] = {
 		.method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET,
 		.ops = {
 			.rx_bcn_presp = &mesh_sync_offset_rx_bcn_presp,
-			.adjust_tbtt = &mesh_sync_offset_adjust_tbtt,
+			.adjust_tsf = &mesh_sync_offset_adjust_tsf,
 		}
 	},
 };
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 098ce9b179ee..6e90301154d5 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1486,10 +1486,6 @@ void ieee80211_recalc_ps(struct ieee80211_local *local)
 
 	if (count == 1 && ieee80211_powersave_allowed(found)) {
 		u8 dtimper = found->u.mgd.dtim_period;
-		s32 beaconint_us;
-
-		beaconint_us = ieee80211_tu_to_usec(
-					found->vif.bss_conf.beacon_int);
 
 		timeout = local->dynamic_ps_forced_timeout;
 		if (timeout < 0)
@@ -3423,14 +3419,14 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 			ieee80211_cqm_rssi_notify(
 				&sdata->vif,
 				NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
-				GFP_KERNEL);
+				sig, GFP_KERNEL);
 		} else if (sig > thold &&
 			   (last_event == 0 || sig > last_event + hyst)) {
 			ifmgd->last_cqm_event_signal = sig;
 			ieee80211_cqm_rssi_notify(
 				&sdata->vif,
 				NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH,
-				GFP_KERNEL);
+				sig, GFP_KERNEL);
 		}
 	}
 
@@ -5045,13 +5041,14 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
 
 void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
 			       enum nl80211_cqm_rssi_threshold_event rssi_event,
+			       s32 rssi_level,
 			       gfp_t gfp)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 
-	trace_api_cqm_rssi_notify(sdata, rssi_event);
+	trace_api_cqm_rssi_notify(sdata, rssi_event, rssi_level);
 
-	cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp);
+	cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, rssi_level, gfp);
 }
 EXPORT_SYMBOL(ieee80211_cqm_rssi_notify);
 
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 14c5ba3a1b1c..3ebe4405a2d4 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -159,21 +159,23 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
 void
 minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs)
 {
+	unsigned int cur_prob;
+
 	if (unlikely(mrs->attempts > 0)) {
 		mrs->sample_skipped = 0;
-		mrs->cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts);
+		cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts);
 		if (unlikely(!mrs->att_hist)) {
-			mrs->prob_ewma = mrs->cur_prob;
+			mrs->prob_ewma = cur_prob;
 		} else {
 			/* update exponential weighted moving variance */
-			mrs->prob_ewmsd = minstrel_ewmsd(mrs->prob_ewmsd,
-							 mrs->cur_prob,
-							 mrs->prob_ewma,
-							 EWMA_LEVEL);
+			mrs->prob_ewmv = minstrel_ewmv(mrs->prob_ewmv,
+							cur_prob,
+							mrs->prob_ewma,
+							EWMA_LEVEL);
 
 			/*update exponential weighted moving avarage */
 			mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma,
-						       mrs->cur_prob,
+						       cur_prob,
 						       EWMA_LEVEL);
 		}
 		mrs->att_hist += mrs->attempts;
@@ -365,6 +367,11 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
 		return;
 #endif
 
+	/* Don't use EAPOL frames for sampling on non-mrr hw */
+	if (mp->hw->max_rates == 1 &&
+	    (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO))
+		return;
+
 	delta = (mi->total_packets * sampling_ratio / 100) -
 			(mi->sample_packets + mi->sample_deferred / 2);
 
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index c230bbe93262..be6c3f35f48b 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -14,7 +14,7 @@
 #define SAMPLE_COLUMNS	10	/* number of columns in sample table */
 
 /* scaled fraction values */
-#define MINSTREL_SCALE  16
+#define MINSTREL_SCALE  12
 #define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div)
 #define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE)
 
@@ -36,21 +36,16 @@ minstrel_ewma(int old, int new, int weight)
 }
 
 /*
- * Perform EWMSD (Exponentially Weighted Moving Standard Deviation) calculation
+ * Perform EWMV (Exponentially Weighted Moving Variance) calculation
  */
 static inline int
-minstrel_ewmsd(int old_ewmsd, int cur_prob, int prob_ewma, int weight)
+minstrel_ewmv(int old_ewmv, int cur_prob, int prob_ewma, int weight)
 {
-	int diff, incr, tmp_var;
+	int diff, incr;
 
-	/* calculate exponential weighted moving variance */
-	diff = MINSTREL_TRUNC((cur_prob - prob_ewma) * 1000000);
+	diff = cur_prob - prob_ewma;
 	incr = (EWMA_DIV - weight) * diff / EWMA_DIV;
-	tmp_var = old_ewmsd * old_ewmsd;
-	tmp_var = weight * (tmp_var + diff * incr / 1000000) / EWMA_DIV;
-
-	/* return standard deviation */
-	return (u16) int_sqrt(tmp_var);
+	return weight * (old_ewmv + MINSTREL_TRUNC(diff * incr)) / EWMA_DIV;
 }
 
 struct minstrel_rate_stats {
@@ -59,15 +54,13 @@ struct minstrel_rate_stats {
 	u16 success, last_success;
 
 	/* total attempts/success counters */
-	u64 att_hist, succ_hist;
+	u32 att_hist, succ_hist;
 
 	/* statistis of packet delivery probability
-	 *  cur_prob  - current prob within last update intervall
 	 *  prob_ewma - exponential weighted moving average of prob
 	 *  prob_ewmsd - exp. weighted moving standard deviation of prob */
-	unsigned int cur_prob;
-	unsigned int prob_ewma;
-	u16 prob_ewmsd;
+	u16 prob_ewma;
+	u16 prob_ewmv;
 
 	/* maximum retry counts */
 	u8 retry_count;
@@ -153,6 +146,14 @@ struct minstrel_debugfs_info {
 	char buf[];
 };
 
+/* Get EWMSD (Exponentially Weighted Moving Standard Deviation) * 10 */
+static inline int
+minstrel_get_ewmsd10(struct minstrel_rate_stats *mrs)
+{
+	unsigned int ewmv = mrs->prob_ewmv;
+	return int_sqrt(MINSTREL_TRUNC(ewmv * 1000 * 1000));
+}
+
 extern const struct rate_control_ops mac80211_minstrel;
 void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
 void minstrel_remove_sta_debugfs(void *priv, void *priv_sta);
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 820b0abc9c0d..36fc971deb86 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -75,7 +75,7 @@ minstrel_stats_open(struct inode *inode, struct file *file)
 {
 	struct minstrel_sta_info *mi = inode->i_private;
 	struct minstrel_debugfs_info *ms;
-	unsigned int i, tp_max, tp_avg, prob, eprob;
+	unsigned int i, tp_max, tp_avg, eprob;
 	char *p;
 
 	ms = kmalloc(2048, GFP_KERNEL);
@@ -86,13 +86,14 @@ minstrel_stats_open(struct inode *inode, struct file *file)
 	p = ms->buf;
 	p += sprintf(p, "\n");
 	p += sprintf(p,
-		     "best   __________rate_________    ________statistics________    ________last_______    ______sum-of________\n");
+		     "best   __________rate_________    ________statistics________    ____last_____    ______sum-of________\n");
 	p += sprintf(p,
-		     "rate  [name idx airtime max_tp]  [avg(tp) avg(prob) sd(prob)]  [prob.|retry|suc|att]  [#success | #attempts]\n");
+		     "rate  [name idx airtime max_tp]  [avg(tp) avg(prob) sd(prob)]  [retry|suc|att]  [#success | #attempts]\n");
 
 	for (i = 0; i < mi->n_rates; i++) {
 		struct minstrel_rate *mr = &mi->r[i];
 		struct minstrel_rate_stats *mrs = &mi->r[i].stats;
+		unsigned int prob_ewmsd;
 
 		*(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' ';
 		*(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' ';
@@ -107,17 +108,16 @@ minstrel_stats_open(struct inode *inode, struct file *file)
 
 		tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
 		tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
-		prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+		prob_ewmsd = minstrel_get_ewmsd10(mrs);
 
 		p += sprintf(p, "%4u.%1u    %4u.%1u     %3u.%1u    %3u.%1u"
-				"     %3u.%1u %3u   %3u %-3u   "
+				"     %3u   %3u %-3u   "
 				"%9llu   %-9llu\n",
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
-				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
-				prob / 10, prob % 10,
+				prob_ewmsd / 10, prob_ewmsd % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
@@ -148,7 +148,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
 {
 	struct minstrel_sta_info *mi = inode->i_private;
 	struct minstrel_debugfs_info *ms;
-	unsigned int i, tp_max, tp_avg, prob, eprob;
+	unsigned int i, tp_max, tp_avg, eprob;
 	char *p;
 
 	ms = kmalloc(2048, GFP_KERNEL);
@@ -161,6 +161,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
 	for (i = 0; i < mi->n_rates; i++) {
 		struct minstrel_rate *mr = &mi->r[i];
 		struct minstrel_rate_stats *mrs = &mi->r[i].stats;
+		unsigned int prob_ewmsd;
 
 		p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : ""));
 		p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : ""));
@@ -175,16 +176,15 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
 
 		tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
 		tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
-		prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+		prob_ewmsd = minstrel_get_ewmsd10(mrs);
 
-		p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u,"
+		p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u,"
 				"%llu,%llu,%d,%d\n",
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
-				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
-				prob / 10, prob % 10,
+				prob_ewmsd / 10, prob_ewmsd % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 30fbabf4bcbc..8e783e197e93 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -14,6 +14,7 @@
 #include <linux/ieee80211.h>
 #include <net/mac80211.h>
 #include "rate.h"
+#include "sta_info.h"
 #include "rc80211_minstrel.h"
 #include "rc80211_minstrel_ht.h"
 
@@ -154,67 +155,47 @@ MODULE_PARM_DESC(minstrel_vht_only,
 const struct mcs_group minstrel_mcs_groups[] = {
 	MCS_GROUP(1, 0, BW_20),
 	MCS_GROUP(2, 0, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
 	MCS_GROUP(3, 0, BW_20),
-#endif
 
 	MCS_GROUP(1, 1, BW_20),
 	MCS_GROUP(2, 1, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
 	MCS_GROUP(3, 1, BW_20),
-#endif
 
 	MCS_GROUP(1, 0, BW_40),
 	MCS_GROUP(2, 0, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
 	MCS_GROUP(3, 0, BW_40),
-#endif
 
 	MCS_GROUP(1, 1, BW_40),
 	MCS_GROUP(2, 1, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
 	MCS_GROUP(3, 1, BW_40),
-#endif
 
 	CCK_GROUP,
 
 #ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
 	VHT_GROUP(1, 0, BW_20),
 	VHT_GROUP(2, 0, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 0, BW_20),
-#endif
 
 	VHT_GROUP(1, 1, BW_20),
 	VHT_GROUP(2, 1, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 1, BW_20),
-#endif
 
 	VHT_GROUP(1, 0, BW_40),
 	VHT_GROUP(2, 0, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 0, BW_40),
-#endif
 
 	VHT_GROUP(1, 1, BW_40),
 	VHT_GROUP(2, 1, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 1, BW_40),
-#endif
 
 	VHT_GROUP(1, 0, BW_80),
 	VHT_GROUP(2, 0, BW_80),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 0, BW_80),
-#endif
 
 	VHT_GROUP(1, 1, BW_80),
 	VHT_GROUP(2, 1, BW_80),
-#if MINSTREL_MAX_STREAMS >= 3
 	VHT_GROUP(3, 1, BW_80),
 #endif
-#endif
 };
 
 static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly;
@@ -301,7 +282,7 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
 				break;
 
 		/* short preamble */
-		if (!(mi->groups[group].supported & BIT(idx)))
+		if (!(mi->supported[group] & BIT(idx)))
 			idx += 4;
 	}
 	return &mi->groups[group].rates[idx];
@@ -486,7 +467,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi)
 			  MCS_GROUP_RATES].streams;
 	for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
 		mg = &mi->groups[group];
-		if (!mg->supported || group == MINSTREL_CCK_GROUP)
+		if (!mi->supported[group] || group == MINSTREL_CCK_GROUP)
 			continue;
 
 		tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
@@ -540,7 +521,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
 	for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
 
 		mg = &mi->groups[group];
-		if (!mg->supported)
+		if (!mi->supported[group])
 			continue;
 
 		mi->sample_count++;
@@ -550,7 +531,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
 			tmp_group_tp_rate[j] = group;
 
 		for (i = 0; i < MCS_GROUP_RATES; i++) {
-			if (!(mg->supported & BIT(i)))
+			if (!(mi->supported[group] & BIT(i)))
 				continue;
 
 			index = MCS_GROUP_RATES * group + i;
@@ -636,7 +617,7 @@ minstrel_set_next_sample_idx(struct minstrel_ht_sta *mi)
 		mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups);
 		mg = &mi->groups[mi->sample_group];
 
-		if (!mg->supported)
+		if (!mi->supported[mi->sample_group])
 			continue;
 
 		if (++mg->index >= MCS_GROUP_RATES) {
@@ -657,7 +638,7 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary)
 	while (group > 0) {
 		group--;
 
-		if (!mi->groups[group].supported)
+		if (!mi->supported[group])
 			continue;
 
 		if (minstrel_mcs_groups[group].streams >
@@ -994,7 +975,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
 	sample_idx = sample_table[mg->column][mg->index];
 	minstrel_set_next_sample_idx(mi);
 
-	if (!(mg->supported & BIT(sample_idx)))
+	if (!(mi->supported[sample_group] & BIT(sample_idx)))
 		return -1;
 
 	mrs = &mg->rates[sample_idx];
@@ -1049,22 +1030,6 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
 }
 
 static void
-minstrel_ht_check_cck_shortpreamble(struct minstrel_priv *mp,
-				    struct minstrel_ht_sta *mi, bool val)
-{
-	u8 supported = mi->groups[MINSTREL_CCK_GROUP].supported;
-
-	if (!supported || !mi->cck_supported_short)
-		return;
-
-	if (supported & (mi->cck_supported_short << (val * 4)))
-		return;
-
-	supported ^= mi->cck_supported_short | (mi->cck_supported_short << 4);
-	mi->groups[MINSTREL_CCK_GROUP].supported = supported;
-}
-
-static void
 minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
                      struct ieee80211_tx_rate_control *txrc)
 {
@@ -1087,7 +1052,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
 		minstrel_aggr_check(sta, txrc->skb);
 
 	info->flags |= mi->tx_flags;
-	minstrel_ht_check_cck_shortpreamble(mp, mi, txrc->short_preamble);
 
 #ifdef CONFIG_MAC80211_DEBUGFS
 	if (mp->fixed_rate_idx != -1)
@@ -1154,7 +1118,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
 			mi->cck_supported_short |= BIT(i);
 	}
 
-	mi->groups[MINSTREL_CCK_GROUP].supported = mi->cck_supported;
+	mi->supported[MINSTREL_CCK_GROUP] = mi->cck_supported;
 }
 
 static void
@@ -1168,6 +1132,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 	struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs;
 	u16 sta_cap = sta->ht_cap.cap;
 	struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
+	struct sta_info *sinfo = container_of(sta, struct sta_info, sta);
 	int use_vht;
 	int n_supported = 0;
 	int ack_dur;
@@ -1224,7 +1189,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 		u32 gflags = minstrel_mcs_groups[i].flags;
 		int bw, nss;
 
-		mi->groups[i].supported = 0;
+		mi->supported[i] = 0;
 		if (i == MINSTREL_CCK_GROUP) {
 			minstrel_ht_update_cck(mp, mi, sband, sta);
 			continue;
@@ -1256,8 +1221,8 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 			if (use_vht && minstrel_vht_only)
 				continue;
 #endif
-			mi->groups[i].supported = mcs->rx_mask[nss - 1];
-			if (mi->groups[i].supported)
+			mi->supported[i] = mcs->rx_mask[nss - 1];
+			if (mi->supported[i])
 				n_supported++;
 			continue;
 		}
@@ -1283,16 +1248,19 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 		else
 			bw = BW_20;
 
-		mi->groups[i].supported = minstrel_get_valid_vht_rates(bw, nss,
+		mi->supported[i] = minstrel_get_valid_vht_rates(bw, nss,
 				vht_cap->vht_mcs.tx_mcs_map);
 
-		if (mi->groups[i].supported)
+		if (mi->supported[i])
 			n_supported++;
 	}
 
 	if (!n_supported)
 		goto use_legacy;
 
+	if (test_sta_flag(sinfo, WLAN_STA_SHORT_PREAMBLE))
+		mi->cck_supported_short |= mi->cck_supported_short << 4;
+
 	/* create an initial rate table with the lowest supported rates */
 	minstrel_ht_update_stats(mp, mi);
 	minstrel_ht_update_rates(mp, mi);
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
index e8b52a94d24b..de1646c42e82 100644
--- a/net/mac80211/rc80211_minstrel_ht.h
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -52,9 +52,6 @@ struct minstrel_mcs_group_data {
 	u8 index;
 	u8 column;
 
-	/* bitfield of supported MCS rates of this group */
-	u16 supported;
-
 	/* sorted rate set within a MCS group*/
 	u16 max_group_tp_rate[MAX_THR_RATES];
 	u16 max_group_prob_rate;
@@ -101,6 +98,9 @@ struct minstrel_ht_sta {
 	u8 cck_supported;
 	u8 cck_supported_short;
 
+	/* Bitfield of supported MCS rates of all groups */
+	u16 supported[MINSTREL_GROUPS_NB];
+
 	/* MCS rate group info and statistics */
 	struct minstrel_mcs_group_data groups[MINSTREL_GROUPS_NB];
 };
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 5320e35ed3d0..7d969e300fb3 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -19,12 +19,12 @@ static char *
 minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
 {
 	const struct mcs_group *mg;
-	unsigned int j, tp_max, tp_avg, prob, eprob, tx_time;
+	unsigned int j, tp_max, tp_avg, eprob, tx_time;
 	char htmode = '2';
 	char gimode = 'L';
 	u32 gflags;
 
-	if (!mi->groups[i].supported)
+	if (!mi->supported[i])
 		return p;
 
 	mg = &minstrel_mcs_groups[i];
@@ -41,8 +41,9 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
 		struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
 		static const int bitrates[4] = { 10, 20, 55, 110 };
 		int idx = i * MCS_GROUP_RATES + j;
+		unsigned int prob_ewmsd;
 
-		if (!(mi->groups[i].supported & BIT(j)))
+		if (!(mi->supported[i] & BIT(j)))
 			continue;
 
 		if (gflags & IEEE80211_TX_RC_MCS) {
@@ -83,17 +84,16 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
 
 		tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
 		tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
-		prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+		prob_ewmsd = minstrel_get_ewmsd10(mrs);
 
 		p += sprintf(p, "%4u.%1u    %4u.%1u     %3u.%1u    %3u.%1u"
-				"     %3u.%1u %3u   %3u %-3u   "
+				"     %3u   %3u %-3u   "
 				"%9llu   %-9llu\n",
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
-				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
-				prob / 10, prob % 10,
+				prob_ewmsd / 10, prob_ewmsd % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
@@ -130,9 +130,9 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file)
 
 	p += sprintf(p, "\n");
 	p += sprintf(p,
-		     "              best   ____________rate__________    ________statistics________    ________last_______    ______sum-of________\n");
+		     "              best   ____________rate__________    ________statistics________    _____last____    ______sum-of________\n");
 	p += sprintf(p,
-		     "mode guard #  rate  [name   idx airtime  max_tp]  [avg(tp) avg(prob) sd(prob)]  [prob.|retry|suc|att]  [#success | #attempts]\n");
+		     "mode guard #  rate  [name   idx airtime  max_tp]  [avg(tp) avg(prob) sd(prob)]  [retry|suc|att]  [#success | #attempts]\n");
 
 	p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p);
 	for (i = 0; i < MINSTREL_CCK_GROUP; i++)
@@ -165,12 +165,12 @@ static char *
 minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
 {
 	const struct mcs_group *mg;
-	unsigned int j, tp_max, tp_avg, prob, eprob, tx_time;
+	unsigned int j, tp_max, tp_avg, eprob, tx_time;
 	char htmode = '2';
 	char gimode = 'L';
 	u32 gflags;
 
-	if (!mi->groups[i].supported)
+	if (!mi->supported[i])
 		return p;
 
 	mg = &minstrel_mcs_groups[i];
@@ -187,8 +187,9 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
 		struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
 		static const int bitrates[4] = { 10, 20, 55, 110 };
 		int idx = i * MCS_GROUP_RATES + j;
+		unsigned int prob_ewmsd;
 
-		if (!(mi->groups[i].supported & BIT(j)))
+		if (!(mi->supported[i] & BIT(j)))
 			continue;
 
 		if (gflags & IEEE80211_TX_RC_MCS) {
@@ -226,16 +227,15 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
 
 		tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
 		tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
-		prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
 		eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+		prob_ewmsd = minstrel_get_ewmsd10(mrs);
 
-		p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,"
+		p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,"
 				"%u,%llu,%llu,",
 				tp_max / 10, tp_max % 10,
 				tp_avg / 10, tp_avg % 10,
 				eprob / 10, eprob % 10,
-				mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
-				prob / 10, prob % 10,
+				prob_ewmsd / 10, prob_ewmsd % 10,
 				mrs->retry_count,
 				mrs->last_success,
 				mrs->last_attempts,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3090dd4342f6..50ca3828b124 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1391,7 +1391,7 @@ EXPORT_SYMBOL(ieee80211_sta_pspoll);
 void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid)
 {
 	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
-	u8 ac = ieee802_1d_to_ac[tid & 7];
+	int ac = ieee80211_ac_from_tid(tid);
 
 	/*
 	 * If this AC is not trigger-enabled do nothing unless the
@@ -1908,7 +1908,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 	unsigned int frag, seq;
 	struct ieee80211_fragment_entry *entry;
 	struct sk_buff *skb;
-	struct ieee80211_rx_status *status;
 
 	hdr = (struct ieee80211_hdr *)rx->skb->data;
 	fc = hdr->frame_control;
@@ -2034,9 +2033,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 		dev_kfree_skb(skb);
 	}
 
-	/* Complete frame has been reassembled - process it now */
-	status = IEEE80211_SKB_RXCB(rx->skb);
-
  out:
 	ieee80211_led_rx(rx->local);
  out_no_led:
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 23d8ac829279..faab3c490d2b 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -1120,7 +1120,6 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
 	u32 rate_masks[NUM_NL80211_BANDS] = {};
 	u8 bands_used = 0;
 	u8 *ie;
-	size_t len;
 
 	iebufsz = local->scan_ies_len + req->ie_len;
 
@@ -1145,10 +1144,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
 
-	len = ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
-				       &sched_scan_ies, req->ie,
-				       req->ie_len, bands_used,
-				       rate_masks, &chandef);
+	ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
+				 &sched_scan_ies, req->ie,
+				 req->ie_len, bands_used, rate_masks, &chandef);
 
 	ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies);
 	if (ret == 0) {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 50c309094c37..4774e663a411 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -513,23 +513,23 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
 {
 	struct ieee80211_local *local = sta->local;
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
-	struct station_info *sinfo;
+	struct station_info *sinfo = NULL;
 	int err = 0;
 
 	lockdep_assert_held(&local->sta_mtx);
 
-	sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
-	if (!sinfo) {
-		err = -ENOMEM;
-		goto out_err;
-	}
-
 	/* check if STA exists already */
 	if (sta_info_get_bss(sdata, sta->sta.addr)) {
 		err = -EEXIST;
 		goto out_err;
 	}
 
+	sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
+	if (!sinfo) {
+		err = -ENOMEM;
+		goto out_err;
+	}
+
 	local->num_sta++;
 	local->sta_generation++;
 	smp_mb();
@@ -2051,16 +2051,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
 {
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	struct ieee80211_local *local = sdata->local;
-	struct rate_control_ref *ref = NULL;
 	u32 thr = 0;
 	int i, ac, cpu;
 	struct ieee80211_sta_rx_stats *last_rxstats;
 
 	last_rxstats = sta_get_last_rx_stats(sta);
 
-	if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
-		ref = local->rate_ctrl;
-
 	sinfo->generation = sdata->local->sta_generation;
 
 	/* do before driver, so beacon filtering drivers have a
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index ddf71c648cab..a3af6e1bfd98 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -95,7 +95,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
 		 */
 		if (*p & IEEE80211_QOS_CTL_EOSP)
 			*p &= ~IEEE80211_QOS_CTL_EOSP;
-		ac = ieee802_1d_to_ac[tid & 7];
+		ac = ieee80211_ac_from_tid(tid);
 	} else {
 		ac = IEEE80211_AC_BE;
 	}
@@ -541,6 +541,11 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
 	} else if (info->ack_frame_id) {
 		ieee80211_report_ack_skb(local, info, acked, dropped);
 	}
+
+	if (!dropped && skb->destructor) {
+		skb->wifi_acked_valid = 1;
+		skb->wifi_acked = acked;
+	}
 }
 
 /*
@@ -633,10 +638,9 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw,
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_supported_band *sband;
 	int retry_count;
-	int rates_idx;
 	bool acked, noack_success;
 
-	rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
+	ieee80211_tx_get_rates(hw, info, &retry_count);
 
 	sband = hw->wiphy->bands[info->band];
 
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 92a47afaa989..0d645bc148d0 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1736,21 +1736,21 @@ TRACE_EVENT(drv_start_nan,
 		LOCAL_ENTRY
 		VIF_ENTRY
 		__field(u8, master_pref)
-		__field(u8, dual)
+		__field(u8, bands)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		__entry->master_pref = conf->master_pref;
-		__entry->dual = conf->dual;
+		__entry->bands = conf->bands;
 	),
 
 	TP_printk(
 		LOCAL_PR_FMT  VIF_PR_FMT
-		", master preference: %u, dual: %d",
+		", master preference: %u, bands: 0x%0x",
 		LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
-		__entry->dual
+		__entry->bands
 	)
 );
 
@@ -1787,7 +1787,7 @@ TRACE_EVENT(drv_nan_change_conf,
 		LOCAL_ENTRY
 		VIF_ENTRY
 		__field(u8, master_pref)
-		__field(u8, dual)
+		__field(u8, bands)
 		__field(u32, changes)
 	),
 
@@ -1795,15 +1795,15 @@ TRACE_EVENT(drv_nan_change_conf,
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		__entry->master_pref = conf->master_pref;
-		__entry->dual = conf->dual;
+		__entry->bands = conf->bands;
 		__entry->changes = changes;
 	),
 
 	TP_printk(
 		LOCAL_PR_FMT  VIF_PR_FMT
-		", master preference: %u, dual: %d, changes: 0x%x",
+		", master preference: %u, bands: 0x%0x, changes: 0x%x",
 		LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
-		__entry->dual, __entry->changes
+		__entry->bands, __entry->changes
 	)
 );
 
@@ -1996,23 +1996,26 @@ TRACE_EVENT(api_connection_loss,
 
 TRACE_EVENT(api_cqm_rssi_notify,
 	TP_PROTO(struct ieee80211_sub_if_data *sdata,
-		 enum nl80211_cqm_rssi_threshold_event rssi_event),
+		 enum nl80211_cqm_rssi_threshold_event rssi_event,
+		 s32 rssi_level),
 
-	TP_ARGS(sdata, rssi_event),
+	TP_ARGS(sdata, rssi_event, rssi_level),
 
 	TP_STRUCT__entry(
 		VIF_ENTRY
 		__field(u32, rssi_event)
+		__field(s32, rssi_level)
 	),
 
 	TP_fast_assign(
 		VIF_ASSIGN;
 		__entry->rssi_event = rssi_event;
+		__entry->rssi_level = rssi_level;
 	),
 
 	TP_printk(
-		VIF_PR_FMT " event:%d",
-		VIF_PR_ARG, __entry->rssi_event
+		VIF_PR_FMT " event:%d rssi:%d",
+		VIF_PR_ARG, __entry->rssi_event, __entry->rssi_level
 	)
 );
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 797e847cbc49..ba8d7db0a071 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/skbuff.h>
+#include <linux/if_vlan.h>
 #include <linux/etherdevice.h>
 #include <linux/bitmap.h>
 #include <linux/rcupdate.h>
@@ -63,6 +64,10 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 	struct ieee80211_chanctx_conf *chanctx_conf;
 	u32 rate_flags = 0;
 
+	/* assume HW handles this */
+	if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
+		return 0;
+
 	rcu_read_lock();
 	chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf);
 	if (chanctx_conf) {
@@ -71,10 +76,6 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
 	}
 	rcu_read_unlock();
 
-	/* assume HW handles this */
-	if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
-		return 0;
-
 	/* uh huh? */
 	if (WARN_ON_ONCE(tx->rate.idx < 0))
 		return 0;
@@ -1413,7 +1414,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 		txqi->txq.sta = &sta->sta;
 		sta->sta.txq[tid] = &txqi->txq;
 		txqi->txq.tid = tid;
-		txqi->txq.ac = ieee802_1d_to_ac[tid & 7];
+		txqi->txq.ac = ieee80211_ac_from_tid(tid);
 	} else {
 		sdata->vif.txq = &txqi->txq;
 		txqi->txq.tid = 0;
@@ -3571,6 +3572,115 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 	rcu_read_unlock();
 }
 
+static int ieee80211_change_da(struct sk_buff *skb, struct sta_info *sta)
+{
+	struct ethhdr *eth;
+	int err;
+
+	err = skb_ensure_writable(skb, ETH_HLEN);
+	if (unlikely(err))
+		return err;
+
+	eth = (void *)skb->data;
+	ether_addr_copy(eth->h_dest, sta->sta.addr);
+
+	return 0;
+}
+
+static bool ieee80211_multicast_to_unicast(struct sk_buff *skb,
+					   struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	const struct ethhdr *eth = (void *)skb->data;
+	const struct vlan_ethhdr *ethvlan = (void *)skb->data;
+	__be16 ethertype;
+
+	if (likely(!is_multicast_ether_addr(eth->h_dest)))
+		return false;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP_VLAN:
+		if (sdata->u.vlan.sta)
+			return false;
+		if (sdata->wdev.use_4addr)
+			return false;
+		/* fall through */
+	case NL80211_IFTYPE_AP:
+		/* check runtime toggle for this bss */
+		if (!sdata->bss->multicast_to_unicast)
+			return false;
+		break;
+	default:
+		return false;
+	}
+
+	/* multicast to unicast conversion only for some payload */
+	ethertype = eth->h_proto;
+	if (ethertype == htons(ETH_P_8021Q) && skb->len >= VLAN_ETH_HLEN)
+		ethertype = ethvlan->h_vlan_encapsulated_proto;
+	switch (ethertype) {
+	case htons(ETH_P_ARP):
+	case htons(ETH_P_IP):
+	case htons(ETH_P_IPV6):
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static void
+ieee80211_convert_to_unicast(struct sk_buff *skb, struct net_device *dev,
+			     struct sk_buff_head *queue)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	const struct ethhdr *eth = (struct ethhdr *)skb->data;
+	struct sta_info *sta, *first = NULL;
+	struct sk_buff *cloned_skb;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(sta, &local->sta_list, list) {
+		if (sdata != sta->sdata)
+			/* AP-VLAN mismatch */
+			continue;
+		if (unlikely(ether_addr_equal(eth->h_source, sta->sta.addr)))
+			/* do not send back to source */
+			continue;
+		if (!first) {
+			first = sta;
+			continue;
+		}
+		cloned_skb = skb_clone(skb, GFP_ATOMIC);
+		if (!cloned_skb)
+			goto multicast;
+		if (unlikely(ieee80211_change_da(cloned_skb, sta))) {
+			dev_kfree_skb(cloned_skb);
+			goto multicast;
+		}
+		__skb_queue_tail(queue, cloned_skb);
+	}
+
+	if (likely(first)) {
+		if (unlikely(ieee80211_change_da(skb, first)))
+			goto multicast;
+		__skb_queue_tail(queue, skb);
+	} else {
+		/* no STA connected, drop */
+		kfree_skb(skb);
+		skb = NULL;
+	}
+
+	goto out;
+multicast:
+	__skb_queue_purge(queue);
+	__skb_queue_tail(queue, skb);
+out:
+	rcu_read_unlock();
+}
+
 /**
  * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs
  * @skb: packet to be sent
@@ -3581,7 +3691,17 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
 				       struct net_device *dev)
 {
-	__ieee80211_subif_start_xmit(skb, dev, 0);
+	if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) {
+		struct sk_buff_head queue;
+
+		__skb_queue_head_init(&queue);
+		ieee80211_convert_to_unicast(skb, dev, &queue);
+		while ((skb = __skb_dequeue(&queue)))
+			__ieee80211_subif_start_xmit(skb, dev, 0);
+	} else {
+		__ieee80211_subif_start_xmit(skb, dev, 0);
+	}
+
 	return NETDEV_TX_OK;
 }
 
@@ -4074,7 +4194,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 		}
 
 		if (ifmsh->sync_ops)
-			ifmsh->sync_ops->adjust_tbtt(sdata, beacon);
+			ifmsh->sync_ops->adjust_tsf(sdata, beacon);
 
 		skb = dev_alloc_skb(local->tx_headroom +
 				    beacon->head_len +
@@ -4539,7 +4659,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 				 struct sk_buff *skb, int tid,
 				 enum nl80211_band band)
 {
-	int ac = ieee802_1d_to_ac[tid & 7];
+	int ac = ieee80211_ac_from_tid(tid);
 
 	skb_reset_mac_header(skb);
 	skb_set_queue_mapping(skb, ac);
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 43e45bb660bc..19ec2189d3ac 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -436,14 +436,10 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 				  struct sta_info *sta, u8 opmode,
 				  enum nl80211_band band)
 {
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_supported_band *sband;
 	enum ieee80211_sta_rx_bandwidth new_bw;
 	u32 changed = 0;
 	u8 nss;
 
-	sband = local->hw.wiphy->bands[band];
-
 	/* ignore - no support for BF yet */
 	if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)
 		return 0;
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index efa3f48f1ec5..73e8f347802e 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -293,7 +293,8 @@ ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx)
 			return RX_DROP_UNUSABLE;
 		ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key);
 		/* remove ICV */
-		if (pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN))
+		if (!(status->flag & RX_FLAG_ICV_STRIPPED) &&
+		    pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN))
 			return RX_DROP_UNUSABLE;
 	}
 
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 8af6dd388d11..c1ef22df865f 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -294,7 +294,8 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
 		return RX_DROP_UNUSABLE;
 
 	/* Trim ICV */
-	skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN);
+	if (!(status->flag & RX_FLAG_ICV_STRIPPED))
+		skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN);
 
 	/* Remove IV */
 	memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 5b77377e5a15..3818686182b2 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -7,7 +7,9 @@
 #include <linux/if_arp.h>
 #include <linux/ipv6.h>
 #include <linux/mpls.h>
+#include <linux/netconf.h>
 #include <linux/vmalloc.h>
+#include <linux/percpu.h>
 #include <net/ip.h>
 #include <net/dst.h>
 #include <net/sock.h>
@@ -17,8 +19,8 @@
 #include <net/netns/generic.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
-#include <net/addrconf.h>
 #endif
+#include <net/addrconf.h>
 #include <net/nexthop.h>
 #include "internal.h"
 
@@ -48,11 +50,6 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
 	return rt;
 }
 
-static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
-{
-	return rcu_dereference_rtnl(dev->mpls_ptr);
-}
-
 bool mpls_output_possible(const struct net_device *dev)
 {
 	return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
@@ -98,6 +95,31 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 }
 EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
 
+void mpls_stats_inc_outucastpkts(struct net_device *dev,
+				 const struct sk_buff *skb)
+{
+	struct mpls_dev *mdev;
+
+	if (skb->protocol == htons(ETH_P_MPLS_UC)) {
+		mdev = mpls_dev_get(dev);
+		if (mdev)
+			MPLS_INC_STATS_LEN(mdev, skb->len,
+					   tx_packets,
+					   tx_bytes);
+	} else if (skb->protocol == htons(ETH_P_IP)) {
+		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct inet6_dev *in6dev = __in6_dev_get(dev);
+
+		if (in6dev)
+			IP6_UPD_PO_STATS(dev_net(dev), in6dev,
+					 IPSTATS_MIB_OUT, skb->len);
+#endif
+	}
+}
+EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts);
+
 static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb)
 {
 	struct mpls_entry_decoded dec;
@@ -255,6 +277,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	struct mpls_nh *nh;
 	struct mpls_entry_decoded dec;
 	struct net_device *out_dev;
+	struct mpls_dev *out_mdev;
 	struct mpls_dev *mdev;
 	unsigned int hh_len;
 	unsigned int new_header_size;
@@ -264,34 +287,39 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	/* Careful this entire function runs inside of an rcu critical section */
 
 	mdev = mpls_dev_get(dev);
-	if (!mdev || !mdev->input_enabled)
+	if (!mdev)
 		goto drop;
 
-	if (skb->pkt_type != PACKET_HOST)
+	MPLS_INC_STATS_LEN(mdev, skb->len, rx_packets,
+			   rx_bytes);
+
+	if (!mdev->input_enabled) {
+		MPLS_INC_STATS(mdev, rx_dropped);
 		goto drop;
+	}
+
+	if (skb->pkt_type != PACKET_HOST)
+		goto err;
 
 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
-		goto drop;
+		goto err;
 
 	if (!pskb_may_pull(skb, sizeof(*hdr)))
-		goto drop;
+		goto err;
 
 	/* Read and decode the label */
 	hdr = mpls_hdr(skb);
 	dec = mpls_entry_decode(hdr);
 
 	rt = mpls_route_input_rcu(net, dec.label);
-	if (!rt)
+	if (!rt) {
+		MPLS_INC_STATS(mdev, rx_noroute);
 		goto drop;
+	}
 
 	nh = mpls_select_multipath(rt, skb);
 	if (!nh)
-		goto drop;
-
-	/* Find the output device */
-	out_dev = rcu_dereference(nh->nh_dev);
-	if (!mpls_output_possible(out_dev))
-		goto drop;
+		goto err;
 
 	/* Pop the label */
 	skb_pull(skb, sizeof(*hdr));
@@ -300,20 +328,25 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	skb_orphan(skb);
 
 	if (skb_warn_if_lro(skb))
-		goto drop;
+		goto err;
 
 	skb_forward_csum(skb);
 
 	/* Verify ttl is valid */
 	if (dec.ttl <= 1)
-		goto drop;
+		goto err;
 	dec.ttl -= 1;
 
+	/* Find the output device */
+	out_dev = rcu_dereference(nh->nh_dev);
+	if (!mpls_output_possible(out_dev))
+		goto tx_err;
+
 	/* Verify the destination can hold the packet */
 	new_header_size = mpls_nh_header_size(nh);
 	mtu = mpls_dev_mtu(out_dev);
 	if (mpls_pkt_too_big(skb, mtu - new_header_size))
-		goto drop;
+		goto tx_err;
 
 	hh_len = LL_RESERVED_SPACE(out_dev);
 	if (!out_dev->header_ops)
@@ -321,7 +354,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 
 	/* Ensure there is enough space for the headers in the skb */
 	if (skb_cow(skb, hh_len + new_header_size))
-		goto drop;
+		goto tx_err;
 
 	skb->dev = out_dev;
 	skb->protocol = htons(ETH_P_MPLS_UC);
@@ -329,7 +362,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	if (unlikely(!new_header_size && dec.bos)) {
 		/* Penultimate hop popping */
 		if (!mpls_egress(rt, skb, dec))
-			goto drop;
+			goto err;
 	} else {
 		bool bos;
 		int i;
@@ -345,6 +378,8 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
+	mpls_stats_inc_outucastpkts(out_dev, skb);
+
 	/* If via wasn't specified then send out using device address */
 	if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC)
 		err = neigh_xmit(NEIGH_LINK_TABLE, out_dev,
@@ -357,6 +392,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 				    __func__, err);
 	return 0;
 
+tx_err:
+	out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL;
+	if (out_mdev)
+		MPLS_INC_STATS(out_mdev, tx_errors);
+	goto drop;
+err:
+	MPLS_INC_STATS(mdev, rx_errors);
 drop:
 	kfree_skb(skb);
 	return NET_RX_DROP;
@@ -855,15 +897,279 @@ errout:
 	return err;
 }
 
+static void mpls_get_stats(struct mpls_dev *mdev,
+			   struct mpls_link_stats *stats)
+{
+	struct mpls_pcpu_stats *p;
+	int i;
+
+	memset(stats, 0, sizeof(*stats));
+
+	for_each_possible_cpu(i) {
+		struct mpls_link_stats local;
+		unsigned int start;
+
+		p = per_cpu_ptr(mdev->stats, i);
+		do {
+			start = u64_stats_fetch_begin(&p->syncp);
+			local = p->stats;
+		} while (u64_stats_fetch_retry(&p->syncp, start));
+
+		stats->rx_packets	+= local.rx_packets;
+		stats->rx_bytes		+= local.rx_bytes;
+		stats->tx_packets	+= local.tx_packets;
+		stats->tx_bytes		+= local.tx_bytes;
+		stats->rx_errors	+= local.rx_errors;
+		stats->tx_errors	+= local.tx_errors;
+		stats->rx_dropped	+= local.rx_dropped;
+		stats->tx_dropped	+= local.tx_dropped;
+		stats->rx_noroute	+= local.rx_noroute;
+	}
+}
+
+static int mpls_fill_stats_af(struct sk_buff *skb,
+			      const struct net_device *dev)
+{
+	struct mpls_link_stats *stats;
+	struct mpls_dev *mdev;
+	struct nlattr *nla;
+
+	mdev = mpls_dev_get(dev);
+	if (!mdev)
+		return -ENODATA;
+
+	nla = nla_reserve_64bit(skb, MPLS_STATS_LINK,
+				sizeof(struct mpls_link_stats),
+				MPLS_STATS_UNSPEC);
+	if (!nla)
+		return -EMSGSIZE;
+
+	stats = nla_data(nla);
+	mpls_get_stats(mdev, stats);
+
+	return 0;
+}
+
+static size_t mpls_get_stats_af_size(const struct net_device *dev)
+{
+	struct mpls_dev *mdev;
+
+	mdev = mpls_dev_get(dev);
+	if (!mdev)
+		return 0;
+
+	return nla_total_size_64bit(sizeof(struct mpls_link_stats));
+}
+
+static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev,
+				     u32 portid, u32 seq, int event,
+				     unsigned int flags, int type)
+{
+	struct nlmsghdr  *nlh;
+	struct netconfmsg *ncm;
+	bool all = false;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
+			flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	ncm = nlmsg_data(nlh);
+	ncm->ncm_family = AF_MPLS;
+
+	if (nla_put_s32(skb, NETCONFA_IFINDEX, mdev->dev->ifindex) < 0)
+		goto nla_put_failure;
+
+	if ((all || type == NETCONFA_INPUT) &&
+	    nla_put_s32(skb, NETCONFA_INPUT,
+			mdev->input_enabled) < 0)
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int mpls_netconf_msgsize_devconf(int type)
+{
+	int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+			+ nla_total_size(4); /* NETCONFA_IFINDEX */
+	bool all = false;
+
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	if (all || type == NETCONFA_INPUT)
+		size += nla_total_size(4);
+
+	return size;
+}
+
+static void mpls_netconf_notify_devconf(struct net *net, int type,
+					struct mpls_dev *mdev)
+{
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(mpls_netconf_msgsize_devconf(type), GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
+	err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, RTM_NEWNETCONF,
+					0, type);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_MPLS_NETCONF, NULL, GFP_KERNEL);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_MPLS_NETCONF, err);
+}
+
+static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = {
+	[NETCONFA_IFINDEX]	= { .len = sizeof(int) },
+};
+
+static int mpls_netconf_get_devconf(struct sk_buff *in_skb,
+				    struct nlmsghdr *nlh)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct nlattr *tb[NETCONFA_MAX + 1];
+	struct netconfmsg *ncm;
+	struct net_device *dev;
+	struct mpls_dev *mdev;
+	struct sk_buff *skb;
+	int ifindex;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
+			  devconf_mpls_policy);
+	if (err < 0)
+		goto errout;
+
+	err = -EINVAL;
+	if (!tb[NETCONFA_IFINDEX])
+		goto errout;
+
+	ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+	dev = __dev_get_by_index(net, ifindex);
+	if (!dev)
+		goto errout;
+
+	mdev = mpls_dev_get(dev);
+	if (!mdev)
+		goto errout;
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
+	err = mpls_netconf_fill_devconf(skb, mdev,
+					NETLINK_CB(in_skb).portid,
+					nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+					NETCONFA_ALL);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+	return err;
+}
+
+static int mpls_netconf_dump_devconf(struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct hlist_head *head;
+	struct net_device *dev;
+	struct mpls_dev *mdev;
+	int idx, s_idx;
+	int h, s_h;
+
+	s_h = cb->args[0];
+	s_idx = idx = cb->args[1];
+
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		idx = 0;
+		head = &net->dev_index_head[h];
+		rcu_read_lock();
+		cb->seq = net->dev_base_seq;
+		hlist_for_each_entry_rcu(dev, head, index_hlist) {
+			if (idx < s_idx)
+				goto cont;
+			mdev = mpls_dev_get(dev);
+			if (!mdev)
+				goto cont;
+			if (mpls_netconf_fill_devconf(skb, mdev,
+						      NETLINK_CB(cb->skb).portid,
+						      cb->nlh->nlmsg_seq,
+						      RTM_NEWNETCONF,
+						      NLM_F_MULTI,
+						      NETCONFA_ALL) < 0) {
+				rcu_read_unlock();
+				goto done;
+			}
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+			idx++;
+		}
+		rcu_read_unlock();
+	}
+done:
+	cb->args[0] = h;
+	cb->args[1] = idx;
+
+	return skb->len;
+}
+
 #define MPLS_PERDEV_SYSCTL_OFFSET(field)	\
 	(&((struct mpls_dev *)0)->field)
 
+static int mpls_conf_proc(struct ctl_table *ctl, int write,
+			  void __user *buffer,
+			  size_t *lenp, loff_t *ppos)
+{
+	int oval = *(int *)ctl->data;
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+	if (write) {
+		struct mpls_dev *mdev = ctl->extra1;
+		int i = (int *)ctl->data - (int *)mdev;
+		struct net *net = ctl->extra2;
+		int val = *(int *)ctl->data;
+
+		if (i == offsetof(struct mpls_dev, input_enabled) &&
+		    val != oval) {
+			mpls_netconf_notify_devconf(net,
+						    NETCONFA_INPUT,
+						    mdev);
+		}
+	}
+
+	return ret;
+}
+
 static const struct ctl_table mpls_dev_table[] = {
 	{
 		.procname	= "input",
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= mpls_conf_proc,
 		.data		= MPLS_PERDEV_SYSCTL_OFFSET(input_enabled),
 	},
 	{ }
@@ -873,6 +1179,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
 				    struct mpls_dev *mdev)
 {
 	char path[sizeof("net/mpls/conf/") + IFNAMSIZ];
+	struct net *net = dev_net(dev);
 	struct ctl_table *table;
 	int i;
 
@@ -883,8 +1190,11 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
 	/* Table data contains only offsets relative to the base of
 	 * the mdev at this point, so make them absolute.
 	 */
-	for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++)
+	for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) {
 		table[i].data = (char *)mdev + (uintptr_t)table[i].data;
+		table[i].extra1 = mdev;
+		table[i].extra2 = net;
+	}
 
 	snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name);
 
@@ -913,6 +1223,7 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
 {
 	struct mpls_dev *mdev;
 	int err = -ENOMEM;
+	int i;
 
 	ASSERT_RTNL();
 
@@ -920,19 +1231,40 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
 	if (!mdev)
 		return ERR_PTR(err);
 
+	mdev->stats = alloc_percpu(struct mpls_pcpu_stats);
+	if (!mdev->stats)
+		goto free;
+
+	for_each_possible_cpu(i) {
+		struct mpls_pcpu_stats *mpls_stats;
+
+		mpls_stats = per_cpu_ptr(mdev->stats, i);
+		u64_stats_init(&mpls_stats->syncp);
+	}
+
 	err = mpls_dev_sysctl_register(dev, mdev);
 	if (err)
 		goto free;
 
+	mdev->dev = dev;
 	rcu_assign_pointer(dev->mpls_ptr, mdev);
 
 	return mdev;
 
 free:
+	free_percpu(mdev->stats);
 	kfree(mdev);
 	return ERR_PTR(err);
 }
 
+static void mpls_dev_destroy_rcu(struct rcu_head *head)
+{
+	struct mpls_dev *mdev = container_of(head, struct mpls_dev, rcu);
+
+	free_percpu(mdev->stats);
+	kfree(mdev);
+}
+
 static void mpls_ifdown(struct net_device *dev, int event)
 {
 	struct mpls_route __rcu **platform_label;
@@ -1047,7 +1379,7 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
 		if (mdev) {
 			mpls_dev_sysctl_unregister(mdev);
 			RCU_INIT_POINTER(dev->mpls_ptr, NULL);
-			kfree_rcu(mdev, rcu);
+			call_rcu(&mdev->rcu, mpls_dev_destroy_rcu);
 		}
 		break;
 	case NETDEV_CHANGENAME:
@@ -1708,6 +2040,12 @@ static struct pernet_operations mpls_net_ops = {
 	.exit = mpls_net_exit,
 };
 
+static struct rtnl_af_ops mpls_af_ops __read_mostly = {
+	.family		   = AF_MPLS,
+	.fill_stats_af	   = mpls_fill_stats_af,
+	.get_stats_af_size = mpls_get_stats_af_size,
+};
+
 static int __init mpls_init(void)
 {
 	int err;
@@ -1724,9 +2062,13 @@ static int __init mpls_init(void)
 
 	dev_add_pack(&mpls_packet_type);
 
+	rtnl_af_register(&mpls_af_ops);
+
 	rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL);
 	rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL);
 	rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL);
+	rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
+		      mpls_netconf_dump_devconf, NULL);
 	err = 0;
 out:
 	return err;
@@ -1740,6 +2082,7 @@ module_init(mpls_init);
 static void __exit mpls_exit(void)
 {
 	rtnl_unregister_all(PF_MPLS);
+	rtnl_af_unregister(&mpls_af_ops);
 	dev_remove_pack(&mpls_packet_type);
 	unregister_netdevice_notifier(&mpls_dev_notifier);
 	unregister_pernet_subsys(&mpls_net_ops);
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index bdfef6c3271a..76360d8b9579 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -9,13 +9,58 @@ struct mpls_entry_decoded {
 	u8 bos;
 };
 
+struct mpls_pcpu_stats {
+	struct mpls_link_stats	stats;
+	struct u64_stats_sync	syncp;
+};
+
 struct mpls_dev {
-	int			input_enabled;
+	int				input_enabled;
+	struct net_device		*dev;
+	struct mpls_pcpu_stats __percpu	*stats;
 
-	struct ctl_table_header *sysctl;
-	struct rcu_head		rcu;
+	struct ctl_table_header		*sysctl;
+	struct rcu_head			rcu;
 };
 
+#if BITS_PER_LONG == 32
+
+#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field)		\
+	do {								\
+		__typeof__(*(mdev)->stats) *ptr =			\
+			raw_cpu_ptr((mdev)->stats);			\
+		local_bh_disable();					\
+		u64_stats_update_begin(&ptr->syncp);			\
+		ptr->stats.pkts_field++;				\
+		ptr->stats.bytes_field += (len);			\
+		u64_stats_update_end(&ptr->syncp);			\
+		local_bh_enable();					\
+	} while (0)
+
+#define MPLS_INC_STATS(mdev, field)					\
+	do {								\
+		__typeof__(*(mdev)->stats) *ptr =			\
+			raw_cpu_ptr((mdev)->stats);			\
+		local_bh_disable();					\
+		u64_stats_update_begin(&ptr->syncp);			\
+		ptr->stats.field++;					\
+		u64_stats_update_end(&ptr->syncp);			\
+		local_bh_enable();					\
+	} while (0)
+
+#else
+
+#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field)		\
+	do {								\
+		this_cpu_inc((mdev)->stats->stats.pkts_field);		\
+		this_cpu_add((mdev)->stats->stats.bytes_field, (len));	\
+	} while (0)
+
+#define MPLS_INC_STATS(mdev, field)			\
+	this_cpu_inc((mdev)->stats->stats.field)
+
+#endif
+
 struct sk_buff;
 
 #define LABEL_NOT_SPECIFIED (1 << 20)
@@ -114,6 +159,11 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
 	return result;
 }
 
+static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
+{
+	return rcu_dereference_rtnl(dev->mpls_ptr);
+}
+
 int nla_put_labels(struct sk_buff *skb, int attrtype,  u8 labels,
 		   const u32 label[]);
 int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels,
@@ -123,5 +173,7 @@ int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table,
 bool mpls_output_possible(const struct net_device *dev);
 unsigned int mpls_dev_mtu(const struct net_device *dev);
 bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);
+void mpls_stats_inc_outucastpkts(struct net_device *dev,
+				 const struct sk_buff *skb);
 
 #endif /* MPLS_INTERNAL_H */
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 1d281c1ff7c1..e4e4424f9eb1 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -48,11 +48,15 @@ static int mpls_xmit(struct sk_buff *skb)
 	struct dst_entry *dst = skb_dst(skb);
 	struct rtable *rt = NULL;
 	struct rt6_info *rt6 = NULL;
+	struct mpls_dev *out_mdev;
 	int err = 0;
 	bool bos;
 	int i;
 	unsigned int ttl;
 
+	/* Find the output device */
+	out_dev = dst->dev;
+
 	/* Obtain the ttl */
 	if (dst->ops->family == AF_INET) {
 		ttl = ip_hdr(skb)->ttl;
@@ -66,8 +70,6 @@ static int mpls_xmit(struct sk_buff *skb)
 
 	skb_orphan(skb);
 
-	/* Find the output device */
-	out_dev = dst->dev;
 	if (!mpls_output_possible(out_dev) ||
 	    !dst->lwtstate || skb_warn_if_lro(skb))
 		goto drop;
@@ -109,6 +111,8 @@ static int mpls_xmit(struct sk_buff *skb)
 		bos = false;
 	}
 
+	mpls_stats_inc_outucastpkts(out_dev, skb);
+
 	if (rt)
 		err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
 				 skb);
@@ -122,11 +126,14 @@ static int mpls_xmit(struct sk_buff *skb)
 	return LWTUNNEL_XMIT_DONE;
 
 drop:
+	out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL;
+	if (out_mdev)
+		MPLS_INC_STATS(out_mdev, tx_errors);
 	kfree_skb(skb);
 	return -EINVAL;
 }
 
-static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
+static int mpls_build_state(struct nlattr *nla,
 			    unsigned int family, const void *cfg,
 			    struct lwtunnel_state **ts)
 {
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index bbc45f8a7b2d..9b28864cc36a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -162,6 +162,7 @@ config NF_CT_PROTO_SCTP
 	bool 'SCTP protocol connection tracking support'
 	depends on NETFILTER_ADVANCED
 	default y
+	select LIBCRC32C
 	help
 	  With this option enabled, the layer 3 independent connection
 	  tracking code will be able to do state tracking on SCTP connections.
@@ -397,7 +398,6 @@ config NF_NAT_PROTO_SCTP
 	bool
 	default NF_NAT && NF_CT_PROTO_SCTP
 	depends on NF_NAT && NF_CT_PROTO_SCTP
-	select LIBCRC32C
 
 config NF_NAT_AMANDA
 	tristate
@@ -467,10 +467,10 @@ config NF_TABLES_NETDEV
 	  This option enables support for the "netdev" table.
 
 config NFT_EXTHDR
-	tristate "Netfilter nf_tables IPv6 exthdr module"
+	tristate "Netfilter nf_tables exthdr module"
 	help
 	  This option adds the "exthdr" expression that you can use to match
-	  IPv6 extension headers.
+	  IPv6 extension headers and tcp options.
 
 config NFT_META
 	tristate "Netfilter nf_tables meta module"
@@ -509,6 +509,12 @@ config NFT_SET_HASH
 	  This option adds the "hash" set type that is used to build one-way
 	  mappings between matchings and actions.
 
+config NFT_SET_BITMAP
+	tristate "Netfilter nf_tables bitmap set module"
+	help
+	  This option adds the "bitmap" set type that is used to build sets
+	  whose keys are smaller or equal to 16 bits.
+
 config NFT_COUNTER
 	tristate "Netfilter nf_tables counter module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index ca30d1960f1d..c9b78e7b342f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -7,7 +7,6 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
-nf_conntrack-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
 
@@ -47,7 +46,6 @@ nf_nat-y	:= nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
 # NAT protocols (nf_nat)
 nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
 nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
-nf_nat-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
 
 # generic transport layer logging
 obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
@@ -95,6 +93,7 @@ obj-$(CONFIG_NFT_REJECT) 	+= nft_reject.o
 obj-$(CONFIG_NFT_REJECT_INET)	+= nft_reject_inet.o
 obj-$(CONFIG_NFT_SET_RBTREE)	+= nft_set_rbtree.o
 obj-$(CONFIG_NFT_SET_HASH)	+= nft_set_hash.o
+obj-$(CONFIG_NFT_SET_BITMAP)	+= nft_set_bitmap.o
 obj-$(CONFIG_NFT_COUNTER)	+= nft_counter.o
 obj-$(CONFIG_NFT_LOG)		+= nft_log.o
 obj-$(CONFIG_NFT_MASQ)		+= nft_masq.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index ce6adfae521a..a87a6f8a74d8 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -375,7 +375,7 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
 {
 	void (*attach)(struct sk_buff *, const struct sk_buff *);
 
-	if (skb->nfct) {
+	if (skb->_nfct) {
 		rcu_read_lock();
 		attach = rcu_dereference(ip_ct_attach);
 		if (attach)
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 1b05d4a7d5a1..f236c0bc7b3f 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -897,7 +897,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 					continue;
 				data = ahash_data(n, j, dsize);
 				memcpy(tmp->value + k * dsize, data, dsize);
-				set_bit(j, tmp->used);
+				set_bit(k, tmp->used);
 				k++;
 			}
 			tmp->pos = k;
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 51077c53d76b..178d4eba013b 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -260,11 +260,14 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		else
 			prev = e;
 	}
+
+	/* If before/after is used on an empty set */
+	if ((d->before > 0 && !next) ||
+	    (d->before < 0 && !prev))
+		return -IPSET_ERR_REF_EXIST;
+
 	/* Re-add already existing element */
 	if (n) {
-		if ((d->before > 0 && !next) ||
-		    (d->before < 0 && !prev))
-			return -IPSET_ERR_REF_EXIST;
 		if (!flag_exist)
 			return -IPSET_ERR_EXIST;
 		/* Update extensions */
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 096a45103f14..e6a2753dff9e 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1429,7 +1429,7 @@ int __init ip_vs_conn_init(void)
 		"(size=%d, memory=%ldKbytes)\n",
 		ip_vs_conn_tab_size,
 		(long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
-	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+	IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
 		  sizeof(struct ip_vs_conn));
 
 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 55e0169caa4c..5aeb0dde6ccc 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -426,10 +426,9 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol
 	 */
 	svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
 
-	if (svc == NULL
-	    && protocol == IPPROTO_TCP
-	    && atomic_read(&ipvs->ftpsvc_counter)
-	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
+	if (!svc && protocol == IPPROTO_TCP &&
+	    atomic_read(&ipvs->ftpsvc_counter) &&
+	    (vport == FTPDATA || ntohs(vport) >= inet_prot_sock(ipvs->net))) {
 		/*
 		 * Check if ftp service entry exists, the packet
 		 * might belong to FTP data connections.
@@ -711,7 +710,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
 		      dest->vport == svc->port))) {
 			/* HIT */
 			list_del(&dest->t_list);
-			ip_vs_dest_hold(dest);
 			goto out;
 		}
 	}
@@ -741,7 +739,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
  *  When the ip_vs_control_clearup is activated by ipvs module exit,
  *  the service tables must have been flushed and all the connections
  *  are expired, and the refcnt of each destination in the trash must
- *  be 0, so we simply release them here.
+ *  be 1, so we simply release them here.
  */
 static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
 {
@@ -1080,11 +1078,10 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
 	if (list_empty(&ipvs->dest_trash) && !cleanup)
 		mod_timer(&ipvs->dest_trash_timer,
 			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
-	/* dest lives in trash without reference */
+	/* dest lives in trash with reference */
 	list_add(&dest->t_list, &ipvs->dest_trash);
 	dest->idle_start = 0;
 	spin_unlock_bh(&ipvs->dest_trash_lock);
-	ip_vs_dest_put(dest);
 }
 
 
@@ -1160,7 +1157,7 @@ static void ip_vs_dest_trash_expire(unsigned long data)
 
 	spin_lock(&ipvs->dest_trash_lock);
 	list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
-		if (atomic_read(&dest->refcnt) > 0)
+		if (atomic_read(&dest->refcnt) > 1)
 			continue;
 		if (dest->idle_start) {
 			if (time_before(now, dest->idle_start +
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 6be5c538b71e..75f798f8e83b 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -163,7 +163,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
 		return -ENOMEM;
 
 	svc->sched_data = s;
-	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
+	IP_VS_DBG(6, "DH hash table (memory=%zdbytes) allocated for "
 		  "current service\n",
 		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
 
@@ -183,7 +183,7 @@ static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
 
 	/* release the table itself */
 	kfree_rcu(s, rcu_head);
-	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
+	IP_VS_DBG(6, "DH hash table (memory=%zdbytes) released\n",
 		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index cccf4d637412..5824927cf8e0 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -356,7 +356,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 		return -ENOMEM;
 
 	svc->sched_data = tbl;
-	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
+	IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for "
 		  "current service\n", sizeof(*tbl));
 
 	/*
@@ -393,7 +393,7 @@ static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
 
 	/* release the table itself */
 	kfree_rcu(tbl, rcu_head);
-	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
+	IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n",
 		  sizeof(*tbl));
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 796d70e47ddd..703f11877bee 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -519,7 +519,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
 		return -ENOMEM;
 
 	svc->sched_data = tbl;
-	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
+	IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) allocated for "
 		  "current service\n", sizeof(*tbl));
 
 	/*
@@ -556,7 +556,7 @@ static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
 
 	/* release the table itself */
 	kfree_rcu(tbl, rcu_head);
-	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+	IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) released\n",
 		  sizeof(*tbl));
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 1e373a5e44e3..16aaac6eedc9 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -239,7 +239,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
 		return -ENOMEM;
 
 	svc->sched_data = s;
-	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+	IP_VS_DBG(6, "SH hash table (memory=%zdbytes) allocated for "
 		  "current service\n",
 		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
 
@@ -259,7 +259,7 @@ static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
 
 	/* release the table itself */
 	kfree_rcu(s, rcu_head);
-	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+	IP_VS_DBG(6, "SH hash table (memory=%zdbytes) released\n",
 		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 9350530c16c1..b03c28084f81 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1791,7 +1791,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 	u16 mtu, min_mtu;
 
 	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
-	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
 		  sizeof(struct ip_vs_sync_conn_v0));
 
 	if (!ipvs->sync_state) {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 4e8083c5e01d..071b97fcbefb 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -350,16 +350,31 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
 	spin_unlock(&pcpu->lock);
 }
 
+#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
+
 /* Released via destroy_conntrack() */
 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
 				 const struct nf_conntrack_zone *zone,
 				 gfp_t flags)
 {
-	struct nf_conn *tmpl;
+	struct nf_conn *tmpl, *p;
 
-	tmpl = kzalloc(sizeof(*tmpl), flags);
-	if (tmpl == NULL)
-		return NULL;
+	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
+		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
+		if (!tmpl)
+			return NULL;
+
+		p = tmpl;
+		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
+		if (tmpl != p) {
+			tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
+			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
+		}
+	} else {
+		tmpl = kzalloc(sizeof(*tmpl), flags);
+		if (!tmpl)
+			return NULL;
+	}
 
 	tmpl->status = IPS_TEMPLATE;
 	write_pnet(&tmpl->ct_net, net);
@@ -374,7 +389,11 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl)
 {
 	nf_ct_ext_destroy(tmpl);
 	nf_ct_ext_free(tmpl);
-	kfree(tmpl);
+
+	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
+		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
+	else
+		kfree(tmpl);
 }
 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
 
@@ -686,12 +705,12 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
 	    !nfct_nat(ct) &&
 	    !nf_ct_is_dying(ct) &&
 	    atomic_inc_not_zero(&ct->ct_general.use)) {
-		nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
-		nf_conntrack_put(skb->nfct);
-		/* Assign conntrack already in hashes to this skbuff. Don't
-		 * modify skb->nfctinfo to ensure consistent stateful filtering.
-		 */
-		skb->nfct = &ct->ct_general;
+		enum ip_conntrack_info oldinfo;
+		struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
+
+		nf_ct_acct_merge(ct, ctinfo, loser_ct);
+		nf_conntrack_put(&loser_ct->ct_general);
+		nf_ct_set(skb, ct, oldinfo);
 		return NF_ACCEPT;
 	}
 	NF_CT_STAT_INC(net, drop);
@@ -1218,7 +1237,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
 }
 
-/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
+/* On success, returns conntrack ptr, sets skb->_nfct | ctinfo */
 static inline struct nf_conn *
 resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 		  struct sk_buff *skb,
@@ -1277,8 +1296,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 		}
 		*set_reply = 0;
 	}
-	skb->nfct = &ct->ct_general;
-	skb->nfctinfo = *ctinfo;
+	nf_ct_set(skb, ct, *ctinfo);
 	return ct;
 }
 
@@ -1286,7 +1304,7 @@ unsigned int
 nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 		struct sk_buff *skb)
 {
-	struct nf_conn *ct, *tmpl = NULL;
+	struct nf_conn *ct, *tmpl;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conntrack_l3proto *l3proto;
 	struct nf_conntrack_l4proto *l4proto;
@@ -1296,14 +1314,14 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 	int set_reply = 0;
 	int ret;
 
-	if (skb->nfct) {
+	tmpl = nf_ct_get(skb, &ctinfo);
+	if (tmpl) {
 		/* Previously seen (loopback or untracked)?  Ignore. */
-		tmpl = (struct nf_conn *)skb->nfct;
 		if (!nf_ct_is_template(tmpl)) {
 			NF_CT_STAT_INC_ATOMIC(net, ignore);
 			return NF_ACCEPT;
 		}
-		skb->nfct = NULL;
+		skb->_nfct = 0;
 	}
 
 	/* rcu_read_lock()ed by nf_hook_thresh */
@@ -1324,8 +1342,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 	 * inverse of the return code tells to the netfilter
 	 * core what to do with the packet. */
 	if (l4proto->error != NULL) {
-		ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
-				     pf, hooknum);
+		ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum);
 		if (ret <= 0) {
 			NF_CT_STAT_INC_ATOMIC(net, error);
 			NF_CT_STAT_INC_ATOMIC(net, invalid);
@@ -1333,7 +1350,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 			goto out;
 		}
 		/* ICMP[v6] protocol trackers may assign one conntrack. */
-		if (skb->nfct)
+		if (skb->_nfct)
 			goto out;
 	}
 repeat:
@@ -1353,7 +1370,7 @@ repeat:
 		goto out;
 	}
 
-	NF_CT_ASSERT(skb->nfct);
+	NF_CT_ASSERT(skb_nfct(skb));
 
 	/* Decide what timeout policy we want to apply to this flow. */
 	timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
@@ -1363,8 +1380,8 @@ repeat:
 		/* Invalid: inverse of the return code tells
 		 * the netfilter core what to do */
 		pr_debug("nf_conntrack_in: Can't track with proto module\n");
-		nf_conntrack_put(skb->nfct);
-		skb->nfct = NULL;
+		nf_conntrack_put(&ct->ct_general);
+		skb->_nfct = 0;
 		NF_CT_STAT_INC_ATOMIC(net, invalid);
 		if (ret == -NF_DROP)
 			NF_CT_STAT_INC_ATOMIC(net, drop);
@@ -1522,9 +1539,8 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
 		ctinfo = IP_CT_RELATED;
 
 	/* Attach to new skbuff, and increment count */
-	nskb->nfct = &ct->ct_general;
-	nskb->nfctinfo = ctinfo;
-	nf_conntrack_get(nskb->nfct);
+	nf_ct_set(nskb, ct, ctinfo);
+	nf_conntrack_get(skb_nfct(nskb));
 }
 
 /* Bring out ya dead! */
@@ -1860,7 +1876,8 @@ int nf_conntrack_init_start(void)
 	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
 
 	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
-						sizeof(struct nf_conn), 0,
+						sizeof(struct nf_conn),
+						NFCT_INFOMASK + 1,
 						SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
 	if (!nf_conntrack_cachep)
 		goto err_cachep;
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index f8dbacf66795..4b2e1fb28bb4 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -353,7 +353,7 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp)
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_put);
 
-static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 {
 	struct nf_conn_help *master_help = nfct_help(exp->master);
 	struct nf_conntrack_helper *helper;
@@ -380,7 +380,6 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 	add_timer(&exp->timeout);
 
 	NF_CT_STAT_INC(net, expect_create);
-	return 0;
 }
 
 /* Race with expectations being used means we could have none to find; OK. */
@@ -411,7 +410,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
 	struct net *net = nf_ct_exp_net(expect);
 	struct hlist_node *next;
 	unsigned int h;
-	int ret = 1;
+	int ret = 0;
 
 	if (!master_help) {
 		ret = -ESHUTDOWN;
@@ -461,15 +460,14 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
 
 	spin_lock_bh(&nf_conntrack_expect_lock);
 	ret = __nf_ct_expect_check(expect);
-	if (ret <= 0)
-		goto out;
-
-	ret = nf_ct_expect_insert(expect);
 	if (ret < 0)
 		goto out;
+
+	nf_ct_expect_insert(expect);
+
 	spin_unlock_bh(&nf_conntrack_expect_lock);
 	nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
-	return ret;
+	return 0;
 out:
 	spin_unlock_bh(&nf_conntrack_expect_lock);
 	return ret;
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index e3ed20060878..4aecef4a89fb 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -300,7 +300,7 @@ static int find_pattern(const char *data, size_t dlen,
 {
 	size_t i = plen;
 
-	pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
+	pr_debug("find_pattern `%s': dlen = %zu\n", pattern, dlen);
 
 	if (dlen <= plen) {
 		/* Short packet: try for partial? */
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 7341adf7059d..6dc44d9b4190 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -188,6 +188,26 @@ nf_ct_helper_ext_add(struct nf_conn *ct,
 }
 EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
 
+static struct nf_conntrack_helper *
+nf_ct_lookup_helper(struct nf_conn *ct, struct net *net)
+{
+	if (!net->ct.sysctl_auto_assign_helper) {
+		if (net->ct.auto_assign_helper_warned)
+			return NULL;
+		if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple))
+			return NULL;
+		pr_info("nf_conntrack: default automatic helper assignment "
+			"has been turned off for security reasons and CT-based "
+			" firewall rule not found. Use the iptables CT target "
+			"to attach helpers instead.\n");
+		net->ct.auto_assign_helper_warned = 1;
+		return NULL;
+	}
+
+	return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+}
+
+
 int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 			      gfp_t flags)
 {
@@ -213,21 +233,14 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 	}
 
 	help = nfct_help(ct);
-	if (net->ct.sysctl_auto_assign_helper && helper == NULL) {
-		helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-		if (unlikely(!net->ct.auto_assign_helper_warned && helper)) {
-			pr_info("nf_conntrack: automatic helper "
-				"assignment is deprecated and it will "
-				"be removed soon. Use the iptables CT target "
-				"to attach helpers instead.\n");
-			net->ct.auto_assign_helper_warned = true;
-		}
-	}
 
 	if (helper == NULL) {
-		if (help)
-			RCU_INIT_POINTER(help->helper, NULL);
-		return 0;
+		helper = nf_ct_lookup_helper(ct, net);
+		if (helper == NULL) {
+			if (help)
+				RCU_INIT_POINTER(help->helper, NULL);
+			return 0;
+		}
 	}
 
 	if (help == NULL) {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 27540455dc62..6806b5e73567 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1478,14 +1478,23 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
 	struct nlattr *helpinfo = NULL;
 	int err;
 
-	/* don't change helper of sibling connections */
-	if (ct->master)
-		return -EBUSY;
-
 	err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
 	if (err < 0)
 		return err;
 
+	/* don't change helper of sibling connections */
+	if (ct->master) {
+		/* If we try to change the helper to the same thing twice,
+		 * treat the second attempt as a no-op instead of returning
+		 * an error.
+		 */
+		if (help && help->helper &&
+		    !strcmp(help->helper->name, helpname))
+			return 0;
+		else
+			return -EBUSY;
+	}
+
 	if (!strcmp(helpname, "")) {
 		if (help && help->helper) {
 			/* we had a helper before ... */
@@ -2270,6 +2279,30 @@ nla_put_failure:
 }
 
 static int
+ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+	unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
+	unsigned long d = ct->status ^ status;
+
+	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+		/* SEEN_REPLY bit can only be set */
+		return -EBUSY;
+
+	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+		/* ASSURED bit can only be set */
+		return -EBUSY;
+
+	/* This check is less strict than ctnetlink_change_status()
+	 * because callers often flip IPS_EXPECTED bits when sending
+	 * an NFQA_CT attribute to the kernel.  So ignore the
+	 * unchangeable bits but do not error out.
+	 */
+	ct->status = (status & ~IPS_UNCHANGEABLE_MASK) |
+		     (ct->status & IPS_UNCHANGEABLE_MASK);
+	return 0;
+}
+
+static int
 ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
 {
 	int err;
@@ -2280,7 +2313,7 @@ ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
 			return err;
 	}
 	if (cda[CTA_STATUS]) {
-		err = ctnetlink_change_status(ct, cda);
+		err = ctnetlink_update_status(ct, cda);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index b68ce6ac13b3..93dd1c5b7bff 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -561,7 +561,6 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 
 static int dccp_error(struct net *net, struct nf_conn *tmpl,
 		      struct sk_buff *skb, unsigned int dataoff,
-		      enum ip_conntrack_info *ctinfo,
 		      u_int8_t pf, unsigned int hooknum)
 {
 	struct dccp_hdr _dh, *dh;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index a0efde38da44..33279aab583d 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -22,7 +22,9 @@
 #include <linux/seq_file.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
+#include <net/sctp/checksum.h>
 
+#include <net/netfilter/nf_log.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
@@ -505,6 +507,34 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	return true;
 }
 
+static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
+		      unsigned int dataoff,
+		      u8 pf, unsigned int hooknum)
+{
+	const struct sctphdr *sh;
+	struct sctphdr _sctph;
+	const char *logmsg;
+
+	sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
+	if (!sh) {
+		logmsg = "nf_ct_sctp: short packet ";
+		goto out_invalid;
+	}
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	    skb->ip_summed == CHECKSUM_NONE) {
+		if (sh->checksum != sctp_compute_cksum(skb, dataoff)) {
+			logmsg = "nf_ct_sctp: bad CRC ";
+			goto out_invalid;
+		}
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	return NF_ACCEPT;
+out_invalid:
+	if (LOG_INVALID(net, IPPROTO_SCTP))
+		nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", logmsg);
+	return -NF_ACCEPT;
+}
+
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
 #include <linux/netfilter/nfnetlink.h>
@@ -752,6 +782,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
 	.packet 		= sctp_packet,
 	.get_timeouts		= sctp_get_timeouts,
 	.new 			= sctp_new,
+	.error			= sctp_error,
 	.me 			= THIS_MODULE,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.to_nlattr		= sctp_to_nlattr,
@@ -786,6 +817,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
 	.packet 		= sctp_packet,
 	.get_timeouts		= sctp_get_timeouts,
 	.new 			= sctp_new,
+	.error			= sctp_error,
 	.me 			= THIS_MODULE,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.to_nlattr		= sctp_to_nlattr,
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 69f687740c76..b122e9dacfed 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -750,7 +750,6 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
 static int tcp_error(struct net *net, struct nf_conn *tmpl,
 		     struct sk_buff *skb,
 		     unsigned int dataoff,
-		     enum ip_conntrack_info *ctinfo,
 		     u_int8_t pf,
 		     unsigned int hooknum)
 {
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 20f35ed68030..f6ebce6178ca 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -108,8 +108,60 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	return true;
 }
 
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+static int udplite_error(struct net *net, struct nf_conn *tmpl,
+			 struct sk_buff *skb,
+			 unsigned int dataoff,
+			 u8 pf, unsigned int hooknum)
+{
+	unsigned int udplen = skb->len - dataoff;
+	const struct udphdr *hdr;
+	struct udphdr _hdr;
+	unsigned int cscov;
+
+	/* Header is too small? */
+	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+	if (!hdr) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udplite: short packet ");
+		return -NF_ACCEPT;
+	}
+
+	cscov = ntohs(hdr->len);
+	if (cscov == 0) {
+		cscov = udplen;
+	} else if (cscov < sizeof(*hdr) || cscov > udplen) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udplite: invalid checksum coverage ");
+		return -NF_ACCEPT;
+	}
+
+	/* UDPLITE mandates checksums */
+	if (!hdr->check) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udplite: checksum missing ");
+		return -NF_ACCEPT;
+	}
+
+	/* Checksum invalid? Ignore. */
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
+				pf)) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udplite: bad UDPLite checksum ");
+		return -NF_ACCEPT;
+	}
+
+	return NF_ACCEPT;
+}
+#endif
+
 static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
-		     unsigned int dataoff, enum ip_conntrack_info *ctinfo,
+		     unsigned int dataoff,
 		     u_int8_t pf,
 		     unsigned int hooknum)
 {
@@ -290,6 +342,41 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
 
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
+{
+	.l3proto		= PF_INET,
+	.l4proto		= IPPROTO_UDPLITE,
+	.name			= "udplite",
+	.allow_clash		= true,
+	.pkt_to_tuple		= udp_pkt_to_tuple,
+	.invert_tuple		= udp_invert_tuple,
+	.print_tuple		= udp_print_tuple,
+	.packet			= udp_packet,
+	.get_timeouts		= udp_get_timeouts,
+	.new			= udp_new,
+	.error			= udplite_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+	.ctnl_timeout		= {
+		.nlattr_to_obj	= udp_timeout_nlattr_to_obj,
+		.obj_to_nlattr	= udp_timeout_obj_to_nlattr,
+		.nlattr_max	= CTA_TIMEOUT_UDP_MAX,
+		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+		.nla_policy	= udp_timeout_nla_policy,
+	},
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+	.init_net		= udp_init_net,
+	.get_net_proto		= udp_get_net_proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
+#endif
+
 struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
 {
 	.l3proto		= PF_INET6,
@@ -322,3 +409,38 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
 	.get_net_proto		= udp_get_net_proto,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
+
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
+{
+	.l3proto		= PF_INET6,
+	.l4proto		= IPPROTO_UDPLITE,
+	.name			= "udplite",
+	.allow_clash		= true,
+	.pkt_to_tuple		= udp_pkt_to_tuple,
+	.invert_tuple		= udp_invert_tuple,
+	.print_tuple		= udp_print_tuple,
+	.packet			= udp_packet,
+	.get_timeouts		= udp_get_timeouts,
+	.new			= udp_new,
+	.error			= udplite_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+	.ctnl_timeout		= {
+		.nlattr_to_obj	= udp_timeout_nlattr_to_obj,
+		.obj_to_nlattr	= udp_timeout_obj_to_nlattr,
+		.nlattr_max	= CTA_TIMEOUT_UDP_MAX,
+		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+		.nla_policy	= udp_timeout_nla_policy,
+	},
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+	.init_net		= udp_init_net,
+	.get_net_proto		= udp_get_net_proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
+#endif
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
deleted file mode 100644
index c35f7bf05d8c..000000000000
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ /dev/null
@@ -1,324 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2007 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/udp.h>
-#include <linux/seq_file.h>
-#include <linux/skbuff.h>
-#include <linux/ipv6.h>
-#include <net/ip6_checksum.h>
-#include <net/checksum.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv6.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-#include <net/netfilter/nf_log.h>
-
-static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = {
-	[UDPLITE_CT_UNREPLIED]	= 30*HZ,
-	[UDPLITE_CT_REPLIED]	= 180*HZ,
-};
-
-static inline struct nf_udplite_net *udplite_pernet(struct net *net)
-{
-	return &net->ct.nf_ct_proto.udplite;
-}
-
-static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
-				 unsigned int dataoff,
-				 struct net *net,
-				 struct nf_conntrack_tuple *tuple)
-{
-	const struct udphdr *hp;
-	struct udphdr _hdr;
-
-	/* Actually only need first 4 bytes to get ports. */
-	hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
-	if (hp == NULL)
-		return false;
-
-	tuple->src.u.udp.port = hp->source;
-	tuple->dst.u.udp.port = hp->dest;
-	return true;
-}
-
-static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple,
-				 const struct nf_conntrack_tuple *orig)
-{
-	tuple->src.u.udp.port = orig->dst.u.udp.port;
-	tuple->dst.u.udp.port = orig->src.u.udp.port;
-	return true;
-}
-
-/* Print out the per-protocol part of the tuple. */
-static void udplite_print_tuple(struct seq_file *s,
-				const struct nf_conntrack_tuple *tuple)
-{
-	seq_printf(s, "sport=%hu dport=%hu ",
-		   ntohs(tuple->src.u.udp.port),
-		   ntohs(tuple->dst.u.udp.port));
-}
-
-static unsigned int *udplite_get_timeouts(struct net *net)
-{
-	return udplite_pernet(net)->timeouts;
-}
-
-/* Returns verdict for packet, and may modify conntracktype */
-static int udplite_packet(struct nf_conn *ct,
-			  const struct sk_buff *skb,
-			  unsigned int dataoff,
-			  enum ip_conntrack_info ctinfo,
-			  u_int8_t pf,
-			  unsigned int hooknum,
-			  unsigned int *timeouts)
-{
-	/* If we've seen traffic both ways, this is some kind of UDP
-	   stream.  Extend timeout. */
-	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
-		nf_ct_refresh_acct(ct, ctinfo, skb,
-				   timeouts[UDPLITE_CT_REPLIED]);
-		/* Also, more likely to be important, and not a probe */
-		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
-			nf_conntrack_event_cache(IPCT_ASSURED, ct);
-	} else {
-		nf_ct_refresh_acct(ct, ctinfo, skb,
-				   timeouts[UDPLITE_CT_UNREPLIED]);
-	}
-	return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb,
-			unsigned int dataoff, unsigned int *timeouts)
-{
-	return true;
-}
-
-static int udplite_error(struct net *net, struct nf_conn *tmpl,
-			 struct sk_buff *skb,
-			 unsigned int dataoff,
-			 enum ip_conntrack_info *ctinfo,
-			 u_int8_t pf,
-			 unsigned int hooknum)
-{
-	unsigned int udplen = skb->len - dataoff;
-	const struct udphdr *hdr;
-	struct udphdr _hdr;
-	unsigned int cscov;
-
-	/* Header is too small? */
-	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
-	if (hdr == NULL) {
-		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
-				      "nf_ct_udplite: short packet ");
-		return -NF_ACCEPT;
-	}
-
-	cscov = ntohs(hdr->len);
-	if (cscov == 0)
-		cscov = udplen;
-	else if (cscov < sizeof(*hdr) || cscov > udplen) {
-		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
-				"nf_ct_udplite: invalid checksum coverage ");
-		return -NF_ACCEPT;
-	}
-
-	/* UDPLITE mandates checksums */
-	if (!hdr->check) {
-		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
-				      "nf_ct_udplite: checksum missing ");
-		return -NF_ACCEPT;
-	}
-
-	/* Checksum invalid? Ignore. */
-	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
-	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
-	    			pf)) {
-		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
-				      "nf_ct_udplite: bad UDPLite checksum ");
-		return -NF_ACCEPT;
-	}
-
-	return NF_ACCEPT;
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_cttimeout.h>
-
-static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[],
-					 struct net *net, void *data)
-{
-	unsigned int *timeouts = data;
-	struct nf_udplite_net *un = udplite_pernet(net);
-
-	/* set default timeouts for UDPlite. */
-	timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED];
-	timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED];
-
-	if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) {
-		timeouts[UDPLITE_CT_UNREPLIED] =
-		  ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ;
-	}
-	if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) {
-		timeouts[UDPLITE_CT_REPLIED] =
-		  ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ;
-	}
-	return 0;
-}
-
-static int
-udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
-{
-	const unsigned int *timeouts = data;
-
-	if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED,
-			 htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)) ||
-	    nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED,
-			 htonl(timeouts[UDPLITE_CT_REPLIED] / HZ)))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -ENOSPC;
-}
-
-static const struct nla_policy
-udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = {
-	[CTA_TIMEOUT_UDPLITE_UNREPLIED]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_UDPLITE_REPLIED]	= { .type = NLA_U32 },
-};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table udplite_sysctl_table[] = {
-	{
-		.procname	= "nf_conntrack_udplite_timeout",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "nf_conntrack_udplite_timeout_stream",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	{ }
-};
-#endif /* CONFIG_SYSCTL */
-
-static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
-					struct nf_udplite_net *un)
-{
-#ifdef CONFIG_SYSCTL
-	if (pn->ctl_table)
-		return 0;
-
-	pn->ctl_table = kmemdup(udplite_sysctl_table,
-				sizeof(udplite_sysctl_table),
-				GFP_KERNEL);
-	if (!pn->ctl_table)
-		return -ENOMEM;
-
-	pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED];
-	pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED];
-#endif
-	return 0;
-}
-
-static int udplite_init_net(struct net *net, u_int16_t proto)
-{
-	struct nf_udplite_net *un = udplite_pernet(net);
-	struct nf_proto_net *pn = &un->pn;
-
-	if (!pn->users) {
-		int i;
-
-		for (i = 0 ; i < UDPLITE_CT_MAX; i++)
-			un->timeouts[i] = udplite_timeouts[i];
-	}
-
-	return udplite_kmemdup_sysctl_table(pn, un);
-}
-
-struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
-{
-	.l3proto		= PF_INET,
-	.l4proto		= IPPROTO_UDPLITE,
-	.name			= "udplite",
-	.allow_clash		= true,
-	.pkt_to_tuple		= udplite_pkt_to_tuple,
-	.invert_tuple		= udplite_invert_tuple,
-	.print_tuple		= udplite_print_tuple,
-	.packet			= udplite_packet,
-	.get_timeouts		= udplite_get_timeouts,
-	.new			= udplite_new,
-	.error			= udplite_error,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
-	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
-	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-	.nla_policy		= nf_ct_port_nla_policy,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
-	.ctnl_timeout		= {
-		.nlattr_to_obj	= udplite_timeout_nlattr_to_obj,
-		.obj_to_nlattr	= udplite_timeout_obj_to_nlattr,
-		.nlattr_max	= CTA_TIMEOUT_UDPLITE_MAX,
-		.obj_size	= sizeof(unsigned int) *
-					CTA_TIMEOUT_UDPLITE_MAX,
-		.nla_policy	= udplite_timeout_nla_policy,
-	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-	.init_net		= udplite_init_net,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
-
-struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
-{
-	.l3proto		= PF_INET6,
-	.l4proto		= IPPROTO_UDPLITE,
-	.name			= "udplite",
-	.allow_clash		= true,
-	.pkt_to_tuple		= udplite_pkt_to_tuple,
-	.invert_tuple		= udplite_invert_tuple,
-	.print_tuple		= udplite_print_tuple,
-	.packet			= udplite_packet,
-	.get_timeouts		= udplite_get_timeouts,
-	.new			= udplite_new,
-	.error			= udplite_error,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
-	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
-	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-	.nla_policy		= nf_ct_port_nla_policy,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
-	.ctnl_timeout		= {
-		.nlattr_to_obj	= udplite_timeout_nlattr_to_obj,
-		.obj_to_nlattr	= udplite_timeout_obj_to_nlattr,
-		.nlattr_max	= CTA_TIMEOUT_UDPLITE_MAX,
-		.obj_size	= sizeof(unsigned int) *
-					CTA_TIMEOUT_UDPLITE_MAX,
-		.nla_policy	= udplite_timeout_nla_policy,
-	},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-	.init_net		= udplite_init_net,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index c3fc14e021ec..24174c520239 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -809,13 +809,11 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
 		    exp->tuple.dst.protonum != proto ||
 		    exp->tuple.dst.u.udp.port != port)
 			continue;
-		if (!del_timer(&exp->timeout))
-			continue;
-		exp->flags &= ~NF_CT_EXPECT_INACTIVE;
-		exp->timeout.expires = jiffies + expires * HZ;
-		add_timer(&exp->timeout);
-		found = 1;
-		break;
+		if (mod_timer_pending(&exp->timeout, jiffies + expires * HZ)) {
+			exp->flags &= ~NF_CT_EXPECT_INACTIVE;
+			found = 1;
+			break;
+		}
 	}
 	spin_unlock_bh(&nf_conntrack_expect_lock);
 	return found;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index d009ae663453..2256147dcaad 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -642,6 +642,9 @@ static int __init nf_conntrack_standalone_init(void)
 	if (ret < 0)
 		goto out_start;
 
+	BUILD_BUG_ON(SKB_NFCT_PTRMASK != NFCT_PTRMASK);
+	BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER);
+
 #ifdef CONFIG_SYSCTL
 	nf_ct_netfilter_header =
 		register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index ffb9e8ada899..8d85a0598b60 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -15,6 +15,9 @@
 
 #define NFLOGGER_NAME_LEN		64
 
+int sysctl_nf_log_all_netns __read_mostly;
+EXPORT_SYMBOL(sysctl_nf_log_all_netns);
+
 static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
 static DEFINE_MUTEX(nf_log_mutex);
 
@@ -413,6 +416,18 @@ static const struct file_operations nflog_file_ops = {
 #ifdef CONFIG_SYSCTL
 static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
 static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
+static struct ctl_table_header *nf_log_sysctl_fhdr;
+
+static struct ctl_table nf_log_sysctl_ftable[] = {
+	{
+		.procname	= "nf_log_all_netns",
+		.data		= &sysctl_nf_log_all_netns,
+		.maxlen		= sizeof(sysctl_nf_log_all_netns),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
 
 static int nf_log_proc_dostring(struct ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -482,6 +497,10 @@ static int netfilter_log_sysctl_init(struct net *net)
 			nf_log_sysctl_table[i].extra1 =
 				(void *)(unsigned long) i;
 		}
+		nf_log_sysctl_fhdr = register_net_sysctl(net, "net/netfilter",
+							 nf_log_sysctl_ftable);
+		if (!nf_log_sysctl_fhdr)
+			goto err_freg;
 	}
 
 	for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
@@ -498,6 +517,9 @@ static int netfilter_log_sysctl_init(struct net *net)
 err_reg:
 	if (!net_eq(net, &init_net))
 		kfree(table);
+	else
+		unregister_net_sysctl_table(nf_log_sysctl_fhdr);
+err_freg:
 err_alloc:
 	return -ENOMEM;
 }
@@ -510,6 +532,8 @@ static void netfilter_log_sysctl_exit(struct net *net)
 	unregister_net_sysctl_table(net->nf.nf_log_dir_header);
 	if (!net_eq(net, &init_net))
 		kfree(table);
+	else
+		unregister_net_sysctl_table(nf_log_sysctl_fhdr);
 }
 #else
 static int netfilter_log_sysctl_init(struct net *net)
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 2840abb5bb99..211661cb2c90 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -60,7 +60,7 @@ static void mangle_contents(struct sk_buff *skb,
 		__skb_trim(skb, skb->len + rep_len - match_len);
 	}
 
-	if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) {
+	if (nf_ct_l3num((struct nf_conn *)skb_nfct(skb)) == NFPROTO_IPV4) {
 		/* fix IP hdr checksum information */
 		ip_hdr(skb)->tot_len = htons(skb->len);
 		ip_send_check(ip_hdr(skb));
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index b1e627227b6e..edd4a77dc09a 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -30,20 +30,15 @@ udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 				    &udp_port_rover);
 }
 
-static bool
-udp_manip_pkt(struct sk_buff *skb,
-	      const struct nf_nat_l3proto *l3proto,
-	      unsigned int iphdroff, unsigned int hdroff,
-	      const struct nf_conntrack_tuple *tuple,
-	      enum nf_nat_manip_type maniptype)
+static void
+__udp_manip_pkt(struct sk_buff *skb,
+	        const struct nf_nat_l3proto *l3proto,
+	        unsigned int iphdroff, struct udphdr *hdr,
+	        const struct nf_conntrack_tuple *tuple,
+	        enum nf_nat_manip_type maniptype, bool do_csum)
 {
-	struct udphdr *hdr;
 	__be16 *portptr, newport;
 
-	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
-		return false;
-	hdr = (struct udphdr *)(skb->data + hdroff);
-
 	if (maniptype == NF_NAT_MANIP_SRC) {
 		/* Get rid of src port */
 		newport = tuple->src.u.udp.port;
@@ -53,7 +48,7 @@ udp_manip_pkt(struct sk_buff *skb,
 		newport = tuple->dst.u.udp.port;
 		portptr = &hdr->dest;
 	}
-	if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+	if (do_csum) {
 		l3proto->csum_update(skb, iphdroff, &hdr->check,
 				     tuple, maniptype);
 		inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
@@ -62,9 +57,68 @@ udp_manip_pkt(struct sk_buff *skb,
 			hdr->check = CSUM_MANGLED_0;
 	}
 	*portptr = newport;
+}
+
+static bool udp_manip_pkt(struct sk_buff *skb,
+			  const struct nf_nat_l3proto *l3proto,
+			  unsigned int iphdroff, unsigned int hdroff,
+			  const struct nf_conntrack_tuple *tuple,
+			  enum nf_nat_manip_type maniptype)
+{
+	struct udphdr *hdr;
+	bool do_csum;
+
+	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+		return false;
+
+	hdr = (struct udphdr *)(skb->data + hdroff);
+	do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL;
+
+	__udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, do_csum);
+	return true;
+}
+
+#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
+static u16 udplite_port_rover;
+
+static bool udplite_manip_pkt(struct sk_buff *skb,
+			      const struct nf_nat_l3proto *l3proto,
+			      unsigned int iphdroff, unsigned int hdroff,
+			      const struct nf_conntrack_tuple *tuple,
+			      enum nf_nat_manip_type maniptype)
+{
+	struct udphdr *hdr;
+
+	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+		return false;
+
+	hdr = (struct udphdr *)(skb->data + hdroff);
+	__udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true);
 	return true;
 }
 
+static void
+udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
+		     struct nf_conntrack_tuple *tuple,
+		     const struct nf_nat_range *range,
+		     enum nf_nat_manip_type maniptype,
+		     const struct nf_conn *ct)
+{
+	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+				    &udplite_port_rover);
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
+	.l4proto		= IPPROTO_UDPLITE,
+	.manip_pkt		= udplite_manip_pkt,
+	.in_range		= nf_nat_l4proto_in_range,
+	.unique_tuple		= udplite_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
+#endif
+};
+#endif /* CONFIG_NF_NAT_PROTO_UDPLITE */
+
 const struct nf_nat_l4proto nf_nat_l4proto_udp = {
 	.l4proto		= IPPROTO_UDP,
 	.manip_pkt		= udp_manip_pkt,
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
deleted file mode 100644
index 366bfbfd82a1..000000000000
--- a/net/netfilter/nf_nat_proto_udplite.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static u16 udplite_port_rover;
-
-static void
-udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
-		     struct nf_conntrack_tuple *tuple,
-		     const struct nf_nat_range *range,
-		     enum nf_nat_manip_type maniptype,
-		     const struct nf_conn *ct)
-{
-	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
-				    &udplite_port_rover);
-}
-
-static bool
-udplite_manip_pkt(struct sk_buff *skb,
-		  const struct nf_nat_l3proto *l3proto,
-		  unsigned int iphdroff, unsigned int hdroff,
-		  const struct nf_conntrack_tuple *tuple,
-		  enum nf_nat_manip_type maniptype)
-{
-	struct udphdr *hdr;
-	__be16 *portptr, newport;
-
-	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
-		return false;
-
-	hdr = (struct udphdr *)(skb->data + hdroff);
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		/* Get rid of source port */
-		newport = tuple->src.u.udp.port;
-		portptr = &hdr->source;
-	} else {
-		/* Get rid of dst port */
-		newport = tuple->dst.u.udp.port;
-		portptr = &hdr->dest;
-	}
-
-	l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
-	inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false);
-	if (!hdr->check)
-		hdr->check = CSUM_MANGLED_0;
-
-	*portptr = newport;
-	return true;
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
-	.l4proto		= IPPROTO_UDPLITE,
-	.manip_pkt		= udplite_manip_pkt,
-	.in_range		= nf_nat_l4proto_in_range,
-	.unique_tuple		= udplite_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
-#endif
-};
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 1b913760f205..ff7304ae58ac 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -240,6 +240,10 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
 	if (trans == NULL)
 		return NULL;
 
+	if (msg_type == NFT_MSG_NEWRULE && ctx->nla[NFTA_RULE_ID] != NULL) {
+		nft_trans_rule_id(trans) =
+			ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID]));
+	}
 	nft_trans_rule(trans) = rule;
 	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
 
@@ -576,6 +580,28 @@ err:
 	return err;
 }
 
+static void _nf_tables_table_disable(struct net *net,
+				     const struct nft_af_info *afi,
+				     struct nft_table *table,
+				     u32 cnt)
+{
+	struct nft_chain *chain;
+	u32 i = 0;
+
+	list_for_each_entry(chain, &table->chains, list) {
+		if (!nft_is_active_next(net, chain))
+			continue;
+		if (!(chain->flags & NFT_BASE_CHAIN))
+			continue;
+
+		if (cnt && i++ == cnt)
+			break;
+
+		nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
+					afi->nops);
+	}
+}
+
 static int nf_tables_table_enable(struct net *net,
 				  const struct nft_af_info *afi,
 				  struct nft_table *table)
@@ -598,18 +624,8 @@ static int nf_tables_table_enable(struct net *net,
 	}
 	return 0;
 err:
-	list_for_each_entry(chain, &table->chains, list) {
-		if (!nft_is_active_next(net, chain))
-			continue;
-		if (!(chain->flags & NFT_BASE_CHAIN))
-			continue;
-
-		if (i-- <= 0)
-			break;
-
-		nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
-					afi->nops);
-	}
+	if (i)
+		_nf_tables_table_disable(net, afi, table, i);
 	return err;
 }
 
@@ -617,17 +633,7 @@ static void nf_tables_table_disable(struct net *net,
 				    const struct nft_af_info *afi,
 				    struct nft_table *table)
 {
-	struct nft_chain *chain;
-
-	list_for_each_entry(chain, &table->chains, list) {
-		if (!nft_is_active_next(net, chain))
-			continue;
-		if (!(chain->flags & NFT_BASE_CHAIN))
-			continue;
-
-		nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
-					afi->nops);
-	}
+	_nf_tables_table_disable(net, afi, table, 0);
 }
 
 static int nf_tables_updtable(struct nft_ctx *ctx)
@@ -696,10 +702,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(table)) {
 		if (PTR_ERR(table) != -ENOENT)
 			return PTR_ERR(table);
-		table = NULL;
-	}
-
-	if (table != NULL) {
+	} else {
 		if (nlh->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -2294,6 +2297,22 @@ err1:
 	return err;
 }
 
+static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+					     const struct nlattr *nla)
+{
+	u32 id = ntohl(nla_get_be32(nla));
+	struct nft_trans *trans;
+
+	list_for_each_entry(trans, &net->nft.commit_list, list) {
+		struct nft_rule *rule = nft_trans_rule(trans);
+
+		if (trans->msg_type == NFT_MSG_NEWRULE &&
+		    id == nft_trans_rule_id(trans))
+			return rule;
+	}
+	return ERR_PTR(-ENOENT);
+}
+
 static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 			     struct sk_buff *skb, const struct nlmsghdr *nlh,
 			     const struct nlattr * const nla[])
@@ -2332,6 +2351,12 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 				return PTR_ERR(rule);
 
 			err = nft_delrule(&ctx, rule);
+		} else if (nla[NFTA_RULE_ID]) {
+			rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
+			if (IS_ERR(rule))
+				return PTR_ERR(rule);
+
+			err = nft_delrule(&ctx, rule);
 		} else {
 			err = nft_delrule_by_chain(&ctx);
 		}
@@ -2399,12 +2424,14 @@ nft_select_set_ops(const struct nlattr * const nla[],
 	features = 0;
 	if (nla[NFTA_SET_FLAGS] != NULL) {
 		features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
-		features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT;
+		features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT |
+			    NFT_SET_OBJECT;
 	}
 
-	bops	   = NULL;
-	best.size  = ~0;
-	best.class = ~0;
+	bops	    = NULL;
+	best.size   = ~0;
+	best.lookup = ~0;
+	best.space  = ~0;
 
 	list_for_each_entry(ops, &nf_tables_set_ops, list) {
 		if ((ops->features & features) != features)
@@ -2414,16 +2441,27 @@ nft_select_set_ops(const struct nlattr * const nla[],
 
 		switch (policy) {
 		case NFT_SET_POL_PERFORMANCE:
-			if (est.class < best.class)
-				break;
-			if (est.class == best.class && est.size < best.size)
+			if (est.lookup < best.lookup)
 				break;
+			if (est.lookup == best.lookup) {
+				if (!desc->size) {
+					if (est.space < best.space)
+						break;
+				} else if (est.size < best.size) {
+					break;
+				}
+			}
 			continue;
 		case NFT_SET_POL_MEMORY:
-			if (est.size < best.size)
-				break;
-			if (est.size == best.size && est.class < best.class)
+			if (!desc->size) {
+				if (est.space < best.space)
+					break;
+				if (est.space == best.space &&
+				    est.lookup < best.lookup)
+					break;
+			} else if (est.size < best.size) {
 				break;
+			}
 			continue;
 		default:
 			break;
@@ -2966,10 +3004,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	if (IS_ERR(set)) {
 		if (PTR_ERR(set) != -ENOENT)
 			return PTR_ERR(set);
-		set = NULL;
-	}
-
-	if (set != NULL) {
+	} else {
 		if (nlh->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -3125,6 +3160,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 		iter.count	= 0;
 		iter.err	= 0;
 		iter.fn		= nf_tables_bind_check_setelem;
+		iter.flush	= false;
 
 		set->ops->walk(ctx, set, &iter);
 		if (iter.err < 0)
@@ -3378,6 +3414,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	args.iter.count		= 0;
 	args.iter.err		= 0;
 	args.iter.fn		= nf_tables_dump_setelem;
+	args.iter.flush		= false;
 	set->ops->walk(&ctx, set, &args.iter);
 
 	nla_nest_end(skb, nest);
@@ -3756,7 +3793,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	return 0;
 
 err6:
-	set->ops->remove(set, &elem);
+	set->ops->remove(ctx->net, set, &elem);
 err5:
 	kfree(trans);
 err4:
@@ -3902,7 +3939,7 @@ static int nft_flush_set(const struct nft_ctx *ctx,
 	if (!trans)
 		return -ENOMEM;
 
-	if (!set->ops->deactivate_one(ctx->net, set, elem->priv)) {
+	if (!set->ops->flush(ctx->net, set, elem->priv)) {
 		err = -ENOENT;
 		goto err1;
 	}
@@ -3940,15 +3977,14 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
 		return -EBUSY;
 
 	if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) {
-		struct nft_set_dump_args args = {
-			.iter	= {
-				.genmask	= genmask,
-				.fn		= nft_flush_set,
-			},
+		struct nft_set_iter iter = {
+			.genmask	= genmask,
+			.fn		= nft_flush_set,
+			.flush		= true,
 		};
-		set->ops->walk(&ctx, set, &args.iter);
+		set->ops->walk(&ctx, set, &iter);
 
-		return args.iter.err;
+		return iter.err;
 	}
 
 	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
@@ -4163,10 +4199,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 		if (err != -ENOENT)
 			return err;
 
-		obj = NULL;
-	}
-
-	if (obj != NULL) {
+	} else {
 		if (nlh->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 
@@ -4811,7 +4844,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			nf_tables_setelem_notify(&trans->ctx, te->set,
 						 &te->elem,
 						 NFT_MSG_DELSETELEM, 0);
-			te->set->ops->remove(te->set, &te->elem);
+			te->set->ops->remove(net, te->set, &te->elem);
 			atomic_dec(&te->set->nelems);
 			te->set->ndeact--;
 			break;
@@ -4932,7 +4965,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_NEWSETELEM:
 			te = (struct nft_trans_elem *)trans->data;
 
-			te->set->ops->remove(te->set, &te->elem);
+			te->set->ops->remove(net, te->set, &te->elem);
 			atomic_dec(&te->set->nelems);
 			break;
 		case NFT_MSG_DELSETELEM:
@@ -4966,6 +4999,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 	return 0;
 }
 
+static bool nf_tables_valid_genid(struct net *net, u32 genid)
+{
+	return net->nft.base_seq == genid;
+}
+
 static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.name		= "nf_tables",
 	.subsys_id	= NFNL_SUBSYS_NFTABLES,
@@ -4973,6 +5011,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.cb		= nf_tables_cb,
 	.commit		= nf_tables_commit,
 	.abort		= nf_tables_abort,
+	.valid_genid	= nf_tables_valid_genid,
 };
 
 int nft_chain_validate_dependency(const struct nft_chain *chain,
@@ -5098,6 +5137,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 			iter.count	= 0;
 			iter.err	= 0;
 			iter.fn		= nf_tables_loop_check_setelem;
+			iter.flush	= false;
 
 			set->ops->walk(ctx, set, &iter);
 			if (iter.err < 0)
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index a09fa9fd8f3d..68eda920160e 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -3,7 +3,7 @@
  *
  * (C) 2001 by Jay Schulist <jschlst@samba.org>,
  * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
- * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2005-2017 by Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * Initial netfilter messages via netlink development funded and
  * generally made possible by Network Robots, Inc. (www.networkrobots.com)
@@ -100,9 +100,9 @@ int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
 }
 EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
 
-static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type)
+static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u16 type)
 {
-	u_int8_t subsys_id = NFNL_SUBSYS_ID(type);
+	u8 subsys_id = NFNL_SUBSYS_ID(type);
 
 	if (subsys_id >= NFNL_SUBSYS_COUNT)
 		return NULL;
@@ -111,9 +111,9 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t
 }
 
 static inline const struct nfnl_callback *
-nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss)
+nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss)
 {
-	u_int8_t cb_id = NFNL_MSG_TYPE(type);
+	u8 cb_id = NFNL_MSG_TYPE(type);
 
 	if (cb_id >= ss->cb_count)
 		return NULL;
@@ -185,7 +185,7 @@ replay:
 
 	{
 		int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
-		u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+		u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
 		struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
 		struct nlattr *attr = (void *)nlh + min_len;
 		int attrlen = nlh->nlmsg_len - min_len;
@@ -273,13 +273,13 @@ enum {
 };
 
 static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
-				u_int16_t subsys_id)
+				u16 subsys_id, u32 genid)
 {
 	struct sk_buff *oskb = skb;
 	struct net *net = sock_net(skb->sk);
 	const struct nfnetlink_subsystem *ss;
 	const struct nfnl_callback *nc;
-	static LIST_HEAD(err_list);
+	LIST_HEAD(err_list);
 	u32 status;
 	int err;
 
@@ -315,6 +315,12 @@ replay:
 		return kfree_skb(skb);
 	}
 
+	if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) {
+		nfnl_unlock(subsys_id);
+		netlink_ack(oskb, nlh, -ERESTART);
+		return kfree_skb(skb);
+	}
+
 	while (skb->len >= nlmsg_total_size(0)) {
 		int msglen, type;
 
@@ -365,7 +371,7 @@ replay:
 
 		{
 			int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
-			u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+			u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
 			struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
 			struct nlattr *attr = (void *)nlh + min_len;
 			int attrlen = nlh->nlmsg_len - min_len;
@@ -436,11 +442,51 @@ done:
 	kfree_skb(skb);
 }
 
+static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
+	[NFNL_BATCH_GENID]	= { .type = NLA_U32 },
+};
+
+static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+	struct nlattr *attr = (void *)nlh + min_len;
+	struct nlattr *cda[NFNL_BATCH_MAX + 1];
+	int attrlen = nlh->nlmsg_len - min_len;
+	struct nfgenmsg *nfgenmsg;
+	int msglen, err;
+	u32 gen_id = 0;
+	u16 res_id;
+
+	msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+	if (msglen > skb->len)
+		msglen = skb->len;
+
+	if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+	    skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
+		return;
+
+	err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy);
+	if (err < 0) {
+		netlink_ack(skb, nlh, err);
+		return;
+	}
+	if (cda[NFNL_BATCH_GENID])
+		gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID]));
+
+	nfgenmsg = nlmsg_data(nlh);
+	skb_pull(skb, msglen);
+	/* Work around old nft using host byte order */
+	if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
+		res_id = NFNL_SUBSYS_NFTABLES;
+	else
+		res_id = ntohs(nfgenmsg->res_id);
+
+	nfnetlink_rcv_batch(skb, nlh, res_id, gen_id);
+}
+
 static void nfnetlink_rcv(struct sk_buff *skb)
 {
 	struct nlmsghdr *nlh = nlmsg_hdr(skb);
-	u_int16_t res_id;
-	int msglen;
 
 	if (nlh->nlmsg_len < NLMSG_HDRLEN ||
 	    skb->len < nlh->nlmsg_len)
@@ -451,28 +497,10 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 		return;
 	}
 
-	if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) {
-		struct nfgenmsg *nfgenmsg;
-
-		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
-		if (msglen > skb->len)
-			msglen = skb->len;
-
-		if (nlh->nlmsg_len < NLMSG_HDRLEN ||
-		    skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
-			return;
-
-		nfgenmsg = nlmsg_data(nlh);
-		skb_pull(skb, msglen);
-		/* Work around old nft using host byte order */
-		if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
-			res_id = NFNL_SUBSYS_NFTABLES;
-		else
-			res_id = ntohs(nfgenmsg->res_id);
-		nfnetlink_rcv_batch(skb, nlh, res_id);
-	} else {
+	if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN)
+		nfnetlink_rcv_skb_batch(skb, nlh);
+	else
 		netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
-	}
 }
 
 #ifdef CONFIG_MODULES
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 3b79f34b5095..de8782345c86 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -48,7 +48,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
 	if (helper == NULL)
 		return NF_DROP;
 
-	/* This is an user-space helper not yet configured, skip. */
+	/* This is a user-space helper not yet configured, skip. */
 	if ((helper->flags &
 	    (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) ==
 	     NF_CT_HELPER_F_USERSPACE)
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index e6baeaebe653..bf548a7a71ec 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -32,6 +32,11 @@ struct nft_ct {
 	};
 };
 
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template);
+static unsigned int nft_ct_pcpu_template_refcnt __read_mostly;
+#endif
+
 static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c,
 				   enum nft_ct_keys k,
 				   enum ip_conntrack_dir d)
@@ -129,12 +134,40 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
 		memcpy(dest, &count, sizeof(count));
 		return;
 	}
+	case NFT_CT_AVGPKT: {
+		const struct nf_conn_acct *acct = nf_conn_acct_find(ct);
+		u64 avgcnt = 0, bcnt = 0, pcnt = 0;
+
+		if (acct) {
+			pcnt = nft_ct_get_eval_counter(acct->counter,
+						       NFT_CT_PKTS, priv->dir);
+			bcnt = nft_ct_get_eval_counter(acct->counter,
+						       NFT_CT_BYTES, priv->dir);
+			if (pcnt != 0)
+				avgcnt = div64_u64(bcnt, pcnt);
+		}
+
+		memcpy(dest, &avgcnt, sizeof(avgcnt));
+		return;
+	}
 	case NFT_CT_L3PROTOCOL:
 		*dest = nf_ct_l3num(ct);
 		return;
 	case NFT_CT_PROTOCOL:
 		*dest = nf_ct_protonum(ct);
 		return;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	case NFT_CT_ZONE: {
+		const struct nf_conntrack_zone *zone = nf_ct_zone(ct);
+
+		if (priv->dir < IP_CT_DIR_MAX)
+			*dest = nf_ct_zone_id(zone, priv->dir);
+		else
+			*dest = zone->id;
+
+		return;
+	}
+#endif
 	default:
 		break;
 	}
@@ -163,6 +196,53 @@ err:
 	regs->verdict.code = NFT_BREAK;
 }
 
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static void nft_ct_set_zone_eval(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
+{
+	struct nf_conntrack_zone zone = { .dir = NF_CT_DEFAULT_ZONE_DIR };
+	const struct nft_ct *priv = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	enum ip_conntrack_info ctinfo;
+	u16 value = regs->data[priv->sreg];
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct) /* already tracked */
+		return;
+
+	zone.id = value;
+
+	switch (priv->dir) {
+	case IP_CT_DIR_ORIGINAL:
+		zone.dir = NF_CT_ZONE_DIR_ORIG;
+		break;
+	case IP_CT_DIR_REPLY:
+		zone.dir = NF_CT_ZONE_DIR_REPL;
+		break;
+	default:
+		break;
+	}
+
+	ct = this_cpu_read(nft_ct_pcpu_template);
+
+	if (likely(atomic_read(&ct->ct_general.use) == 1)) {
+		nf_ct_zone_add(ct, &zone);
+	} else {
+		/* previous skb got queued to userspace */
+		ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC);
+		if (!ct) {
+			regs->verdict.code = NF_DROP;
+			return;
+		}
+	}
+
+	atomic_inc(&ct->ct_general.use);
+	nf_ct_set(skb, ct, IP_CT_NEW);
+}
+#endif
+
 static void nft_ct_set_eval(const struct nft_expr *expr,
 			    struct nft_regs *regs,
 			    const struct nft_pktinfo *pkt)
@@ -241,6 +321,45 @@ static void nft_ct_netns_put(struct net *net, uint8_t family)
 		nf_ct_netns_put(net, family);
 }
 
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static void nft_ct_tmpl_put_pcpu(void)
+{
+	struct nf_conn *ct;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		ct = per_cpu(nft_ct_pcpu_template, cpu);
+		if (!ct)
+			break;
+		nf_ct_put(ct);
+		per_cpu(nft_ct_pcpu_template, cpu) = NULL;
+	}
+}
+
+static bool nft_ct_tmpl_alloc_pcpu(void)
+{
+	struct nf_conntrack_zone zone = { .id = 0 };
+	struct nf_conn *tmp;
+	int cpu;
+
+	if (nft_ct_pcpu_template_refcnt)
+		return true;
+
+	for_each_possible_cpu(cpu) {
+		tmp = nf_ct_tmpl_alloc(&init_net, &zone, GFP_KERNEL);
+		if (!tmp) {
+			nft_ct_tmpl_put_pcpu();
+			return false;
+		}
+
+		atomic_set(&tmp->ct_general.use, 1);
+		per_cpu(nft_ct_pcpu_template, cpu) = tmp;
+	}
+
+	return true;
+}
+#endif
+
 static int nft_ct_get_init(const struct nft_ctx *ctx,
 			   const struct nft_expr *expr,
 			   const struct nlattr * const tb[])
@@ -250,6 +369,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 	int err;
 
 	priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+	priv->dir = IP_CT_DIR_MAX;
 	switch (priv->key) {
 	case NFT_CT_DIRECTION:
 		if (tb[NFTA_CT_DIRECTION] != NULL)
@@ -316,11 +436,14 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 		break;
 	case NFT_CT_BYTES:
 	case NFT_CT_PKTS:
-		/* no direction? return sum of original + reply */
-		if (tb[NFTA_CT_DIRECTION] == NULL)
-			priv->dir = IP_CT_DIR_MAX;
+	case NFT_CT_AVGPKT:
 		len = sizeof(u64);
 		break;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	case NFT_CT_ZONE:
+		len = sizeof(u16);
+		break;
+#endif
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -346,21 +469,41 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
-	if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS)
+	if (priv->key == NFT_CT_BYTES ||
+	    priv->key == NFT_CT_PKTS  ||
+	    priv->key == NFT_CT_AVGPKT)
 		nf_ct_set_acct(ctx->net, true);
 
 	return 0;
 }
 
+static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
+{
+	switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+	case NFT_CT_LABELS:
+		nf_connlabels_put(ctx->net);
+		break;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	case NFT_CT_ZONE:
+		if (--nft_ct_pcpu_template_refcnt == 0)
+			nft_ct_tmpl_put_pcpu();
+#endif
+	default:
+		break;
+	}
+}
+
 static int nft_ct_set_init(const struct nft_ctx *ctx,
 			   const struct nft_expr *expr,
 			   const struct nlattr * const tb[])
 {
 	struct nft_ct *priv = nft_expr_priv(expr);
-	bool label_got = false;
 	unsigned int len;
 	int err;
 
+	priv->dir = IP_CT_DIR_MAX;
 	priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
 	switch (priv->key) {
 #ifdef CONFIG_NF_CONNTRACK_MARK
@@ -378,13 +521,31 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
 		err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
 		if (err)
 			return err;
-		label_got = true;
+		break;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	case NFT_CT_ZONE:
+		if (!nft_ct_tmpl_alloc_pcpu())
+			return -ENOMEM;
+		nft_ct_pcpu_template_refcnt++;
+		len = sizeof(u16);
 		break;
 #endif
 	default:
 		return -EOPNOTSUPP;
 	}
 
+	if (tb[NFTA_CT_DIRECTION]) {
+		priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]);
+		switch (priv->dir) {
+		case IP_CT_DIR_ORIGINAL:
+		case IP_CT_DIR_REPLY:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
 	priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]);
 	err = nft_validate_register_load(priv->sreg, len);
 	if (err < 0)
@@ -397,8 +558,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
 	return 0;
 
 err1:
-	if (label_got)
-		nf_connlabels_put(ctx->net);
+	__nft_ct_set_destroy(ctx, priv);
 	return err;
 }
 
@@ -413,16 +573,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
 {
 	struct nft_ct *priv = nft_expr_priv(expr);
 
-	switch (priv->key) {
-#ifdef CONFIG_NF_CONNTRACK_LABELS
-	case NFT_CT_LABELS:
-		nf_connlabels_put(ctx->net);
-		break;
-#endif
-	default:
-		break;
-	}
-
+	__nft_ct_set_destroy(ctx, priv);
 	nft_ct_netns_put(ctx->net, ctx->afi->family);
 }
 
@@ -445,6 +596,8 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
 		break;
 	case NFT_CT_BYTES:
 	case NFT_CT_PKTS:
+	case NFT_CT_AVGPKT:
+	case NFT_CT_ZONE:
 		if (priv->dir < IP_CT_DIR_MAX &&
 		    nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
 			goto nla_put_failure;
@@ -467,6 +620,17 @@ static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
 		goto nla_put_failure;
+
+	switch (priv->key) {
+	case NFT_CT_ZONE:
+		if (priv->dir < IP_CT_DIR_MAX &&
+		    nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
+			goto nla_put_failure;
+		break;
+	default:
+		break;
+	}
+
 	return 0;
 
 nla_put_failure:
@@ -492,6 +656,17 @@ static const struct nft_expr_ops nft_ct_set_ops = {
 	.dump		= nft_ct_set_dump,
 };
 
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static const struct nft_expr_ops nft_ct_set_zone_ops = {
+	.type		= &nft_ct_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+	.eval		= nft_ct_set_zone_eval,
+	.init		= nft_ct_set_init,
+	.destroy	= nft_ct_set_destroy,
+	.dump		= nft_ct_set_dump,
+};
+#endif
+
 static const struct nft_expr_ops *
 nft_ct_select_ops(const struct nft_ctx *ctx,
 		    const struct nlattr * const tb[])
@@ -505,8 +680,13 @@ nft_ct_select_ops(const struct nft_ctx *ctx,
 	if (tb[NFTA_CT_DREG])
 		return &nft_ct_get_ops;
 
-	if (tb[NFTA_CT_SREG])
+	if (tb[NFTA_CT_SREG]) {
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+		if (nla_get_be32(tb[NFTA_CT_KEY]) == htonl(NFT_CT_ZONE))
+			return &nft_ct_set_zone_ops;
+#endif
 		return &nft_ct_set_ops;
+	}
 
 	return ERR_PTR(-EINVAL);
 }
@@ -534,8 +714,7 @@ static void nft_notrack_eval(const struct nft_expr *expr,
 
 	ct = nf_ct_untracked_get();
 	atomic_inc(&ct->ct_general.use);
-	skb->nfct = &ct->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
+	nf_ct_set(skb, ct, IP_CT_NEW);
 }
 
 static struct nft_expr_type nft_notrack_type;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 47beb3abcc9d..c308920b194c 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -15,19 +15,29 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
-// FIXME:
-#include <net/ipv6.h>
+#include <net/tcp.h>
 
 struct nft_exthdr {
 	u8			type;
 	u8			offset;
 	u8			len;
+	u8			op;
 	enum nft_registers	dreg:8;
+	u8			flags;
 };
 
-static void nft_exthdr_eval(const struct nft_expr *expr,
-			    struct nft_regs *regs,
-			    const struct nft_pktinfo *pkt)
+static unsigned int optlen(const u8 *opt, unsigned int offset)
+{
+	/* Beware zero-length options: make finite progress */
+	if (opt[offset] <= TCPOPT_NOP || opt[offset + 1] == 0)
+		return 1;
+	else
+		return opt[offset + 1];
+}
+
+static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
 {
 	struct nft_exthdr *priv = nft_expr_priv(expr);
 	u32 *dest = &regs->data[priv->dreg];
@@ -35,8 +45,12 @@ static void nft_exthdr_eval(const struct nft_expr *expr,
 	int err;
 
 	err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
-	if (err < 0)
+	if (priv->flags & NFT_EXTHDR_F_PRESENT) {
+		*dest = (err >= 0);
+		return;
+	} else if (err < 0) {
 		goto err;
+	}
 	offset += priv->offset;
 
 	dest[priv->len / NFT_REG32_SIZE] = 0;
@@ -47,11 +61,59 @@ err:
 	regs->verdict.code = NFT_BREAK;
 }
 
+static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
+				struct nft_regs *regs,
+				const struct nft_pktinfo *pkt)
+{
+	u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
+	struct nft_exthdr *priv = nft_expr_priv(expr);
+	unsigned int i, optl, tcphdr_len, offset;
+	u32 *dest = &regs->data[priv->dreg];
+	struct tcphdr *tcph;
+	u8 *opt;
+
+	if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP)
+		goto err;
+
+	tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buff);
+	if (!tcph)
+		goto err;
+
+	tcphdr_len = __tcp_hdrlen(tcph);
+	if (tcphdr_len < sizeof(*tcph))
+		goto err;
+
+	tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, tcphdr_len, buff);
+	if (!tcph)
+		goto err;
+
+	opt = (u8 *)tcph;
+	for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
+		optl = optlen(opt, i);
+
+		if (priv->type != opt[i])
+			continue;
+
+		if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
+			goto err;
+
+		offset = i + priv->offset;
+		dest[priv->len / NFT_REG32_SIZE] = 0;
+		memcpy(dest, opt + offset, priv->len);
+
+		return;
+	}
+
+err:
+	regs->verdict.code = NFT_BREAK;
+}
+
 static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
 	[NFTA_EXTHDR_DREG]		= { .type = NLA_U32 },
 	[NFTA_EXTHDR_TYPE]		= { .type = NLA_U8 },
 	[NFTA_EXTHDR_OFFSET]		= { .type = NLA_U32 },
 	[NFTA_EXTHDR_LEN]		= { .type = NLA_U32 },
+	[NFTA_EXTHDR_FLAGS]		= { .type = NLA_U32 },
 };
 
 static int nft_exthdr_init(const struct nft_ctx *ctx,
@@ -59,13 +121,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
 			   const struct nlattr * const tb[])
 {
 	struct nft_exthdr *priv = nft_expr_priv(expr);
-	u32 offset, len;
+	u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6;
 	int err;
 
-	if (tb[NFTA_EXTHDR_DREG] == NULL ||
-	    tb[NFTA_EXTHDR_TYPE] == NULL ||
-	    tb[NFTA_EXTHDR_OFFSET] == NULL ||
-	    tb[NFTA_EXTHDR_LEN] == NULL)
+	if (!tb[NFTA_EXTHDR_DREG] ||
+	    !tb[NFTA_EXTHDR_TYPE] ||
+	    !tb[NFTA_EXTHDR_OFFSET] ||
+	    !tb[NFTA_EXTHDR_LEN])
 		return -EINVAL;
 
 	err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset);
@@ -76,10 +138,27 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
+	if (tb[NFTA_EXTHDR_FLAGS]) {
+		err = nft_parse_u32_check(tb[NFTA_EXTHDR_FLAGS], U8_MAX, &flags);
+		if (err < 0)
+			return err;
+
+		if (flags & ~NFT_EXTHDR_F_PRESENT)
+			return -EINVAL;
+	}
+
+	if (tb[NFTA_EXTHDR_OP]) {
+		err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op);
+		if (err < 0)
+			return err;
+	}
+
 	priv->type   = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
 	priv->offset = offset;
 	priv->len    = len;
 	priv->dreg   = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
+	priv->flags  = flags;
+	priv->op     = op;
 
 	return nft_validate_register_store(ctx, priv->dreg, NULL,
 					   NFT_DATA_VALUE, priv->len);
@@ -97,6 +176,10 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len)))
 		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_EXTHDR_OP, htonl(priv->op)))
+		goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
@@ -104,17 +187,45 @@ nla_put_failure:
 }
 
 static struct nft_expr_type nft_exthdr_type;
-static const struct nft_expr_ops nft_exthdr_ops = {
+static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
 	.type		= &nft_exthdr_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
-	.eval		= nft_exthdr_eval,
+	.eval		= nft_exthdr_ipv6_eval,
 	.init		= nft_exthdr_init,
 	.dump		= nft_exthdr_dump,
 };
 
+static const struct nft_expr_ops nft_exthdr_tcp_ops = {
+	.type		= &nft_exthdr_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+	.eval		= nft_exthdr_tcp_eval,
+	.init		= nft_exthdr_init,
+	.dump		= nft_exthdr_dump,
+};
+
+static const struct nft_expr_ops *
+nft_exthdr_select_ops(const struct nft_ctx *ctx,
+		      const struct nlattr * const tb[])
+{
+	u32 op;
+
+	if (!tb[NFTA_EXTHDR_OP])
+		return &nft_exthdr_ipv6_ops;
+
+	op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP]));
+	switch (op) {
+	case NFT_EXTHDR_OP_TCPOPT:
+		return &nft_exthdr_tcp_ops;
+	case NFT_EXTHDR_OP_IPV6:
+		return &nft_exthdr_ipv6_ops;
+	}
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static struct nft_expr_type nft_exthdr_type __read_mostly = {
 	.name		= "exthdr",
-	.ops		= &nft_exthdr_ops,
+	.select_ops	= &nft_exthdr_select_ops,
 	.policy		= nft_exthdr_policy,
 	.maxattr	= NFTA_EXTHDR_MAX,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 66c7f4b4c49b..e1f5ca9b423b 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -154,13 +154,36 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 				*dest = PACKET_BROADCAST;
 			break;
 		case NFPROTO_IPV6:
-			if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
+			*dest = PACKET_MULTICAST;
+			break;
+		case NFPROTO_NETDEV:
+			switch (skb->protocol) {
+			case htons(ETH_P_IP): {
+				int noff = skb_network_offset(skb);
+				struct iphdr *iph, _iph;
+
+				iph = skb_header_pointer(skb, noff,
+							 sizeof(_iph), &_iph);
+				if (!iph)
+					goto err;
+
+				if (ipv4_is_multicast(iph->daddr))
+					*dest = PACKET_MULTICAST;
+				else
+					*dest = PACKET_BROADCAST;
+
+				break;
+			}
+			case htons(ETH_P_IPV6):
 				*dest = PACKET_MULTICAST;
-			else
-				*dest = PACKET_BROADCAST;
+				break;
+			default:
+				WARN_ON_ONCE(1);
+				goto err;
+			}
 			break;
 		default:
-			WARN_ON(1);
+			WARN_ON_ONCE(1);
 			goto err;
 		}
 		break;
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
new file mode 100644
index 000000000000..152d226552c1
--- /dev/null
+++ b/net/netfilter/nft_set_bitmap.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2017 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+/* This bitmap uses two bits to represent one element. These two bits determine
+ * the element state in the current and the future generation.
+ *
+ * An element can be in three states. The generation cursor is represented using
+ * the ^ character, note that this cursor shifts on every succesful transaction.
+ * If no transaction is going on, we observe all elements are in the following
+ * state:
+ *
+ * 11 = this element is active in the current generation. In case of no updates,
+ * ^    it stays active in the next generation.
+ * 00 = this element is inactive in the current generation. In case of no
+ * ^    updates, it stays inactive in the next generation.
+ *
+ * On transaction handling, we observe these two temporary states:
+ *
+ * 01 = this element is inactive in the current generation and it becomes active
+ * ^    in the next one. This happens when the element is inserted but commit
+ *      path has not yet been executed yet, so activation is still pending. On
+ *      transaction abortion, the element is removed.
+ * 10 = this element is active in the current generation and it becomes inactive
+ * ^    in the next one. This happens when the element is deactivated but commit
+ *      path has not yet been executed yet, so removal is still pending. On
+ *      transation abortion, the next generation bit is reset to go back to
+ *      restore its previous state.
+ */
+struct nft_bitmap {
+	u16	bitmap_size;
+	u8	bitmap[];
+};
+
+static inline void nft_bitmap_location(u32 key, u32 *idx, u32 *off)
+{
+	u32 k = (key << 1);
+
+	*idx = k / BITS_PER_BYTE;
+	*off = k % BITS_PER_BYTE;
+}
+
+/* Fetch the two bits that represent the element and check if it is active based
+ * on the generation mask.
+ */
+static inline bool
+nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
+{
+	return (bitmap[idx] & (0x3 << off)) & (genmask << off);
+}
+
+static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+			      const u32 *key, const struct nft_set_ext **ext)
+{
+	const struct nft_bitmap *priv = nft_set_priv(set);
+	u8 genmask = nft_genmask_cur(net);
+	u32 idx, off;
+
+	nft_bitmap_location(*key, &idx, &off);
+
+	return nft_bitmap_active(priv->bitmap, idx, off, genmask);
+}
+
+static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
+			     const struct nft_set_elem *elem,
+			     struct nft_set_ext **_ext)
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+	struct nft_set_ext *ext = elem->priv;
+	u8 genmask = nft_genmask_next(net);
+	u32 idx, off;
+
+	nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off);
+	if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
+		return -EEXIST;
+
+	/* Enter 01 state. */
+	priv->bitmap[idx] |= (genmask << off);
+
+	return 0;
+}
+
+static void nft_bitmap_remove(const struct net *net,
+			      const struct nft_set *set,
+			      const struct nft_set_elem *elem)
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+	struct nft_set_ext *ext = elem->priv;
+	u8 genmask = nft_genmask_next(net);
+	u32 idx, off;
+
+	nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off);
+	/* Enter 00 state. */
+	priv->bitmap[idx] &= ~(genmask << off);
+}
+
+static void nft_bitmap_activate(const struct net *net,
+				const struct nft_set *set,
+				const struct nft_set_elem *elem)
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+	struct nft_set_ext *ext = elem->priv;
+	u8 genmask = nft_genmask_next(net);
+	u32 idx, off;
+
+	nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off);
+	/* Enter 11 state. */
+	priv->bitmap[idx] |= (genmask << off);
+}
+
+static bool nft_bitmap_flush(const struct net *net,
+			     const struct nft_set *set, void *ext)
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+	u8 genmask = nft_genmask_next(net);
+	u32 idx, off;
+
+	nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off);
+	/* Enter 10 state, similar to deactivation. */
+	priv->bitmap[idx] &= ~(genmask << off);
+
+	return true;
+}
+
+static struct nft_set_ext *nft_bitmap_ext_alloc(const struct nft_set *set,
+						const struct nft_set_elem *elem)
+{
+	struct nft_set_ext_tmpl tmpl;
+	struct nft_set_ext *ext;
+
+	nft_set_ext_prepare(&tmpl);
+	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+
+	ext = kzalloc(tmpl.len, GFP_KERNEL);
+	if (!ext)
+		return NULL;
+
+	nft_set_ext_init(ext, &tmpl);
+	memcpy(nft_set_ext_key(ext), elem->key.val.data, set->klen);
+
+	return ext;
+}
+
+static void *nft_bitmap_deactivate(const struct net *net,
+				   const struct nft_set *set,
+				   const struct nft_set_elem *elem)
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+	u8 genmask = nft_genmask_next(net);
+	struct nft_set_ext *ext;
+	u32 idx, off, key = 0;
+
+	memcpy(&key, elem->key.val.data, set->klen);
+	nft_bitmap_location(key, &idx, &off);
+
+	if (!nft_bitmap_active(priv->bitmap, idx, off, genmask))
+		return NULL;
+
+	/* We have no real set extension since this is a bitmap, allocate this
+	 * dummy object that is released from the commit/abort path.
+	 */
+	ext = nft_bitmap_ext_alloc(set, elem);
+	if (!ext)
+		return NULL;
+
+	/* Enter 10 state. */
+	priv->bitmap[idx] &= ~(genmask << off);
+
+	return ext;
+}
+
+static void nft_bitmap_walk(const struct nft_ctx *ctx,
+			    struct nft_set *set,
+			    struct nft_set_iter *iter)
+{
+	const struct nft_bitmap *priv = nft_set_priv(set);
+	struct nft_set_ext_tmpl tmpl;
+	struct nft_set_elem elem;
+	struct nft_set_ext *ext;
+	int idx, off;
+	u16 key;
+
+	nft_set_ext_prepare(&tmpl);
+	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+
+	for (idx = 0; idx < priv->bitmap_size; idx++) {
+		for (off = 0; off < BITS_PER_BYTE; off += 2) {
+			if (iter->count < iter->skip)
+				goto cont;
+
+			if (!nft_bitmap_active(priv->bitmap, idx, off,
+					       iter->genmask))
+				goto cont;
+
+			ext = kzalloc(tmpl.len, GFP_KERNEL);
+			if (!ext) {
+				iter->err = -ENOMEM;
+				return;
+			}
+			nft_set_ext_init(ext, &tmpl);
+			key = ((idx * BITS_PER_BYTE) + off) >> 1;
+			memcpy(nft_set_ext_key(ext), &key, set->klen);
+
+			elem.priv = ext;
+			iter->err = iter->fn(ctx, set, iter, &elem);
+
+			/* On set flush, this dummy extension object is released
+			 * from the commit/abort path.
+			 */
+			if (!iter->flush)
+				kfree(ext);
+
+			if (iter->err < 0)
+				return;
+cont:
+			iter->count++;
+		}
+	}
+}
+
+/* The bitmap size is pow(2, key length in bits) / bits per byte. This is
+ * multiplied by two since each element takes two bits. For 8 bit keys, the
+ * bitmap consumes 66 bytes. For 16 bit keys, 16388 bytes.
+ */
+static inline u32 nft_bitmap_size(u32 klen)
+{
+	return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
+}
+
+static inline u32 nft_bitmap_total_size(u32 klen)
+{
+	return sizeof(struct nft_bitmap) + nft_bitmap_size(klen);
+}
+
+static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[])
+{
+	u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
+
+	return nft_bitmap_total_size(klen);
+}
+
+static int nft_bitmap_init(const struct nft_set *set,
+			 const struct nft_set_desc *desc,
+			 const struct nlattr * const nla[])
+{
+	struct nft_bitmap *priv = nft_set_priv(set);
+
+	priv->bitmap_size = nft_bitmap_size(set->klen);
+
+	return 0;
+}
+
+static void nft_bitmap_destroy(const struct nft_set *set)
+{
+}
+
+static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
+				struct nft_set_estimate *est)
+{
+	/* Make sure bitmaps we don't get bitmaps larger than 16 Kbytes. */
+	if (desc->klen > 2)
+		return false;
+
+	est->size   = nft_bitmap_total_size(desc->klen);
+	est->lookup = NFT_SET_CLASS_O_1;
+	est->space  = NFT_SET_CLASS_O_1;
+
+	return true;
+}
+
+static struct nft_set_ops nft_bitmap_ops __read_mostly = {
+	.privsize	= nft_bitmap_privsize,
+	.estimate	= nft_bitmap_estimate,
+	.init		= nft_bitmap_init,
+	.destroy	= nft_bitmap_destroy,
+	.insert		= nft_bitmap_insert,
+	.remove		= nft_bitmap_remove,
+	.deactivate	= nft_bitmap_deactivate,
+	.flush		= nft_bitmap_flush,
+	.activate	= nft_bitmap_activate,
+	.lookup		= nft_bitmap_lookup,
+	.walk		= nft_bitmap_walk,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_bitmap_module_init(void)
+{
+	return nft_register_set(&nft_bitmap_ops);
+}
+
+static void __exit nft_bitmap_module_exit(void)
+{
+	nft_unregister_set(&nft_bitmap_ops);
+}
+
+module_init(nft_bitmap_module_init);
+module_exit(nft_bitmap_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index e36069fb76ae..5f652720fc78 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -167,8 +167,8 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set,
 	nft_set_elem_clear_busy(&he->ext);
 }
 
-static bool nft_hash_deactivate_one(const struct net *net,
-				    const struct nft_set *set, void *priv)
+static bool nft_hash_flush(const struct net *net,
+			   const struct nft_set *set, void *priv)
 {
 	struct nft_hash_elem *he = priv;
 
@@ -195,7 +195,7 @@ static void *nft_hash_deactivate(const struct net *net,
 	rcu_read_lock();
 	he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
 	if (he != NULL &&
-	    !nft_hash_deactivate_one(net, set, he))
+	    !nft_hash_flush(net, set, he))
 		he = NULL;
 
 	rcu_read_unlock();
@@ -203,7 +203,8 @@ static void *nft_hash_deactivate(const struct net *net,
 	return he;
 }
 
-static void nft_hash_remove(const struct nft_set *set,
+static void nft_hash_remove(const struct net *net,
+			    const struct nft_set *set,
 			    const struct nft_set_elem *elem)
 {
 	struct nft_hash *priv = nft_set_priv(set);
@@ -383,7 +384,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
 		est->size = esize + 2 * sizeof(struct nft_hash_elem *);
 	}
 
-	est->class = NFT_SET_CLASS_O_1;
+	est->lookup = NFT_SET_CLASS_O_1;
+	est->space  = NFT_SET_CLASS_O_N;
 
 	return true;
 }
@@ -397,12 +399,12 @@ static struct nft_set_ops nft_hash_ops __read_mostly = {
 	.insert		= nft_hash_insert,
 	.activate	= nft_hash_activate,
 	.deactivate	= nft_hash_deactivate,
-	.deactivate_one	= nft_hash_deactivate_one,
+	.flush		= nft_hash_flush,
 	.remove		= nft_hash_remove,
 	.lookup		= nft_hash_lookup,
 	.update		= nft_hash_update,
 	.walk		= nft_hash_walk,
-	.features	= NFT_SET_MAP | NFT_SET_TIMEOUT,
+	.features	= NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
 	.owner		= THIS_MODULE,
 };
 
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index f06f55ee516d..71e8fb886a73 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -151,7 +151,8 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 	return err;
 }
 
-static void nft_rbtree_remove(const struct nft_set *set,
+static void nft_rbtree_remove(const struct net *net,
+			      const struct nft_set *set,
 			      const struct nft_set_elem *elem)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
@@ -171,8 +172,8 @@ static void nft_rbtree_activate(const struct net *net,
 	nft_set_elem_change_active(net, set, &rbe->ext);
 }
 
-static bool nft_rbtree_deactivate_one(const struct net *net,
-				      const struct nft_set *set, void *priv)
+static bool nft_rbtree_flush(const struct net *net,
+			     const struct nft_set *set, void *priv)
 {
 	struct nft_rbtree_elem *rbe = priv;
 
@@ -213,7 +214,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
 				parent = parent->rb_right;
 				continue;
 			}
-			nft_rbtree_deactivate_one(net, set, rbe);
+			nft_rbtree_flush(net, set, rbe);
 			return rbe;
 		}
 	}
@@ -290,7 +291,8 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
 	else
 		est->size = nsize;
 
-	est->class = NFT_SET_CLASS_O_LOG_N;
+	est->lookup = NFT_SET_CLASS_O_LOG_N;
+	est->space  = NFT_SET_CLASS_O_N;
 
 	return true;
 }
@@ -304,11 +306,11 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = {
 	.insert		= nft_rbtree_insert,
 	.remove		= nft_rbtree_remove,
 	.deactivate	= nft_rbtree_deactivate,
-	.deactivate_one	= nft_rbtree_deactivate_one,
+	.flush		= nft_rbtree_flush,
 	.activate	= nft_rbtree_activate,
 	.lookup		= nft_rbtree_lookup,
 	.walk		= nft_rbtree_walk,
-	.features	= NFT_SET_INTERVAL | NFT_SET_MAP,
+	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT,
 	.owner		= THIS_MODULE,
 };
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 2ff499680cc6..14857afc9937 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -262,6 +262,60 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
 }
 EXPORT_SYMBOL_GPL(xt_request_find_target);
 
+
+static int xt_obj_to_user(u16 __user *psize, u16 size,
+			  void __user *pname, const char *name,
+			  u8 __user *prev, u8 rev)
+{
+	if (put_user(size, psize))
+		return -EFAULT;
+	if (copy_to_user(pname, name, strlen(name) + 1))
+		return -EFAULT;
+	if (put_user(rev, prev))
+		return -EFAULT;
+
+	return 0;
+}
+
+#define XT_OBJ_TO_USER(U, K, TYPE, C_SIZE)				\
+	xt_obj_to_user(&U->u.TYPE##_size, C_SIZE ? : K->u.TYPE##_size,	\
+		       U->u.user.name, K->u.kernel.TYPE->name,		\
+		       &U->u.user.revision, K->u.kernel.TYPE->revision)
+
+int xt_data_to_user(void __user *dst, const void *src,
+		    int usersize, int size)
+{
+	usersize = usersize ? : size;
+	if (copy_to_user(dst, src, usersize))
+		return -EFAULT;
+	if (usersize != size && clear_user(dst + usersize, size - usersize))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xt_data_to_user);
+
+#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE)				\
+	xt_data_to_user(U->data, K->data,				\
+			K->u.kernel.TYPE->usersize,			\
+			C_SIZE ? : K->u.kernel.TYPE->TYPE##size)
+
+int xt_match_to_user(const struct xt_entry_match *m,
+		     struct xt_entry_match __user *u)
+{
+	return XT_OBJ_TO_USER(u, m, match, 0) ||
+	       XT_DATA_TO_USER(u, m, match, 0);
+}
+EXPORT_SYMBOL_GPL(xt_match_to_user);
+
+int xt_target_to_user(const struct xt_entry_target *t,
+		      struct xt_entry_target __user *u)
+{
+	return XT_OBJ_TO_USER(u, t, target, 0) ||
+	       XT_DATA_TO_USER(u, t, target, 0);
+}
+EXPORT_SYMBOL_GPL(xt_target_to_user);
+
 static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
 {
 	const struct xt_match *m;
@@ -565,17 +619,14 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
 	int off = xt_compat_match_offset(match);
 	u_int16_t msize = m->u.user.match_size - off;
 
-	if (copy_to_user(cm, m, sizeof(*cm)) ||
-	    put_user(msize, &cm->u.user.match_size) ||
-	    copy_to_user(cm->u.user.name, m->u.kernel.match->name,
-			 strlen(m->u.kernel.match->name) + 1))
+	if (XT_OBJ_TO_USER(cm, m, match, msize))
 		return -EFAULT;
 
 	if (match->compat_to_user) {
 		if (match->compat_to_user((void __user *)cm->data, m->data))
 			return -EFAULT;
 	} else {
-		if (copy_to_user(cm->data, m->data, msize - sizeof(*cm)))
+		if (XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm)))
 			return -EFAULT;
 	}
 
@@ -616,7 +667,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
 	    COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset)
 		return -EINVAL;
 
-	/* compat_xt_entry match has less strict aligment requirements,
+	/* compat_xt_entry match has less strict alignment requirements,
 	 * otherwise they are identical.  In case of padding differences
 	 * we need to add compat version of xt_check_entry_match.
 	 */
@@ -923,17 +974,14 @@ int xt_compat_target_to_user(const struct xt_entry_target *t,
 	int off = xt_compat_target_offset(target);
 	u_int16_t tsize = t->u.user.target_size - off;
 
-	if (copy_to_user(ct, t, sizeof(*ct)) ||
-	    put_user(tsize, &ct->u.user.target_size) ||
-	    copy_to_user(ct->u.user.name, t->u.kernel.target->name,
-			 strlen(t->u.kernel.target->name) + 1))
+	if (XT_OBJ_TO_USER(ct, t, target, tsize))
 		return -EFAULT;
 
 	if (target->compat_to_user) {
 		if (target->compat_to_user((void __user *)ct->data, t->data))
 			return -EFAULT;
 	} else {
-		if (copy_to_user(ct->data, t->data, tsize - sizeof(*ct)))
+		if (XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct)))
 			return -EFAULT;
 	}
 
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 95c750358747..b008db0184b8 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -23,15 +23,14 @@
 static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
 {
 	/* Previously seen (loopback)? Ignore. */
-	if (skb->nfct != NULL)
+	if (skb->_nfct != 0)
 		return XT_CONTINUE;
 
 	/* special case the untracked ct : we want the percpu object */
 	if (!ct)
 		ct = nf_ct_untracked_get();
 	atomic_inc(&ct->ct_general.use);
-	skb->nfct = &ct->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
+	nf_ct_set(skb, ct, IP_CT_NEW);
 
 	return XT_CONTINUE;
 }
@@ -373,6 +372,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
 		.name		= "CT",
 		.family		= NFPROTO_UNSPEC,
 		.targetsize	= sizeof(struct xt_ct_target_info),
+		.usersize	= offsetof(struct xt_ct_target_info, ct),
 		.checkentry	= xt_ct_tg_check_v0,
 		.destroy	= xt_ct_tg_destroy_v0,
 		.target		= xt_ct_target_v0,
@@ -384,6 +384,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
 		.family		= NFPROTO_UNSPEC,
 		.revision	= 1,
 		.targetsize	= sizeof(struct xt_ct_target_info_v1),
+		.usersize	= offsetof(struct xt_ct_target_info, ct),
 		.checkentry	= xt_ct_tg_check_v1,
 		.destroy	= xt_ct_tg_destroy_v1,
 		.target		= xt_ct_target_v1,
@@ -395,6 +396,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
 		.family		= NFPROTO_UNSPEC,
 		.revision	= 2,
 		.targetsize	= sizeof(struct xt_ct_target_info_v1),
+		.usersize	= offsetof(struct xt_ct_target_info, ct),
 		.checkentry	= xt_ct_tg_check_v2,
 		.destroy	= xt_ct_tg_destroy_v1,
 		.target		= xt_ct_target_v1,
@@ -407,12 +409,11 @@ static unsigned int
 notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	/* Previously seen (loopback)? Ignore. */
-	if (skb->nfct != NULL)
+	if (skb->_nfct != 0)
 		return XT_CONTINUE;
 
-	skb->nfct = &nf_ct_untracked_get()->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
+	nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
+	nf_conntrack_get(skb_nfct(skb));
 
 	return XT_CONTINUE;
 }
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 91a373a3f534..498b54fd04d7 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -162,6 +162,7 @@ static struct xt_target xt_rateest_tg_reg __read_mostly = {
 	.checkentry = xt_rateest_tg_checkentry,
 	.destroy    = xt_rateest_tg_destroy,
 	.targetsize = sizeof(struct xt_rateest_target_info),
+	.usersize   = offsetof(struct xt_rateest_target_info, est),
 	.me         = THIS_MODULE,
 };
 
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 1c57ace75ae6..86b0580b2216 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -133,6 +133,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
 		.family     = NFPROTO_IPV4,
 		.target     = tee_tg4,
 		.targetsize = sizeof(struct xt_tee_tginfo),
+		.usersize   = offsetof(struct xt_tee_tginfo, priv),
 		.checkentry = tee_tg_check,
 		.destroy    = tee_tg_destroy,
 		.me         = THIS_MODULE,
@@ -144,6 +145,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
 		.family     = NFPROTO_IPV6,
 		.target     = tee_tg6,
 		.targetsize = sizeof(struct xt_tee_tginfo),
+		.usersize   = offsetof(struct xt_tee_tginfo, priv),
 		.checkentry = tee_tg_check,
 		.destroy    = tee_tg_destroy,
 		.me         = THIS_MODULE,
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 2dedaa23ab0a..38986a95216c 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -110,6 +110,7 @@ static struct xt_match bpf_mt_reg[] __read_mostly = {
 		.match		= bpf_mt,
 		.destroy	= bpf_mt_destroy,
 		.matchsize	= sizeof(struct xt_bpf_info),
+		.usersize	= offsetof(struct xt_bpf_info, filter),
 		.me		= THIS_MODULE,
 	},
 	{
@@ -120,6 +121,7 @@ static struct xt_match bpf_mt_reg[] __read_mostly = {
 		.match		= bpf_mt_v1,
 		.destroy	= bpf_mt_destroy_v1,
 		.matchsize	= sizeof(struct xt_bpf_info_v1),
+		.usersize	= offsetof(struct xt_bpf_info_v1, filter),
 		.me		= THIS_MODULE,
 	},
 };
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index a086a914865f..1db1ce59079f 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -122,6 +122,7 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = {
 		.checkentry	= cgroup_mt_check_v1,
 		.match		= cgroup_mt_v1,
 		.matchsize	= sizeof(struct xt_cgroup_info_v1),
+		.usersize	= offsetof(struct xt_cgroup_info_v1, priv),
 		.destroy	= cgroup_mt_destroy_v1,
 		.me		= THIS_MODULE,
 		.hooks		= (1 << NF_INET_LOCAL_OUT) |
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 2aff2b7c4689..b8fd4ab762ed 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -218,7 +218,7 @@ count_tree(struct net *net, struct rb_root *root,
 		int diff;
 		bool addit;
 
-		rbconn = container_of(*rbnode, struct xt_connlimit_rb, node);
+		rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node);
 
 		parent = *rbnode;
 		diff = same_source_net(addr, mask, &rbconn->addr, family);
@@ -398,7 +398,7 @@ static void destroy_tree(struct rb_root *r)
 	struct rb_node *node;
 
 	while ((node = rb_first(r)) != NULL) {
-		rbconn = container_of(node, struct xt_connlimit_rb, node);
+		rbconn = rb_entry(node, struct xt_connlimit_rb, node);
 
 		rb_erase(node, r);
 
@@ -431,6 +431,7 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
 	.checkentry = connlimit_mt_check,
 	.match      = connlimit_mt,
 	.matchsize  = sizeof(struct xt_connlimit_info),
+	.usersize   = offsetof(struct xt_connlimit_info, data),
 	.destroy    = connlimit_mt_destroy,
 	.me         = THIS_MODULE,
 };
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 10063408141d..2a6dfe8b74d3 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -463,23 +463,16 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
 /* Precision saver. */
 static u64 user2credits(u64 user, int revision)
 {
-	if (revision == 1) {
-		/* If multiplying would overflow... */
-		if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
-			/* Divide first. */
-			return div64_u64(user, XT_HASHLIMIT_SCALE)
-				* HZ * CREDITS_PER_JIFFY_v1;
-
-		return div64_u64(user * HZ * CREDITS_PER_JIFFY_v1,
-				 XT_HASHLIMIT_SCALE);
-	} else {
-		if (user > 0xFFFFFFFFFFFFFFFFULL / (HZ*CREDITS_PER_JIFFY))
-			return div64_u64(user, XT_HASHLIMIT_SCALE_v2)
-				* HZ * CREDITS_PER_JIFFY;
+	u64 scale = (revision == 1) ?
+		XT_HASHLIMIT_SCALE : XT_HASHLIMIT_SCALE_v2;
+	u64 cpj = (revision == 1) ?
+		CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY;
 
-		return div64_u64(user * HZ * CREDITS_PER_JIFFY,
-				 XT_HASHLIMIT_SCALE_v2);
-	}
+	/* Avoid overflow: divide the constant operands first */
+	if (scale >= HZ * cpj)
+		return div64_u64(user, div64_u64(scale, HZ * cpj));
+
+	return user * div64_u64(HZ * cpj, scale);
 }
 
 static u32 user2credits_byte(u32 user)
@@ -838,6 +831,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.family         = NFPROTO_IPV4,
 		.match          = hashlimit_mt_v1,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
+		.usersize	= offsetof(struct xt_hashlimit_mtinfo1, hinfo),
 		.checkentry     = hashlimit_mt_check_v1,
 		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
@@ -848,6 +842,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.family         = NFPROTO_IPV4,
 		.match          = hashlimit_mt,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
+		.usersize	= offsetof(struct xt_hashlimit_mtinfo2, hinfo),
 		.checkentry     = hashlimit_mt_check,
 		.destroy        = hashlimit_mt_destroy,
 		.me             = THIS_MODULE,
@@ -859,6 +854,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.family         = NFPROTO_IPV6,
 		.match          = hashlimit_mt_v1,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
+		.usersize	= offsetof(struct xt_hashlimit_mtinfo1, hinfo),
 		.checkentry     = hashlimit_mt_check_v1,
 		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
@@ -869,6 +865,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.family         = NFPROTO_IPV6,
 		.match          = hashlimit_mt,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
+		.usersize	= offsetof(struct xt_hashlimit_mtinfo2, hinfo),
 		.checkentry     = hashlimit_mt_check,
 		.destroy        = hashlimit_mt_destroy,
 		.me             = THIS_MODULE,
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index bef850596558..dab962df1787 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -192,6 +192,8 @@ static struct xt_match limit_mt_reg __read_mostly = {
 	.compatsize       = sizeof(struct compat_xt_rateinfo),
 	.compat_from_user = limit_mt_compat_from_user,
 	.compat_to_user   = limit_mt_compat_to_user,
+#else
+	.usersize         = offsetof(struct xt_rateinfo, prev),
 #endif
 	.me               = THIS_MODULE,
 };
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 57efb703ff18..1ef99151b3ba 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -33,8 +33,7 @@ pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	else if (xt_family(par) == NFPROTO_IPV4 &&
 	    ipv4_is_multicast(ip_hdr(skb)->daddr))
 		type = PACKET_MULTICAST;
-	else if (xt_family(par) == NFPROTO_IPV6 &&
-	    ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
+	else if (xt_family(par) == NFPROTO_IPV6)
 		type = PACKET_MULTICAST;
 	else
 		type = PACKET_BROADCAST;
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 44c8eb4c9d66..10d61a6eed71 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -73,6 +73,7 @@ static struct xt_match quota_mt_reg __read_mostly = {
 	.checkentry = quota_mt_check,
 	.destroy    = quota_mt_destroy,
 	.matchsize  = sizeof(struct xt_quota_info),
+	.usersize   = offsetof(struct xt_quota_info, master),
 	.me         = THIS_MODULE,
 };
 
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 1db02f6fca54..755d2f6693a2 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -133,6 +133,7 @@ static struct xt_match xt_rateest_mt_reg __read_mostly = {
 	.checkentry = xt_rateest_mt_checkentry,
 	.destroy    = xt_rateest_mt_destroy,
 	.matchsize  = sizeof(struct xt_rateest_match_info),
+	.usersize   = offsetof(struct xt_rateest_match_info, est1),
 	.me         = THIS_MODULE,
 };
 
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 0bc3460319c8..423293ee57c2 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -77,6 +77,7 @@ static struct xt_match xt_string_mt_reg __read_mostly = {
 	.match      = string_mt,
 	.destroy    = string_mt_destroy,
 	.matchsize  = sizeof(struct xt_string_info),
+	.usersize   = offsetof(struct xt_string_info, config),
 	.me         = THIS_MODULE,
 };
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 161b628ab2b0..7b73c7c161a9 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1210,9 +1210,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 		skb = nskb;
 	}
 
-	if (!pskb_expand_head(skb, 0, -delta, allocation))
-		skb->truesize -= delta;
-
+	pskb_expand_head(skb, 0, -delta,
+			 (allocation & ~__GFP_DIRECT_RECLAIM) |
+			 __GFP_NOWARN | __GFP_NORETRY);
 	return skb;
 }
 
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 514f7bcf7c63..b1beb2b94ec7 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1074,6 +1074,8 @@ static int execute_masked_set_action(struct sk_buff *skb,
 	case OVS_KEY_ATTR_CT_ZONE:
 	case OVS_KEY_ATTR_CT_MARK:
 	case OVS_KEY_ATTR_CT_LABELS:
+	case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4:
+	case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6:
 		err = -EINVAL;
 		break;
 	}
@@ -1141,12 +1143,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 			      struct sw_flow_key *key,
 			      const struct nlattr *attr, int len)
 {
-	/* Every output action needs a separate clone of 'skb', but the common
-	 * case is just a single output action, so that doing a clone and
-	 * then freeing the original skbuff is wasteful.  So the following code
-	 * is slightly obscure just to avoid that.
-	 */
-	int prev_port = -1;
 	const struct nlattr *a;
 	int rem;
 
@@ -1154,20 +1150,28 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 	     a = nla_next(a, &rem)) {
 		int err = 0;
 
-		if (unlikely(prev_port != -1)) {
-			struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
-
-			if (out_skb)
-				do_output(dp, out_skb, prev_port, key);
+		switch (nla_type(a)) {
+		case OVS_ACTION_ATTR_OUTPUT: {
+			int port = nla_get_u32(a);
+			struct sk_buff *clone;
+
+			/* Every output action needs a separate clone
+			 * of 'skb', In case the output action is the
+			 * last action, cloning can be avoided.
+			 */
+			if (nla_is_last(a, rem)) {
+				do_output(dp, skb, port, key);
+				/* 'skb' has been used for output.
+				 */
+				return 0;
+			}
 
+			clone = skb_clone(skb, GFP_ATOMIC);
+			if (clone)
+				do_output(dp, clone, port, key);
 			OVS_CB(skb)->cutlen = 0;
-			prev_port = -1;
-		}
-
-		switch (nla_type(a)) {
-		case OVS_ACTION_ATTR_OUTPUT:
-			prev_port = nla_get_u32(a);
 			break;
+		}
 
 		case OVS_ACTION_ATTR_TRUNC: {
 			struct ovs_action_trunc *trunc = nla_data(a);
@@ -1257,11 +1261,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		}
 	}
 
-	if (prev_port != -1)
-		do_output(dp, skb, prev_port, key);
-	else
-		consume_skb(skb);
-
+	consume_skb(skb);
 	return 0;
 }
 
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 54253ea5976e..85cd59526670 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -65,6 +65,7 @@ struct ovs_conntrack_info {
 	struct nf_conn *ct;
 	u8 commit : 1;
 	u8 nat : 3;                 /* enum ovs_ct_nat */
+	u8 force : 1;
 	u16 family;
 	struct md_mark mark;
 	struct md_labels labels;
@@ -73,6 +74,8 @@ struct ovs_conntrack_info {
 #endif
 };
 
+static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
+
 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
 
 static u16 key_to_nfproto(const struct sw_flow_key *key)
@@ -129,21 +132,33 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct)
 #endif
 }
 
+/* Guard against conntrack labels max size shrinking below 128 bits. */
+#if NF_CT_LABELS_MAX_SIZE < 16
+#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes
+#endif
+
 static void ovs_ct_get_labels(const struct nf_conn *ct,
 			      struct ovs_key_ct_labels *labels)
 {
 	struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
 
-	if (cl) {
-		size_t len = sizeof(cl->bits);
+	if (cl)
+		memcpy(labels, cl->bits, OVS_CT_LABELS_LEN);
+	else
+		memset(labels, 0, OVS_CT_LABELS_LEN);
+}
 
-		if (len > OVS_CT_LABELS_LEN)
-			len = OVS_CT_LABELS_LEN;
-		else if (len < OVS_CT_LABELS_LEN)
-			memset(labels, 0, OVS_CT_LABELS_LEN);
-		memcpy(labels, cl->bits, len);
+static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key,
+					const struct nf_conntrack_tuple *orig,
+					u8 icmp_proto)
+{
+	key->ct_orig_proto = orig->dst.protonum;
+	if (orig->dst.protonum == icmp_proto) {
+		key->ct.orig_tp.src = htons(orig->dst.u.icmp.type);
+		key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code);
 	} else {
-		memset(labels, 0, OVS_CT_LABELS_LEN);
+		key->ct.orig_tp.src = orig->src.u.all;
+		key->ct.orig_tp.dst = orig->dst.u.all;
 	}
 }
 
@@ -151,13 +166,42 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
 				const struct nf_conntrack_zone *zone,
 				const struct nf_conn *ct)
 {
-	key->ct.state = state;
-	key->ct.zone = zone->id;
+	key->ct_state = state;
+	key->ct_zone = zone->id;
 	key->ct.mark = ovs_ct_get_mark(ct);
 	ovs_ct_get_labels(ct, &key->ct.labels);
+
+	if (ct) {
+		const struct nf_conntrack_tuple *orig;
+
+		/* Use the master if we have one. */
+		if (ct->master)
+			ct = ct->master;
+		orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+
+		/* IP version must match with the master connection. */
+		if (key->eth.type == htons(ETH_P_IP) &&
+		    nf_ct_l3num(ct) == NFPROTO_IPV4) {
+			key->ipv4.ct_orig.src = orig->src.u3.ip;
+			key->ipv4.ct_orig.dst = orig->dst.u3.ip;
+			__ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP);
+			return;
+		} else if (key->eth.type == htons(ETH_P_IPV6) &&
+			   !sw_flow_key_is_nd(key) &&
+			   nf_ct_l3num(ct) == NFPROTO_IPV6) {
+			key->ipv6.ct_orig.src = orig->src.u3.in6;
+			key->ipv6.ct_orig.dst = orig->dst.u3.in6;
+			__ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP);
+			return;
+		}
+	}
+	/* Clear 'ct_orig_proto' to mark the non-existence of conntrack
+	 * original direction key fields.
+	 */
+	key->ct_orig_proto = 0;
 }
 
-/* Update 'key' based on skb->nfct.  If 'post_ct' is true, then OVS has
+/* Update 'key' based on skb->_nfct.  If 'post_ct' is true, then OVS has
  * previously sent the packet to conntrack via the ct action.  If
  * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
  * initialized from the connection status.
@@ -184,7 +228,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 		if (ct->master)
 			state |= OVS_CS_F_RELATED;
 		if (keep_nat_flags) {
-			state |= key->ct.state & OVS_CS_F_NAT_MASK;
+			state |= key->ct_state & OVS_CS_F_NAT_MASK;
 		} else {
 			if (ct->status & IPS_SRC_NAT)
 				state |= OVS_CS_F_SRC_NAT;
@@ -208,44 +252,69 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
 	ovs_ct_update_key(skb, NULL, key, false, false);
 }
 
-int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
+#define IN6_ADDR_INITIALIZER(ADDR) \
+	{ (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \
+	  (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] }
+
+int ovs_ct_put_key(const struct sw_flow_key *swkey,
+		   const struct sw_flow_key *output, struct sk_buff *skb)
 {
-	if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state))
+	if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state))
 		return -EMSGSIZE;
 
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
-	    nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone))
+	    nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone))
 		return -EMSGSIZE;
 
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
-	    nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark))
+	    nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark))
 		return -EMSGSIZE;
 
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
-	    nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels),
-		    &key->ct.labels))
+	    nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels),
+		    &output->ct.labels))
 		return -EMSGSIZE;
 
+	if (swkey->ct_orig_proto) {
+		if (swkey->eth.type == htons(ETH_P_IP)) {
+			struct ovs_key_ct_tuple_ipv4 orig = {
+				output->ipv4.ct_orig.src,
+				output->ipv4.ct_orig.dst,
+				output->ct.orig_tp.src,
+				output->ct.orig_tp.dst,
+				output->ct_orig_proto,
+			};
+			if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
+				    sizeof(orig), &orig))
+				return -EMSGSIZE;
+		} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+			struct ovs_key_ct_tuple_ipv6 orig = {
+				IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src),
+				IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
+				output->ct.orig_tp.src,
+				output->ct.orig_tp.dst,
+				output->ct_orig_proto,
+			};
+			if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
+				    sizeof(orig), &orig))
+				return -EMSGSIZE;
+		}
+	}
+
 	return 0;
 }
 
-static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
+static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
 			   u32 ct_mark, u32 mask)
 {
 #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct;
 	u32 new_mark;
 
-	/* The connection could be invalid, in which case set_mark is no-op. */
-	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct)
-		return 0;
-
 	new_mark = ct_mark | (ct->mark & ~(mask));
 	if (ct->mark != new_mark) {
 		ct->mark = new_mark;
-		nf_conntrack_event_cache(IPCT_MARK, ct);
+		if (nf_ct_is_confirmed(ct))
+			nf_conntrack_event_cache(IPCT_MARK, ct);
 		key->ct.mark = new_mark;
 	}
 
@@ -255,34 +324,83 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
 #endif
 }
 
-static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
-			     const struct ovs_key_ct_labels *labels,
-			     const struct ovs_key_ct_labels *mask)
+static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct)
 {
-	enum ip_conntrack_info ctinfo;
 	struct nf_conn_labels *cl;
-	struct nf_conn *ct;
-	int err;
-
-	/* The connection could be invalid, in which case set_label is no-op.*/
-	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct)
-		return 0;
 
 	cl = nf_ct_labels_find(ct);
 	if (!cl) {
 		nf_ct_labels_ext_add(ct);
 		cl = nf_ct_labels_find(ct);
 	}
-	if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN)
+
+	return cl;
+}
+
+/* Initialize labels for a new, yet to be committed conntrack entry.  Note that
+ * since the new connection is not yet confirmed, and thus no-one else has
+ * access to it's labels, we simply write them over.
+ */
+static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
+			      const struct ovs_key_ct_labels *labels,
+			      const struct ovs_key_ct_labels *mask)
+{
+	struct nf_conn_labels *cl, *master_cl;
+	bool have_mask = labels_nonzero(mask);
+
+	/* Inherit master's labels to the related connection? */
+	master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL;
+
+	if (!master_cl && !have_mask)
+		return 0;   /* Nothing to do. */
+
+	cl = ovs_ct_get_conn_labels(ct);
+	if (!cl)
+		return -ENOSPC;
+
+	/* Inherit the master's labels, if any. */
+	if (master_cl)
+		*cl = *master_cl;
+
+	if (have_mask) {
+		u32 *dst = (u32 *)cl->bits;
+		int i;
+
+		for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
+			dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
+				(labels->ct_labels_32[i]
+				 & mask->ct_labels_32[i]);
+	}
+
+	/* Labels are included in the IPCTNL_MSG_CT_NEW event only if the
+	 * IPCT_LABEL bit it set in the event cache.
+	 */
+	nf_conntrack_event_cache(IPCT_LABEL, ct);
+
+	memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
+
+	return 0;
+}
+
+static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
+			     const struct ovs_key_ct_labels *labels,
+			     const struct ovs_key_ct_labels *mask)
+{
+	struct nf_conn_labels *cl;
+	int err;
+
+	cl = ovs_ct_get_conn_labels(ct);
+	if (!cl)
 		return -ENOSPC;
 
-	err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask,
-				    OVS_CT_LABELS_LEN / sizeof(u32));
+	err = nf_connlabels_replace(ct, labels->ct_labels_32,
+				    mask->ct_labels_32,
+				    OVS_CT_LABELS_LEN_32);
 	if (err)
 		return err;
 
-	ovs_ct_get_labels(ct, &key->ct.labels);
+	memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
+
 	return 0;
 }
 
@@ -421,16 +539,16 @@ ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
 
 /* Find an existing connection which this packet belongs to without
  * re-attributing statistics or modifying the connection state.  This allows an
- * skb->nfct lost due to an upcall to be recovered during actions execution.
+ * skb->_nfct lost due to an upcall to be recovered during actions execution.
  *
  * Must be called with rcu_read_lock.
  *
- * On success, populates skb->nfct and skb->nfctinfo, and returns the
- * connection.  Returns NULL if there is no existing entry.
+ * On success, populates skb->_nfct and returns the connection.  Returns NULL
+ * if there is no existing entry.
  */
 static struct nf_conn *
 ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
-		     u8 l3num, struct sk_buff *skb)
+		     u8 l3num, struct sk_buff *skb, bool natted)
 {
 	struct nf_conntrack_l3proto *l3proto;
 	struct nf_conntrack_l4proto *l4proto;
@@ -453,6 +571,17 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
 		return NULL;
 	}
 
+	/* Must invert the tuple if skb has been transformed by NAT. */
+	if (natted) {
+		struct nf_conntrack_tuple inverse;
+
+		if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
+			pr_debug("ovs_ct_find_existing: Inversion failed!\n");
+			return NULL;
+		}
+		tuple = inverse;
+	}
+
 	/* look for tuple match */
 	h = nf_conntrack_find_get(net, zone, &tuple);
 	if (!h)
@@ -460,12 +589,18 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
 
 	ct = nf_ct_tuplehash_to_ctrack(h);
 
-	skb->nfct = &ct->ct_general;
-	skb->nfctinfo = ovs_ct_get_info(h);
+	/* Inverted packet tuple matches the reverse direction conntrack tuple,
+	 * select the other tuplehash to get the right 'ctinfo' bits for this
+	 * packet.
+	 */
+	if (natted)
+		h = &ct->tuplehash[!h->tuple.dst.dir];
+
+	nf_ct_set(skb, ct, ovs_ct_get_info(h));
 	return ct;
 }
 
-/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
+/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
 static bool skb_nfct_cached(struct net *net,
 			    const struct sw_flow_key *key,
 			    const struct ovs_conntrack_info *info,
@@ -476,14 +611,19 @@ static bool skb_nfct_cached(struct net *net,
 
 	ct = nf_ct_get(skb, &ctinfo);
 	/* If no ct, check if we have evidence that an existing conntrack entry
-	 * might be found for this skb.  This happens when we lose a skb->nfct
+	 * might be found for this skb.  This happens when we lose a skb->_nfct
 	 * due to an upcall.  If the connection was not confirmed, it is not
 	 * cached and needs to be run through conntrack again.
 	 */
-	if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
-	    !(key->ct.state & OVS_CS_F_INVALID) &&
-	    key->ct.zone == info->zone.id)
-		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
+	if (!ct && key->ct_state & OVS_CS_F_TRACKED &&
+	    !(key->ct_state & OVS_CS_F_INVALID) &&
+	    key->ct_zone == info->zone.id) {
+		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
+					  !!(key->ct_state
+					     & OVS_CS_F_NAT_MASK));
+		if (ct)
+			nf_ct_get(skb, &ctinfo);
+	}
 	if (!ct)
 		return false;
 	if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -497,6 +637,18 @@ static bool skb_nfct_cached(struct net *net,
 		if (help && rcu_access_pointer(help->helper) != info->helper)
 			return false;
 	}
+	/* Force conntrack entry direction to the current packet? */
+	if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+		/* Delete the conntrack entry if confirmed, else just release
+		 * the reference.
+		 */
+		if (nf_ct_is_confirmed(ct))
+			nf_ct_delete(ct, 0, 0);
+		else
+			nf_conntrack_put(&ct->ct_general);
+		nf_ct_set(skb, NULL, 0);
+		return false;
+	}
 
 	return true;
 }
@@ -591,7 +743,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
 	if (maniptype == NF_NAT_MANIP_SRC) {
 		__be16 src;
 
-		key->ct.state |= OVS_CS_F_SRC_NAT;
+		key->ct_state |= OVS_CS_F_SRC_NAT;
 		if (key->eth.type == htons(ETH_P_IP))
 			key->ipv4.addr.src = ip_hdr(skb)->saddr;
 		else if (key->eth.type == htons(ETH_P_IPV6))
@@ -613,7 +765,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
 	} else {
 		__be16 dst;
 
-		key->ct.state |= OVS_CS_F_DST_NAT;
+		key->ct_state |= OVS_CS_F_DST_NAT;
 		if (key->eth.type == htons(ETH_P_IP))
 			key->ipv4.addr.dst = ip_hdr(skb)->daddr;
 		else if (key->eth.type == htons(ETH_P_IPV6))
@@ -700,7 +852,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
 /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
  * not done already.  Update key with new CT state after passing the packet
  * through conntrack.
- * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
+ * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be
  * set to NULL and 0 will be returned.
  */
 static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
@@ -722,11 +874,10 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 
 		/* Associate skb with specified zone. */
 		if (tmpl) {
-			if (skb->nfct)
-				nf_conntrack_put(skb->nfct);
+			if (skb_nfct(skb))
+				nf_conntrack_put(skb_nfct(skb));
 			nf_conntrack_get(&tmpl->ct_general);
-			skb->nfct = &tmpl->ct_general;
-			skb->nfctinfo = IP_CT_NEW;
+			nf_ct_set(skb, tmpl, IP_CT_NEW);
 		}
 
 		err = nf_conntrack_in(net, info->family,
@@ -738,7 +889,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 		 * NAT after the nf_conntrack_in() call.  We can actually clear
 		 * the whole state, as it will be re-initialized below.
 		 */
-		key->ct.state = 0;
+		key->ct_state = 0;
 
 		/* Update the key, but keep the NAT flags. */
 		ovs_ct_update_key(skb, info, key, true, true);
@@ -754,9 +905,9 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 		 *
 		 * NAT will be done only if the CT action has NAT, and only
 		 * once per packet (per zone), as guarded by the NAT bits in
-		 * the key->ct.state.
+		 * the key->ct_state.
 		 */
-		if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+		if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) &&
 		    (nf_ct_is_confirmed(ct) || info->commit) &&
 		    ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
 			return -EINVAL;
@@ -820,7 +971,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 		if (err)
 			return err;
 
-		ct = (struct nf_conn *)skb->nfct;
+		ct = (struct nf_conn *)skb_nfct(skb);
 		if (ct)
 			nf_ct_deliver_cached_events(ct);
 	}
@@ -832,8 +983,8 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
 {
 	size_t i;
 
-	for (i = 0; i < sizeof(*labels); i++)
-		if (labels->ct_labels[i])
+	for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
+		if (labels->ct_labels_32[i])
 			return true;
 
 	return false;
@@ -844,24 +995,36 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 			 const struct ovs_conntrack_info *info,
 			 struct sk_buff *skb)
 {
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
 	int err;
 
 	err = __ovs_ct_lookup(net, key, info, skb);
 	if (err)
 		return err;
 
+	/* The connection could be invalid, in which case this is a no-op.*/
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return 0;
+
 	/* Apply changes before confirming the connection so that the initial
 	 * conntrack NEW netlink event carries the values given in the CT
 	 * action.
 	 */
 	if (info->mark.mask) {
-		err = ovs_ct_set_mark(skb, key, info->mark.value,
+		err = ovs_ct_set_mark(ct, key, info->mark.value,
 				      info->mark.mask);
 		if (err)
 			return err;
 	}
-	if (labels_nonzero(&info->labels.mask)) {
-		err = ovs_ct_set_labels(skb, key, &info->labels.value,
+	if (!nf_ct_is_confirmed(ct)) {
+		err = ovs_ct_init_labels(ct, key, &info->labels.value,
+					 &info->labels.mask);
+		if (err)
+			return err;
+	} else if (labels_nonzero(&info->labels.mask)) {
+		err = ovs_ct_set_labels(ct, key, &info->labels.value,
 					&info->labels.mask);
 		if (err)
 			return err;
@@ -1063,6 +1226,7 @@ static int parse_nat(const struct nlattr *attr,
 
 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0, .maxlen = 0 },
+	[OVS_CT_ATTR_FORCE_COMMIT]	= { .minlen = 0, .maxlen = 0 },
 	[OVS_CT_ATTR_ZONE]	= { .minlen = sizeof(u16),
 				    .maxlen = sizeof(u16) },
 	[OVS_CT_ATTR_MARK]	= { .minlen = sizeof(struct md_mark),
@@ -1102,6 +1266,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
 		}
 
 		switch (type) {
+		case OVS_CT_ATTR_FORCE_COMMIT:
+			info->force = true;
+			/* fall through. */
 		case OVS_CT_ATTR_COMMIT:
 			info->commit = true;
 			break;
@@ -1328,7 +1495,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 	if (!start)
 		return -EMSGSIZE;
 
-	if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT))
+	if (ct_info->commit && nla_put_flag(skb, ct_info->force
+					    ? OVS_CT_ATTR_FORCE_COMMIT
+					    : OVS_CT_ATTR_COMMIT))
 		return -EMSGSIZE;
 	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
 	    nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 8f6230bd6183..bc7efd1867ab 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -32,7 +32,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
 		   const struct ovs_conntrack_info *);
 
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb);
+int ovs_ct_put_key(const struct sw_flow_key *swkey,
+		   const struct sw_flow_key *output, struct sk_buff *skb);
 void ovs_ct_free_action(const struct nlattr *a);
 
 #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
@@ -75,13 +76,18 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 static inline void ovs_ct_fill_key(const struct sk_buff *skb,
 				   struct sw_flow_key *key)
 {
-	key->ct.state = 0;
-	key->ct.zone = 0;
+	key->ct_state = 0;
+	key->ct_zone = 0;
 	key->ct.mark = 0;
 	memset(&key->ct.labels, 0, sizeof(key->ct.labels));
+	/* Clear 'ct_orig_proto' to mark the non-existence of original
+	 * direction key fields.
+	 */
+	key->ct_orig_proto = 0;
 }
 
-static inline int ovs_ct_put_key(const struct sw_flow_key *key,
+static inline int ovs_ct_put_key(const struct sw_flow_key *swkey,
+				 const struct sw_flow_key *output,
 				 struct sk_buff *skb)
 {
 	return 0;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 2c0a00f7f1b7..9d4bb8eb63f2 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -765,7 +765,7 @@ static int key_extract_mac_proto(struct sk_buff *skb)
 int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 			 struct sk_buff *skb, struct sw_flow_key *key)
 {
-	int res;
+	int res, err;
 
 	/* Extract metadata from packet. */
 	if (tun_info) {
@@ -792,7 +792,6 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 	key->phy.priority = skb->priority;
 	key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
 	key->phy.skb_mark = skb->mark;
-	ovs_ct_fill_key(skb, key);
 	key->ovs_flow_hash = 0;
 	res = key_extract_mac_proto(skb);
 	if (res < 0)
@@ -800,17 +799,26 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 	key->mac_proto = res;
 	key->recirc_id = 0;
 
-	return key_extract(skb, key);
+	err = key_extract(skb, key);
+	if (!err)
+		ovs_ct_fill_key(skb, key);   /* Must be after key_extract(). */
+	return err;
 }
 
 int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
 				   struct sk_buff *skb,
 				   struct sw_flow_key *key, bool log)
 {
+	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+	u64 attrs = 0;
 	int err;
 
+	err = parse_flow_nlattrs(attr, a, &attrs, log);
+	if (err)
+		return -EINVAL;
+
 	/* Extract metadata from netlink attributes. */
-	err = ovs_nla_get_flow_metadata(net, attr, key, log);
+	err = ovs_nla_get_flow_metadata(net, a, attrs, key, log);
 	if (err)
 		return err;
 
@@ -824,5 +832,21 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
 	 */
 
 	skb->protocol = key->eth.type;
-	return key_extract(skb, key);
+	err = key_extract(skb, key);
+	if (err)
+		return err;
+
+	/* Check that we have conntrack original direction tuple metadata only
+	 * for packets for which it makes sense.  Otherwise the key may be
+	 * corrupted due to overlapping key fields.
+	 */
+	if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) &&
+	    key->eth.type != htons(ETH_P_IP))
+		return -EINVAL;
+	if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) &&
+	    (key->eth.type != htons(ETH_P_IPV6) ||
+	     sw_flow_key_is_nd(key)))
+		return -EINVAL;
+
+	return 0;
 }
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index f61cae7f9030..a9bc1c875965 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -85,6 +85,11 @@ struct sw_flow_key {
 		struct vlan_head cvlan;
 		__be16 type;		/* Ethernet frame type. */
 	} eth;
+	/* Filling a hole of two bytes. */
+	u8 ct_state;
+	u8 ct_orig_proto;		/* CT original direction tuple IP
+					 * protocol.
+					 */
 	union {
 		struct {
 			__be32 top_lse;	/* top label stack entry */
@@ -96,6 +101,7 @@ struct sw_flow_key {
 			u8     frag;	/* One of OVS_FRAG_TYPE_*. */
 		} ip;
 	};
+	u16 ct_zone;			/* Conntrack zone. */
 	struct {
 		__be16 src;		/* TCP/UDP/SCTP source port. */
 		__be16 dst;		/* TCP/UDP/SCTP destination port. */
@@ -107,10 +113,16 @@ struct sw_flow_key {
 				__be32 src;	/* IP source address. */
 				__be32 dst;	/* IP destination address. */
 			} addr;
-			struct {
-				u8 sha[ETH_ALEN];	/* ARP source hardware address. */
-				u8 tha[ETH_ALEN];	/* ARP target hardware address. */
-			} arp;
+			union {
+				struct {
+					__be32 src;
+					__be32 dst;
+				} ct_orig;	/* Conntrack original direction fields. */
+				struct {
+					u8 sha[ETH_ALEN];	/* ARP source hardware address. */
+					u8 tha[ETH_ALEN];	/* ARP target hardware address. */
+				} arp;
+			};
 		} ipv4;
 		struct {
 			struct {
@@ -118,23 +130,40 @@ struct sw_flow_key {
 				struct in6_addr dst;	/* IPv6 destination address. */
 			} addr;
 			__be32 label;			/* IPv6 flow label. */
-			struct {
-				struct in6_addr target;	/* ND target address. */
-				u8 sll[ETH_ALEN];	/* ND source link layer address. */
-				u8 tll[ETH_ALEN];	/* ND target link layer address. */
-			} nd;
+			union {
+				struct {
+					struct in6_addr src;
+					struct in6_addr dst;
+				} ct_orig;	/* Conntrack original direction fields. */
+				struct {
+					struct in6_addr target;	/* ND target address. */
+					u8 sll[ETH_ALEN];	/* ND source link layer address. */
+					u8 tll[ETH_ALEN];	/* ND target link layer address. */
+				} nd;
+			};
 		} ipv6;
 	};
 	struct {
-		/* Connection tracking fields. */
-		u16 zone;
+		/* Connection tracking fields not packed above. */
+		struct {
+			__be16 src;	/* CT orig tuple tp src port. */
+			__be16 dst;	/* CT orig tuple tp dst port. */
+		} orig_tp;
 		u32 mark;
-		u8 state;
 		struct ovs_key_ct_labels labels;
 	} ct;
 
 } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
 
+static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key)
+{
+	return key->eth.type == htons(ETH_P_IPV6) &&
+		key->ip.proto == NEXTHDR_ICMP &&
+		key->tp.dst == 0 &&
+		(key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+		 key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT));
+}
+
 struct sw_flow_key_range {
 	unsigned short int start;
 	unsigned short int end;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index c87d359b9b37..6f5fa50f716d 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -129,7 +129,9 @@ static bool match_validate(const struct sw_flow_match *match,
 	/* The following mask attributes allowed only if they
 	 * pass the validation tests. */
 	mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+			| (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)
 			| (1 << OVS_KEY_ATTR_IPV6)
+			| (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)
 			| (1 << OVS_KEY_ATTR_TCP)
 			| (1 << OVS_KEY_ATTR_TCP_FLAGS)
 			| (1 << OVS_KEY_ATTR_UDP)
@@ -161,8 +163,10 @@ static bool match_validate(const struct sw_flow_match *match,
 
 	if (match->key->eth.type == htons(ETH_P_IP)) {
 		key_expected |= 1 << OVS_KEY_ATTR_IPV4;
-		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+		if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
 			mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+			mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4;
+		}
 
 		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
 			if (match->key->ip.proto == IPPROTO_UDP) {
@@ -196,8 +200,10 @@ static bool match_validate(const struct sw_flow_match *match,
 
 	if (match->key->eth.type == htons(ETH_P_IPV6)) {
 		key_expected |= 1 << OVS_KEY_ATTR_IPV6;
-		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+		if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
 			mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+			mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6;
+		}
 
 		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
 			if (match->key->ip.proto == IPPROTO_UDP) {
@@ -230,6 +236,12 @@ static bool match_validate(const struct sw_flow_match *match,
 						htons(NDISC_NEIGHBOUR_SOLICITATION) ||
 				    match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
 					key_expected |= 1 << OVS_KEY_ATTR_ND;
+					/* Original direction conntrack tuple
+					 * uses the same space as the ND fields
+					 * in the key, so both are not allowed
+					 * at the same time.
+					 */
+					mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
 					if (match->mask && (match->mask->key.tp.src == htons(0xff)))
 						mask_allowed |= 1 << OVS_KEY_ATTR_ND;
 				}
@@ -282,7 +294,7 @@ size_t ovs_key_attr_size(void)
 	/* Whenever adding new OVS_KEY_ FIELDS, we should consider
 	 * updating this function.
 	 */
-	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26);
+	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
 
 	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */
 		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */
@@ -295,6 +307,7 @@ size_t ovs_key_attr_size(void)
 		+ nla_total_size(2)   /* OVS_KEY_ATTR_CT_ZONE */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_MARK */
 		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABELS */
+		+ nla_total_size(40)  /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
 		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
 		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_VLAN */
@@ -355,6 +368,10 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_CT_ZONE]	 = { .len = sizeof(u16) },
 	[OVS_KEY_ATTR_CT_MARK]	 = { .len = sizeof(u32) },
 	[OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) },
+	[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = {
+		.len = sizeof(struct ovs_key_ct_tuple_ipv4) },
+	[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
+		.len = sizeof(struct ovs_key_ct_tuple_ipv6) },
 };
 
 static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -430,9 +447,8 @@ static int parse_flow_mask_nlattrs(const struct nlattr *attr,
 	return __parse_flow_nlattrs(attr, a, attrsp, log, true);
 }
 
-static int parse_flow_nlattrs(const struct nlattr *attr,
-			      const struct nlattr *a[], u64 *attrsp,
-			      bool log)
+int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
+		       u64 *attrsp, bool log)
 {
 	return __parse_flow_nlattrs(attr, a, attrsp, log, false);
 }
@@ -1056,14 +1072,14 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
 			return -EINVAL;
 		}
 
-		SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask);
+		SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask);
 		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE);
 	}
 	if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) &&
 	    ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) {
 		u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]);
 
-		SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask);
+		SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask);
 		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE);
 	}
 	if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) &&
@@ -1082,6 +1098,34 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
 				   sizeof(*cl), is_mask);
 		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS);
 	}
+	if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) {
+		const struct ovs_key_ct_tuple_ipv4 *ct;
+
+		ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]);
+
+		SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask);
+		SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask);
+		SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
+		SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
+		SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask);
+		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4);
+	}
+	if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) {
+		const struct ovs_key_ct_tuple_ipv6 *ct;
+
+		ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]);
+
+		SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src,
+				   sizeof(match->key->ipv6.ct_orig.src),
+				   is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst,
+				   sizeof(match->key->ipv6.ct_orig.dst),
+				   is_mask);
+		SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
+		SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
+		SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask);
+		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
+	}
 
 	/* For layer 3 packets the Ethernet type is provided
 	 * and treated as metadata but no MAC addresses are provided.
@@ -1493,9 +1537,12 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
 
 /**
  * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
- * @key: Receives extracted in_port, priority, tun_key and skb_mark.
- * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
+ * @net: Network namespace.
+ * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack
+ * metadata.
+ * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink
+ * attributes.
+ * @attrs: Bit mask for the netlink attributes included in @a.
  * @log: Boolean to allow kernel error logging.  Normally true, but when
  * probing for feature compatibility this should be passed in as false to
  * suppress unnecessary error logging.
@@ -1504,25 +1551,26 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
  * take the same form accepted by flow_from_nlattrs(), but only enough of it to
  * get the metadata, that is, the parts of the flow key that cannot be
  * extracted from the packet itself.
+ *
+ * This must be called before the packet key fields are filled in 'key'.
  */
 
-int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr,
-			      struct sw_flow_key *key,
-			      bool log)
+int ovs_nla_get_flow_metadata(struct net *net,
+			      const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
+			      u64 attrs, struct sw_flow_key *key, bool log)
 {
-	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
 	struct sw_flow_match match;
-	u64 attrs = 0;
-	int err;
-
-	err = parse_flow_nlattrs(attr, a, &attrs, log);
-	if (err)
-		return -EINVAL;
 
 	memset(&match, 0, sizeof(match));
 	match.key = key;
 
+	key->ct_state = 0;
+	key->ct_zone = 0;
+	key->ct_orig_proto = 0;
 	memset(&key->ct, 0, sizeof(key->ct));
+	memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig));
+	memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig));
+
 	key->phy.in_port = DP_MAX_PORTS;
 
 	return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
@@ -1584,7 +1632,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 	if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
 		goto nla_put_failure;
 
-	if (ovs_ct_put_key(output, skb))
+	if (ovs_ct_put_key(swkey, output, skb))
 		goto nla_put_failure;
 
 	if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) {
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 45f9769e5aac..929c665ac3aa 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -46,8 +46,11 @@ void ovs_match_init(struct sw_flow_match *match,
 
 int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
 		    int attr, bool is_mask, struct sk_buff *);
-int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *,
-			      struct sw_flow_key *, bool log);
+int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
+		       u64 *attrsp, bool log);
+int ovs_nla_get_flow_metadata(struct net *net,
+			      const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
+			      u64 attrs, struct sw_flow_key *key, bool log);
 
 int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb);
 int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb);
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index d5d6caecd072..89193a634da4 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -97,7 +97,7 @@ static void internal_dev_destructor(struct net_device *dev)
 	free_netdev(dev);
 }
 
-static struct rtnl_link_stats64 *
+static void
 internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	int i;
@@ -125,8 +125,6 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->tx_bytes         += local_stats.tx_bytes;
 		stats->tx_packets       += local_stats.tx_packets;
 	}
-
-	return stats;
 }
 
 static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
@@ -151,6 +149,8 @@ static void do_setup(struct net_device *netdev)
 {
 	ether_setup(netdev);
 
+	netdev->max_mtu = ETH_MAX_MTU;
+
 	netdev->netdev_ops = &internal_dev_netdev_ops;
 
 	netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3d555c79a7b5..2bd0d1949312 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -409,6 +409,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
 	case TPACKET_V3:
+		h.h3->tp_status = status;
+		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
+		break;
 	default:
 		WARN(1, "TPACKET version not supported.\n");
 		BUG();
@@ -432,6 +435,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		return h.h2->tp_status;
 	case TPACKET_V3:
+		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
+		return h.h3->tp_status;
 	default:
 		WARN(1, "TPACKET version not supported.\n");
 		BUG();
@@ -476,6 +481,9 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
 		h.h2->tp_nsec = ts.tv_nsec;
 		break;
 	case TPACKET_V3:
+		h.h3->tp_sec = ts.tv_sec;
+		h.h3->tp_nsec = ts.tv_nsec;
+		break;
 	default:
 		WARN(1, "TPACKET version not supported.\n");
 		BUG();
@@ -1497,6 +1505,8 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po)
 	f->arr[f->num_members] = sk;
 	smp_wmb();
 	f->num_members++;
+	if (f->num_members == 1)
+		dev_add_pack(&f->prot_hook);
 	spin_unlock(&f->lock);
 }
 
@@ -1513,6 +1523,8 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
 	BUG_ON(i >= f->num_members);
 	f->arr[i] = f->arr[f->num_members - 1];
 	f->num_members--;
+	if (f->num_members == 0)
+		__dev_remove_pack(&f->prot_hook);
 	spin_unlock(&f->lock);
 }
 
@@ -1619,6 +1631,7 @@ static void fanout_release_data(struct packet_fanout *f)
 
 static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 {
+	struct packet_rollover *rollover = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	struct packet_fanout *f, *match;
 	u8 type = type_flags & 0xff;
@@ -1641,23 +1654,28 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 		return -EINVAL;
 	}
 
+	mutex_lock(&fanout_mutex);
+
+	err = -EINVAL;
 	if (!po->running)
-		return -EINVAL;
+		goto out;
 
+	err = -EALREADY;
 	if (po->fanout)
-		return -EALREADY;
+		goto out;
 
 	if (type == PACKET_FANOUT_ROLLOVER ||
 	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
-		po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
-		if (!po->rollover)
-			return -ENOMEM;
-		atomic_long_set(&po->rollover->num, 0);
-		atomic_long_set(&po->rollover->num_huge, 0);
-		atomic_long_set(&po->rollover->num_failed, 0);
+		err = -ENOMEM;
+		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
+		if (!rollover)
+			goto out;
+		atomic_long_set(&rollover->num, 0);
+		atomic_long_set(&rollover->num_huge, 0);
+		atomic_long_set(&rollover->num_failed, 0);
+		po->rollover = rollover;
 	}
 
-	mutex_lock(&fanout_mutex);
 	match = NULL;
 	list_for_each_entry(f, &fanout_list, list) {
 		if (f->id == id &&
@@ -1687,7 +1705,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 		match->prot_hook.func = packet_rcv_fanout;
 		match->prot_hook.af_packet_priv = match;
 		match->prot_hook.id_match = match_fanout_group;
-		dev_add_pack(&match->prot_hook);
 		list_add(&match->list, &fanout_list);
 	}
 	err = -EINVAL;
@@ -1704,36 +1721,40 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 		}
 	}
 out:
-	mutex_unlock(&fanout_mutex);
-	if (err) {
-		kfree(po->rollover);
+	if (err && rollover) {
+		kfree(rollover);
 		po->rollover = NULL;
 	}
+	mutex_unlock(&fanout_mutex);
 	return err;
 }
 
-static void fanout_release(struct sock *sk)
+/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
+ * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
+ * It is the responsibility of the caller to call fanout_release_data() and
+ * free the returned packet_fanout (after synchronize_net())
+ */
+static struct packet_fanout *fanout_release(struct sock *sk)
 {
 	struct packet_sock *po = pkt_sk(sk);
 	struct packet_fanout *f;
 
+	mutex_lock(&fanout_mutex);
 	f = po->fanout;
-	if (!f)
-		return;
+	if (f) {
+		po->fanout = NULL;
 
-	mutex_lock(&fanout_mutex);
-	po->fanout = NULL;
+		if (atomic_dec_and_test(&f->sk_ref))
+			list_del(&f->list);
+		else
+			f = NULL;
 
-	if (atomic_dec_and_test(&f->sk_ref)) {
-		list_del(&f->list);
-		dev_remove_pack(&f->prot_hook);
-		fanout_release_data(f);
-		kfree(f);
+		if (po->rollover)
+			kfree_rcu(po->rollover, rcu);
 	}
 	mutex_unlock(&fanout_mutex);
 
-	if (po->rollover)
-		kfree_rcu(po->rollover, rcu);
+	return f;
 }
 
 static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
@@ -2497,6 +2518,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 	ph.raw = frame;
 
 	switch (po->tp_version) {
+	case TPACKET_V3:
+		if (ph.h3->tp_next_offset != 0) {
+			pr_warn_once("variable sized slot not supported");
+			return -EINVAL;
+		}
+		tp_len = ph.h3->tp_len;
+		break;
 	case TPACKET_V2:
 		tp_len = ph.h2->tp_len;
 		break;
@@ -2516,6 +2544,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 		off_max = po->tx_ring.frame_size - tp_len;
 		if (po->sk.sk_type == SOCK_DGRAM) {
 			switch (po->tp_version) {
+			case TPACKET_V3:
+				off = ph.h3->tp_net;
+				break;
 			case TPACKET_V2:
 				off = ph.h2->tp_net;
 				break;
@@ -2525,6 +2556,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 			}
 		} else {
 			switch (po->tp_version) {
+			case TPACKET_V3:
+				off = ph.h3->tp_mac;
+				break;
 			case TPACKET_V2:
 				off = ph.h2->tp_mac;
 				break;
@@ -2755,7 +2789,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	struct virtio_net_hdr vnet_hdr = { 0 };
 	int offset = 0;
 	struct packet_sock *po = pkt_sk(sk);
-	int hlen, tlen;
+	int hlen, tlen, linear;
 	int extra_len = 0;
 
 	/*
@@ -2816,8 +2850,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	err = -ENOBUFS;
 	hlen = LL_RESERVED_SPACE(dev);
 	tlen = dev->needed_tailroom;
-	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
-			       __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len),
+	linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
+	linear = max(linear, min_t(int, len, dev->hard_header_len));
+	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
 			       msg->msg_flags & MSG_DONTWAIT, &err);
 	if (skb == NULL)
 		goto out_unlock;
@@ -2906,6 +2941,7 @@ static int packet_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
 	struct packet_sock *po;
+	struct packet_fanout *f;
 	struct net *net;
 	union tpacket_req_u req_u;
 
@@ -2945,9 +2981,14 @@ static int packet_release(struct socket *sock)
 		packet_set_ring(sk, &req_u, 1, 1);
 	}
 
-	fanout_release(sk);
+	f = fanout_release(sk);
 
 	synchronize_net();
+
+	if (f) {
+		fanout_release_data(f);
+		kfree(f);
+	}
 	/*
 	 *	Now the socket is dead. No more input will appear.
 	 */
@@ -3899,7 +3940,6 @@ static int packet_notifier(struct notifier_block *this,
 				}
 				if (msg == NETDEV_UNREGISTER) {
 					packet_cached_dev_reset(po);
-					fanout_release(sk);
 					po->ifindex = -1;
 					if (po->prot_hook.dev)
 						dev_put(po->prot_hook.dev);
@@ -4113,11 +4153,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	struct tpacket_req *req = &req_u->req;
 
 	lock_sock(sk);
-	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
-	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
-		net_warn_ratelimited("Tx-ring is not supported.\n");
-		goto out;
-	}
 
 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -4177,11 +4212,19 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 			goto out;
 		switch (po->tp_version) {
 		case TPACKET_V3:
-		/* Transmit path is not supported. We checked
-		 * it above but just being paranoid
-		 */
-			if (!tx_ring)
+			/* Block transmit is not supported yet */
+			if (!tx_ring) {
 				init_prb_bdqc(po, rb, pg_vec, req_u);
+			} else {
+				struct tpacket_req3 *req3 = &req_u->req3;
+
+				if (req3->tp_retire_blk_tov ||
+				    req3->tp_sizeof_priv ||
+				    req3->tp_feature_req_word) {
+					err = -EINVAL;
+					goto out;
+				}
+			}
 			break;
 		default:
 			break;
diff --git a/net/packet/diag.c b/net/packet/diag.c
index 0ed68f0238bf..7ef1c881ae74 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -73,8 +73,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
 {
 	struct packet_diag_ring pdr;
 
-	if (!ring->pg_vec || ((ver > TPACKET_V2) &&
-				(nl_type == PACKET_DIAG_TX_RING)))
+	if (!ring->pg_vec)
 		return 0;
 
 	pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
diff --git a/net/psample/Kconfig b/net/psample/Kconfig
new file mode 100644
index 000000000000..d850246a6059
--- /dev/null
+++ b/net/psample/Kconfig
@@ -0,0 +1,15 @@
+#
+# psample packet sampling configuration
+#
+
+menuconfig PSAMPLE
+	depends on NET
+	tristate "Packet-sampling netlink channel"
+	default n
+	help
+	  Say Y here to add support for packet-sampling netlink channel
+	  This netlink channel allows transferring packets alongside some
+	  metadata to userspace.
+
+	  To compile this support as a module, choose M here: the module will
+	  be called psample.
diff --git a/net/psample/Makefile b/net/psample/Makefile
new file mode 100644
index 000000000000..609b0a79c9f3
--- /dev/null
+++ b/net/psample/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the psample netlink channel
+#
+
+obj-$(CONFIG_PSAMPLE) += psample.o
diff --git a/net/psample/psample.c b/net/psample/psample.c
new file mode 100644
index 000000000000..8aa58a918783
--- /dev/null
+++ b/net/psample/psample.c
@@ -0,0 +1,301 @@
+/*
+ * net/psample/psample.c - Netlink channel for packet sampling
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/psample.h>
+#include <linux/spinlock.h>
+
+#define PSAMPLE_MAX_PACKET_SIZE 0xffff
+
+static LIST_HEAD(psample_groups_list);
+static DEFINE_SPINLOCK(psample_groups_lock);
+
+/* multicast groups */
+enum psample_nl_multicast_groups {
+	PSAMPLE_NL_MCGRP_CONFIG,
+	PSAMPLE_NL_MCGRP_SAMPLE,
+};
+
+static const struct genl_multicast_group psample_nl_mcgrps[] = {
+	[PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME },
+	[PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME },
+};
+
+static struct genl_family psample_nl_family __ro_after_init;
+
+static int psample_group_nl_fill(struct sk_buff *msg,
+				 struct psample_group *group,
+				 enum psample_command cmd, u32 portid, u32 seq,
+				 int flags)
+{
+	void *hdr;
+	int ret;
+
+	hdr = genlmsg_put(msg, portid, seq, &psample_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	ret = nla_put_u32(msg, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num);
+	if (ret < 0)
+		goto error;
+
+	ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_REFCOUNT, group->refcount);
+	if (ret < 0)
+		goto error;
+
+	ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_SEQ, group->seq);
+	if (ret < 0)
+		goto error;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+error:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg,
+					   struct netlink_callback *cb)
+{
+	struct psample_group *group;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	spin_lock(&psample_groups_lock);
+	list_for_each_entry(group, &psample_groups_list, list) {
+		if (!net_eq(group->net, sock_net(msg->sk)))
+			continue;
+		if (idx < start) {
+			idx++;
+			continue;
+		}
+		err = psample_group_nl_fill(msg, group, PSAMPLE_CMD_NEW_GROUP,
+					    NETLINK_CB(cb->skb).portid,
+					    cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		if (err)
+			break;
+		idx++;
+	}
+
+	spin_unlock(&psample_groups_lock);
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static const struct genl_ops psample_nl_ops[] = {
+	{
+		.cmd = PSAMPLE_CMD_GET_GROUP,
+		.dumpit = psample_nl_cmd_get_group_dumpit,
+		/* can be retrieved by unprivileged users */
+	}
+};
+
+static struct genl_family psample_nl_family __ro_after_init = {
+	.name		= PSAMPLE_GENL_NAME,
+	.version	= PSAMPLE_GENL_VERSION,
+	.maxattr	= PSAMPLE_ATTR_MAX,
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.mcgrps		= psample_nl_mcgrps,
+	.ops		= psample_nl_ops,
+	.n_ops		= ARRAY_SIZE(psample_nl_ops),
+	.n_mcgrps	= ARRAY_SIZE(psample_nl_mcgrps),
+};
+
+static void psample_group_notify(struct psample_group *group,
+				 enum psample_command cmd)
+{
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!msg)
+		return;
+
+	err = psample_group_nl_fill(msg, group, cmd, 0, 0, NLM_F_MULTI);
+	if (!err)
+		genlmsg_multicast_netns(&psample_nl_family, group->net, msg, 0,
+					PSAMPLE_NL_MCGRP_CONFIG, GFP_ATOMIC);
+	else
+		nlmsg_free(msg);
+}
+
+static struct psample_group *psample_group_create(struct net *net,
+						  u32 group_num)
+{
+	struct psample_group *group;
+
+	group = kzalloc(sizeof(*group), GFP_ATOMIC);
+	if (!group)
+		return NULL;
+
+	group->net = net;
+	group->group_num = group_num;
+	list_add_tail(&group->list, &psample_groups_list);
+
+	psample_group_notify(group, PSAMPLE_CMD_NEW_GROUP);
+	return group;
+}
+
+static void psample_group_destroy(struct psample_group *group)
+{
+	psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP);
+	list_del(&group->list);
+	kfree(group);
+}
+
+static struct psample_group *
+psample_group_lookup(struct net *net, u32 group_num)
+{
+	struct psample_group *group;
+
+	list_for_each_entry(group, &psample_groups_list, list)
+		if ((group->group_num == group_num) && (group->net == net))
+			return group;
+	return NULL;
+}
+
+struct psample_group *psample_group_get(struct net *net, u32 group_num)
+{
+	struct psample_group *group;
+
+	spin_lock(&psample_groups_lock);
+
+	group = psample_group_lookup(net, group_num);
+	if (!group) {
+		group = psample_group_create(net, group_num);
+		if (!group)
+			goto out;
+	}
+	group->refcount++;
+
+out:
+	spin_unlock(&psample_groups_lock);
+	return group;
+}
+EXPORT_SYMBOL_GPL(psample_group_get);
+
+void psample_group_put(struct psample_group *group)
+{
+	spin_lock(&psample_groups_lock);
+
+	if (--group->refcount == 0)
+		psample_group_destroy(group);
+
+	spin_unlock(&psample_groups_lock);
+}
+EXPORT_SYMBOL_GPL(psample_group_put);
+
+void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
+			   u32 trunc_size, int in_ifindex, int out_ifindex,
+			   u32 sample_rate)
+{
+	struct sk_buff *nl_skb;
+	int data_len;
+	int meta_len;
+	void *data;
+	int ret;
+
+	meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) +
+		   (out_ifindex ? nla_total_size(sizeof(u16)) : 0) +
+		   nla_total_size(sizeof(u32)) +	/* sample_rate */
+		   nla_total_size(sizeof(u32)) +	/* orig_size */
+		   nla_total_size(sizeof(u32)) +	/* group_num */
+		   nla_total_size(sizeof(u32));		/* seq */
+
+	data_len = min(skb->len, trunc_size);
+	if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE)
+		data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN
+			    - NLA_ALIGNTO;
+
+	nl_skb = genlmsg_new(meta_len + data_len, GFP_ATOMIC);
+	if (unlikely(!nl_skb))
+		return;
+
+	data = genlmsg_put(nl_skb, 0, 0, &psample_nl_family, 0,
+			   PSAMPLE_CMD_SAMPLE);
+	if (unlikely(!data))
+		goto error;
+
+	if (in_ifindex) {
+		ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_IIFINDEX, in_ifindex);
+		if (unlikely(ret < 0))
+			goto error;
+	}
+
+	if (out_ifindex) {
+		ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OIFINDEX, out_ifindex);
+		if (unlikely(ret < 0))
+			goto error;
+	}
+
+	ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_RATE, sample_rate);
+	if (unlikely(ret < 0))
+		goto error;
+
+	ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_ORIGSIZE, skb->len);
+	if (unlikely(ret < 0))
+		goto error;
+
+	ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num);
+	if (unlikely(ret < 0))
+		goto error;
+
+	ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_GROUP_SEQ, group->seq++);
+	if (unlikely(ret < 0))
+		goto error;
+
+	if (data_len) {
+		int nla_len = nla_total_size(data_len);
+		struct nlattr *nla;
+
+		nla = (struct nlattr *)skb_put(nl_skb, nla_len);
+		nla->nla_type = PSAMPLE_ATTR_DATA;
+		nla->nla_len = nla_attr_size(data_len);
+
+		if (skb_copy_bits(skb, 0, nla_data(nla), data_len))
+			goto error;
+	}
+
+	genlmsg_end(nl_skb, data);
+	genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0,
+				PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC);
+
+	return;
+error:
+	pr_err_ratelimited("Could not create psample log message\n");
+	nlmsg_free(nl_skb);
+}
+EXPORT_SYMBOL_GPL(psample_sample_packet);
+
+static int __init psample_module_init(void)
+{
+	return genl_register_family(&psample_nl_family);
+}
+
+static void __exit psample_module_exit(void)
+{
+	genl_unregister_family(&psample_nl_family);
+}
+
+module_init(psample_module_init);
+module_exit(psample_module_exit);
+
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("netlink channel for packet sampling");
+MODULE_LICENSE("GPL v2");
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 2ac1e6194be3..b405f77d664c 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -298,6 +298,33 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
 	return 0;
 }
 
+static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
+				  int optlen)
+{
+	struct rds_rx_trace_so trace;
+	int i;
+
+	if (optlen != sizeof(struct rds_rx_trace_so))
+		return -EFAULT;
+
+	if (copy_from_user(&trace, optval, sizeof(trace)))
+		return -EFAULT;
+
+	if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
+		return -EFAULT;
+
+	rs->rs_rx_traces = trace.rx_traces;
+	for (i = 0; i < rs->rs_rx_traces; i++) {
+		if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
+			rs->rs_rx_traces = 0;
+			return -EFAULT;
+		}
+		rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
+	}
+
+	return 0;
+}
+
 static int rds_setsockopt(struct socket *sock, int level, int optname,
 			  char __user *optval, unsigned int optlen)
 {
@@ -338,6 +365,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
 		ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
 		release_sock(sock->sk);
 		break;
+	case SO_RDS_MSG_RXPATH_LATENCY:
+		ret = rds_recv_track_latency(rs, optval, optlen);
+		break;
 	default:
 		ret = -ENOPROTOOPT;
 	}
@@ -484,6 +514,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 	INIT_LIST_HEAD(&rs->rs_cong_list);
 	spin_lock_init(&rs->rs_rdma_lock);
 	rs->rs_rdma_keys = RB_ROOT;
+	rs->rs_rx_traces = 0;
 
 	spin_lock_bh(&rds_sock_lock);
 	list_add_tail(&rs->rs_item, &rds_sock_list);
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 095f6ce583fe..3a915bedb76c 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -176,8 +176,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (!trans) {
 		ret = -EADDRNOTAVAIL;
 		rds_remove_bound(rs);
-		printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, "
-				"load rds_tcp or rds_rdma?\n");
+		pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n",
+				    __func__, &sin->sin_addr.s_addr);
 		goto out;
 	}
 
diff --git a/net/rds/connection.c b/net/rds/connection.c
index fe9d31c0b22d..0e04dcceb1d4 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -545,11 +545,11 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 }
 EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
 
-void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
-			     struct rds_info_iterator *iter,
-			     struct rds_info_lengths *lens,
-			     int (*visitor)(struct rds_conn_path *, void *),
-			     size_t item_len)
+static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
+				    struct rds_info_iterator *iter,
+				    struct rds_info_lengths *lens,
+				    int (*visitor)(struct rds_conn_path *, void *),
+				    size_t item_len)
 {
 	u64  buffer[(item_len + 7) / 8];
 	struct hlist_head *head;
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 5680d90b0b77..91fe46f1e4cc 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -111,6 +111,8 @@ static void rds_ib_dev_free(struct work_struct *work)
 		kfree(i_ipaddr);
 	}
 
+	kfree(rds_ibdev->vector_load);
+
 	kfree(rds_ibdev);
 }
 
@@ -159,6 +161,14 @@ static void rds_ib_add_one(struct ib_device *device)
 	rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
 	rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
 
+	rds_ibdev->vector_load = kzalloc(sizeof(int) * device->num_comp_vectors,
+					 GFP_KERNEL);
+	if (!rds_ibdev->vector_load) {
+		pr_err("RDS/IB: %s failed to allocate vector memory\n",
+			__func__);
+		goto put_dev;
+	}
+
 	rds_ibdev->dev = device;
 	rds_ibdev->pd = ib_alloc_pd(device, 0);
 	if (IS_ERR(rds_ibdev->pd)) {
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 45ac8e8e58f4..ec550626e221 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -14,9 +14,10 @@
 
 #define RDS_IB_DEFAULT_RECV_WR		1024
 #define RDS_IB_DEFAULT_SEND_WR		256
-#define RDS_IB_DEFAULT_FR_WR		512
+#define RDS_IB_DEFAULT_FR_WR		256
+#define RDS_IB_DEFAULT_FR_INV_WR	256
 
-#define RDS_IB_DEFAULT_RETRY_COUNT	2
+#define RDS_IB_DEFAULT_RETRY_COUNT	1
 
 #define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */
 
@@ -125,6 +126,7 @@ struct rds_ib_connection {
 
 	/* To control the number of wrs from fastreg */
 	atomic_t		i_fastreg_wrs;
+	atomic_t		i_fastunreg_wrs;
 
 	/* interrupt handling */
 	struct tasklet_struct	i_send_tasklet;
@@ -134,7 +136,7 @@ struct rds_ib_connection {
 	struct rds_ib_work_ring	i_send_ring;
 	struct rm_data_op	*i_data_op;
 	struct rds_header	*i_send_hdrs;
-	u64			i_send_hdrs_dma;
+	dma_addr_t		i_send_hdrs_dma;
 	struct rds_ib_send_work *i_sends;
 	atomic_t		i_signaled_sends;
 
@@ -144,11 +146,12 @@ struct rds_ib_connection {
 	struct rds_ib_incoming	*i_ibinc;
 	u32			i_recv_data_rem;
 	struct rds_header	*i_recv_hdrs;
-	u64			i_recv_hdrs_dma;
+	dma_addr_t		i_recv_hdrs_dma;
 	struct rds_ib_recv_work *i_recvs;
 	u64			i_ack_recv;	/* last ACK received */
 	struct rds_ib_refill_cache i_cache_incs;
 	struct rds_ib_refill_cache i_cache_frags;
+	atomic_t		i_cache_allocs;
 
 	/* sending acks */
 	unsigned long		i_ack_flags;
@@ -161,7 +164,7 @@ struct rds_ib_connection {
 	struct rds_header	*i_ack;
 	struct ib_send_wr	i_ack_wr;
 	struct ib_sge		i_ack_sge;
-	u64			i_ack_dma;
+	dma_addr_t		i_ack_dma;
 	unsigned long		i_ack_queued;
 
 	/* Flow control related information
@@ -179,6 +182,14 @@ struct rds_ib_connection {
 
 	/* Batched completions */
 	unsigned int		i_unsignaled_wrs;
+
+	/* Endpoint role in connection */
+	bool			i_active_side;
+	atomic_t		i_cq_quiesce;
+
+	/* Send/Recv vectors */
+	int			i_scq_vector;
+	int			i_rcq_vector;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
@@ -221,9 +232,10 @@ struct rds_ib_device {
 	spinlock_t		spinlock;	/* protect the above */
 	atomic_t		refcount;
 	struct work_struct	free_work;
+	int			*vector_load;
 };
 
-#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device)
+#define ibdev_to_node(ibdev) dev_to_node((ibdev)->dev.parent)
 #define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
 
 /* bits for i_ack_flags */
@@ -249,6 +261,8 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_rx_refill_from_cq;
 	uint64_t	s_ib_rx_refill_from_thread;
 	uint64_t	s_ib_rx_alloc_limit;
+	uint64_t	s_ib_rx_total_frags;
+	uint64_t	s_ib_rx_total_incs;
 	uint64_t	s_ib_rx_credit_updates;
 	uint64_t	s_ib_ack_sent;
 	uint64_t	s_ib_ack_send_failure;
@@ -271,6 +285,8 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_rdma_mr_1m_reused;
 	uint64_t	s_ib_atomic_cswp;
 	uint64_t	s_ib_atomic_fadd;
+	uint64_t	s_ib_recv_added_to_cache;
+	uint64_t	s_ib_recv_removed_from_cache;
 };
 
 extern struct workqueue_struct *rds_ib_wq;
@@ -401,6 +417,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
 #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
+#define rds_ib_stats_add(member, count) \
+		rds_stats_add_which(rds_ib_stats, member, count)
 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
 				    unsigned int avail);
 
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 5b2ab95afa07..ce3775abc6e7 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -113,24 +113,26 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	}
 
 	if (conn->c_version < RDS_PROTOCOL(3, 1)) {
-		printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
-		       " no longer supported\n",
-		       &conn->c_faddr,
-		       RDS_PROTOCOL_MAJOR(conn->c_version),
-		       RDS_PROTOCOL_MINOR(conn->c_version));
+		pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n",
+			  &conn->c_laddr, &conn->c_faddr,
+			  RDS_PROTOCOL_MAJOR(conn->c_version),
+			  RDS_PROTOCOL_MINOR(conn->c_version));
 		rds_conn_destroy(conn);
 		return;
 	} else {
-		printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
-		       &conn->c_faddr,
-		       RDS_PROTOCOL_MAJOR(conn->c_version),
-		       RDS_PROTOCOL_MINOR(conn->c_version),
-		       ic->i_flowctl ? ", flow control" : "");
+		pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
+			  ic->i_active_side ? "Active" : "Passive",
+			  &conn->c_laddr, &conn->c_faddr,
+			  RDS_PROTOCOL_MAJOR(conn->c_version),
+			  RDS_PROTOCOL_MINOR(conn->c_version),
+			  ic->i_flowctl ? ", flow control" : "");
 	}
 
-	/*
-	 * Init rings and fill recv. this needs to wait until protocol negotiation
-	 * is complete, since ring layout is different from 3.0 to 3.1.
+	atomic_set(&ic->i_cq_quiesce, 0);
+
+	/* Init rings and fill recv. this needs to wait until protocol
+	 * negotiation is complete, since ring layout is different
+	 * from 3.1 to 4.1.
 	 */
 	rds_ib_send_init_ring(ic);
 	rds_ib_recv_init_ring(ic);
@@ -267,6 +269,10 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
 
 	rds_ib_stats_inc(s_ib_tasklet_call);
 
+	/* if cq has been already reaped, ignore incoming cq event */
+	if (atomic_read(&ic->i_cq_quiesce))
+		return;
+
 	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
 	ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
 	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
@@ -308,6 +314,10 @@ static void rds_ib_tasklet_fn_recv(unsigned long data)
 
 	rds_ib_stats_inc(s_ib_tasklet_call);
 
+	/* if cq has been already reaped, ignore incoming cq event */
+	if (atomic_read(&ic->i_cq_quiesce))
+		return;
+
 	memset(&state, 0, sizeof(state));
 	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
 	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
@@ -358,6 +368,28 @@ static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
 	tasklet_schedule(&ic->i_send_tasklet);
 }
 
+static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
+{
+	int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1];
+	int index = rds_ibdev->dev->num_comp_vectors - 1;
+	int i;
+
+	for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) {
+		if (rds_ibdev->vector_load[i] < min) {
+			index = i;
+			min = rds_ibdev->vector_load[i];
+		}
+	}
+
+	rds_ibdev->vector_load[index]++;
+	return index;
+}
+
+static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
+{
+	rds_ibdev->vector_load[index]--;
+}
+
 /*
  * This needs to be very careful to not leave IS_ERR pointers around for
  * cleanup to trip over.
@@ -383,7 +415,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	 * completion queue and send queue. This extra space is used for FRMR
 	 * registration and invalidation work requests
 	 */
-	fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+	fr_queue_space = rds_ibdev->use_fastreg ?
+			 (RDS_IB_DEFAULT_FR_WR + 1) +
+			 (RDS_IB_DEFAULT_FR_INV_WR + 1)
+			 : 0;
 
 	/* add the conn now so that connection establishment has the dev */
 	rds_ib_add_conn(rds_ibdev, conn);
@@ -396,25 +431,30 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	/* Protection domain and memory range */
 	ic->i_pd = rds_ibdev->pd;
 
+	ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
 	cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
-
+	cq_attr.comp_vector = ic->i_scq_vector;
 	ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
 				     rds_ib_cq_event_handler, conn,
 				     &cq_attr);
 	if (IS_ERR(ic->i_send_cq)) {
 		ret = PTR_ERR(ic->i_send_cq);
 		ic->i_send_cq = NULL;
+		ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
 		rdsdebug("ib_create_cq send failed: %d\n", ret);
 		goto out;
 	}
 
+	ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
 	cq_attr.cqe = ic->i_recv_ring.w_nr;
+	cq_attr.comp_vector = ic->i_rcq_vector;
 	ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
 				     rds_ib_cq_event_handler, conn,
 				     &cq_attr);
 	if (IS_ERR(ic->i_recv_cq)) {
 		ret = PTR_ERR(ic->i_recv_cq);
 		ic->i_recv_cq = NULL;
+		ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
 		rdsdebug("ib_create_cq recv failed: %d\n", ret);
 		goto out;
 	}
@@ -445,6 +485,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	attr.send_cq = ic->i_send_cq;
 	attr.recv_cq = ic->i_recv_cq;
 	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
+	atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR);
 
 	/*
 	 * XXX this can fail if max_*_wr is too large?  Are we supposed
@@ -682,6 +723,7 @@ out:
 		if (ic->i_cm_id == cm_id)
 			ret = 0;
 	}
+	ic->i_active_side = true;
 	return ret;
 }
 
@@ -767,17 +809,27 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 		wait_event(rds_ib_ring_empty_wait,
 			   rds_ib_ring_empty(&ic->i_recv_ring) &&
 			   (atomic_read(&ic->i_signaled_sends) == 0) &&
-			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
+			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) &&
+			   (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR));
 		tasklet_kill(&ic->i_send_tasklet);
 		tasklet_kill(&ic->i_recv_tasklet);
 
+		atomic_set(&ic->i_cq_quiesce, 1);
+
 		/* first destroy the ib state that generates callbacks */
 		if (ic->i_cm_id->qp)
 			rdma_destroy_qp(ic->i_cm_id);
-		if (ic->i_send_cq)
+		if (ic->i_send_cq) {
+			if (ic->rds_ibdev)
+				ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
 			ib_destroy_cq(ic->i_send_cq);
-		if (ic->i_recv_cq)
+		}
+
+		if (ic->i_recv_cq) {
+			if (ic->rds_ibdev)
+				ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
 			ib_destroy_cq(ic->i_recv_cq);
+		}
 
 		/* then free the resources that ib callbacks use */
 		if (ic->i_send_hdrs)
@@ -855,6 +907,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 	ic->i_sends = NULL;
 	vfree(ic->i_recvs);
 	ic->i_recvs = NULL;
+	ic->i_active_side = false;
 }
 
 int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index d921adc62765..48332a6ed738 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -104,14 +104,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
 	struct ib_send_wr *failed_wr;
 	struct ib_reg_wr reg_wr;
-	int ret;
+	int ret, off = 0;
 
 	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
 		atomic_inc(&ibmr->ic->i_fastreg_wrs);
 		cpu_relax();
 	}
 
-	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE);
+	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len,
+				&off, PAGE_SIZE);
 	if (unlikely(ret != ibmr->sg_len))
 		return ret < 0 ? ret : -EINVAL;
 
@@ -240,8 +241,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 	if (frmr->fr_state != FRMR_IS_INUSE)
 		goto out;
 
-	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
-		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+	while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) {
+		atomic_inc(&ibmr->ic->i_fastunreg_wrs);
 		cpu_relax();
 	}
 
@@ -260,7 +261,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 	if (unlikely(ret)) {
 		frmr->fr_state = FRMR_IS_STALE;
 		frmr->fr_inv = false;
-		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+		atomic_inc(&ibmr->ic->i_fastunreg_wrs);
 		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
 		goto out;
 	}
@@ -288,9 +289,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	if (frmr->fr_inv) {
 		frmr->fr_state = FRMR_IS_FREE;
 		frmr->fr_inv = false;
+		atomic_inc(&ic->i_fastreg_wrs);
+	} else {
+		atomic_inc(&ic->i_fastunreg_wrs);
 	}
-
-	atomic_inc(&ic->i_fastreg_wrs);
 }
 
 void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 1c754f4acbe5..24c086db4511 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -45,7 +45,6 @@
 
 struct rds_ib_fmr {
 	struct ib_fmr		*fmr;
-	u64			*dma;
 };
 
 enum rds_ib_fr_state {
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 606a11f681d2..e10624aa6959 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -194,6 +194,8 @@ static void rds_ib_frag_free(struct rds_ib_connection *ic,
 	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
 
 	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+	atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
+	rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
 }
 
 /* Recycle inc after freeing attached frags */
@@ -261,6 +263,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
 			atomic_dec(&rds_ib_allocation);
 			return NULL;
 		}
+		rds_ib_stats_inc(s_ib_rx_total_incs);
 	}
 	INIT_LIST_HEAD(&ibinc->ii_frags);
 	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
@@ -278,6 +281,8 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
 	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
 	if (cache_item) {
 		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+		atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
+		rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
 	} else {
 		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
 		if (!frag)
@@ -290,6 +295,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
 			kmem_cache_free(rds_ib_frag_slab, frag);
 			return NULL;
 		}
+		rds_ib_stats_inc(s_ib_rx_total_frags);
 	}
 
 	INIT_LIST_HEAD(&frag->f_item);
@@ -905,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 		ic->i_ibinc = ibinc;
 
 		hdr = &ibinc->ii_inc.i_hdr;
+		ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+				local_clock();
 		memcpy(hdr, ihdr, sizeof(*hdr));
 		ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+		ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+				local_clock();
 
 		rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
 			 ic->i_recv_data_rem, hdr->h_flags);
@@ -980,8 +990,8 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
 	} else {
 		/* We expect errors as the qp is drained during shutdown */
 		if (rds_conn_up(conn) || rds_conn_connecting(conn))
-			rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
-					  &conn->c_faddr,
+			rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+					  &conn->c_laddr, &conn->c_faddr,
 					  wc->status,
 					  ib_wc_status_msg(wc->status));
 	}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 84d90c97332f..6ab39dbcca01 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -69,16 +69,6 @@ static void rds_ib_send_complete(struct rds_message *rm,
 	complete(rm, notify_status);
 }
 
-static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
-				   struct rm_data_op *op,
-				   int wc_status)
-{
-	if (op->op_nents)
-		ib_dma_unmap_sg(ic->i_cm_id->device,
-				op->op_sg, op->op_nents,
-				DMA_TO_DEVICE);
-}
-
 static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
 				   struct rm_rdma_op *op,
 				   int wc_status)
@@ -139,6 +129,21 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
 		rds_ib_stats_inc(s_ib_atomic_fadd);
 }
 
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+				   struct rm_data_op *op,
+				   int wc_status)
+{
+	struct rds_message *rm = container_of(op, struct rds_message, data);
+
+	if (op->op_nents)
+		ib_dma_unmap_sg(ic->i_cm_id->device,
+				op->op_sg, op->op_nents,
+				DMA_TO_DEVICE);
+
+	if (rm->rdma.op_active && rm->data.op_notify)
+		rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status);
+}
+
 /*
  * Unmap the resources associated with a struct send_work.
  *
@@ -300,8 +305,8 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 
 	/* We expect errors as the qp is drained during shutdown */
 	if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
-		rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
-				  &conn->c_faddr, wc->status,
+		rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+				  &conn->c_laddr, &conn->c_faddr, wc->status,
 				  ib_wc_status_msg(wc->status));
 	}
 }
@@ -765,7 +770,6 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 
 	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
 	if (work_alloc != 1) {
-		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
 		rds_ib_stats_inc(s_ib_tx_ring_full);
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 7e78dca1f252..9252ad126335 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -55,6 +55,8 @@ static const char *const rds_ib_stat_names[] = {
 	"ib_rx_refill_from_cq",
 	"ib_rx_refill_from_thread",
 	"ib_rx_alloc_limit",
+	"ib_rx_total_frags",
+	"ib_rx_total_incs",
 	"ib_rx_credit_updates",
 	"ib_ack_sent",
 	"ib_ack_send_failure",
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index ea961144084f..f06fac4886b0 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -40,7 +40,6 @@
 /*
  * XXX
  *  - build with sparse
- *  - should we limit the size of a mr region?  let transport return failure?
  *  - should we detect duplicate keys on a socket?  hmm.
  *  - an rdma is an mlock, apply rlimit?
  */
@@ -200,6 +199,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 		goto out;
 	}
 
+	/* Restrict the size of mr irrespective of underlying transport
+	 * To account for unaligned mr regions, subtract one from nr_pages
+	 */
+	if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
 	rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
 		args->vec.addr, args->vec.bytes, nr_pages);
 
@@ -415,7 +422,8 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
 	if (!mr) {
-		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+		pr_debug("rds: trying to unuse MR with unknown r_key %u!\n",
+			 r_key);
 		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 		return;
 	}
@@ -626,6 +634,16 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		}
 		op->op_notifier->n_user_token = args->user_token;
 		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
+
+		/* Enable rmda notification on data operation for composite
+		 * rds messages and make sure notification is enabled only
+		 * for the data operation which follows it so that application
+		 * gets notified only after full message gets delivered.
+		 */
+		if (rm->data.op_sg) {
+			rm->rdma.op_notify = 0;
+			rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+		}
 	}
 
 	/* The cookie contains the R_Key of the remote memory region, and
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index d5f311767157..fc59821f0a27 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -206,18 +206,13 @@ static int rds_rdma_init(void)
 {
 	int ret;
 
-	ret = rds_rdma_listen_init();
+	ret = rds_ib_init();
 	if (ret)
 		goto out;
 
-	ret = rds_ib_init();
+	ret = rds_rdma_listen_init();
 	if (ret)
-		goto err_ib_init;
-
-	goto out;
-
-err_ib_init:
-	rds_rdma_listen_stop();
+		rds_ib_exit();
 out:
 	return ret;
 }
diff --git a/net/rds/rds.h b/net/rds/rds.h
index ebbf909b87ec..07fff73dd4f3 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,6 +50,9 @@ void rdsdebug(char *fmt, ...)
 #define RDS_FRAG_SHIFT	12
 #define RDS_FRAG_SIZE	((unsigned int)(1 << RDS_FRAG_SHIFT))
 
+/* Used to limit both RDMA and non-RDMA RDS message to 1MB */
+#define RDS_MAX_MSG_SIZE	((unsigned int)(1 << 20))
+
 #define RDS_CONG_MAP_BYTES	(65536 / 8)
 #define RDS_CONG_MAP_PAGES	(PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
 #define RDS_CONG_MAP_PAGE_BITS	(PAGE_SIZE * 8)
@@ -250,6 +253,11 @@ struct rds_ext_header_rdma_dest {
 #define RDS_EXTHDR_GEN_NUM	6
 
 #define __RDS_EXTHDR_MAX	16 /* for now */
+#define RDS_RX_MAX_TRACES	(RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
+#define	RDS_MSG_RX_HDR		0
+#define	RDS_MSG_RX_START	1
+#define	RDS_MSG_RX_END		2
+#define	RDS_MSG_RX_CMSG		3
 
 struct rds_incoming {
 	atomic_t		i_refcount;
@@ -262,6 +270,7 @@ struct rds_incoming {
 
 	rds_rdma_cookie_t	i_rdma_cookie;
 	struct timeval		i_rx_tstamp;
+	u64			i_rx_lat_trace[RDS_RX_MAX_TRACES];
 };
 
 struct rds_mr {
@@ -419,6 +428,7 @@ struct rds_message {
 		} rdma;
 		struct rm_data_op {
 			unsigned int		op_active:1;
+			unsigned int		op_notify:1;
 			unsigned int		op_nents;
 			unsigned int		op_count;
 			unsigned int		op_dmasg;
@@ -571,6 +581,10 @@ struct rds_sock {
 	unsigned char		rs_recverr,
 				rs_cong_monitor;
 	u32			rs_hash_initval;
+
+	/* Socket receive path trace points*/
+	u8			rs_rx_traces;
+	u8			rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
 };
 
 static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
@@ -630,6 +644,9 @@ struct rds_statistics {
 	uint64_t	s_cong_update_received;
 	uint64_t	s_cong_send_error;
 	uint64_t	s_cong_send_blocked;
+	uint64_t	s_recv_bytes_added_to_socket;
+	uint64_t	s_recv_bytes_removed_from_socket;
+
 };
 
 /* af_rds.c */
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 9d0666e5fe35..8b7e7b7f2c2d 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -43,6 +43,8 @@
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 		  __be32 saddr)
 {
+	int i;
+
 	atomic_set(&inc->i_refcount, 1);
 	INIT_LIST_HEAD(&inc->i_item);
 	inc->i_conn = conn;
@@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 	inc->i_rdma_cookie = 0;
 	inc->i_rx_tstamp.tv_sec = 0;
 	inc->i_rx_tstamp.tv_usec = 0;
+
+	for (i = 0; i < RDS_RX_MAX_TRACES; i++)
+		inc->i_rx_lat_trace[i] = 0;
 }
 EXPORT_SYMBOL_GPL(rds_inc_init);
 
@@ -94,6 +99,10 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
 		return;
 
 	rs->rs_rcv_bytes += delta;
+	if (delta > 0)
+		rds_stats_add(s_recv_bytes_added_to_socket, delta);
+	else
+		rds_stats_add(s_recv_bytes_removed_from_socket, -delta);
 	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
 
 	rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
@@ -369,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 		if (sock_flag(sk, SOCK_RCVTSTAMP))
 			do_gettimeofday(&inc->i_rx_tstamp);
 		rds_inc_addref(inc);
+		inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
 		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
 		__rds_wake_sk_sleep(sk);
 	} else {
@@ -530,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
 		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
 				sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
 	if ((inc->i_rx_tstamp.tv_sec != 0) &&
@@ -539,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
 			       sizeof(struct timeval),
 			       &inc->i_rx_tstamp);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
-	return 0;
+	if (rs->rs_rx_traces) {
+		struct rds_cmsg_rx_trace t;
+		int i, j;
+
+		inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
+		t.rx_traces =  rs->rs_rx_traces;
+		for (i = 0; i < rs->rs_rx_traces; i++) {
+			j = rs->rs_rx_trace[i];
+			t.rx_trace_pos[i] = j;
+			t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
+					  inc->i_rx_lat_trace[j];
+		}
+
+		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
+			       sizeof(t), &t);
+		if (ret)
+			goto out;
+	}
+
+out:
+	return ret;
 }
 
 int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/net/rds/send.c b/net/rds/send.c
index 77c8c6e613ad..5cc64039caf7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -476,12 +476,14 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
 	struct rm_rdma_op *ro;
 	struct rds_notifier *notifier;
 	unsigned long flags;
+	unsigned int notify = 0;
 
 	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
+	notify =  rm->rdma.op_notify | rm->data.op_notify;
 	ro = &rm->rdma;
 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
-	    ro->op_active && ro->op_notify && ro->op_notifier) {
+	    ro->op_active && notify && ro->op_notifier) {
 		notifier = ro->op_notifier;
 		rs = rm->m_rs;
 		sock_hold(rds_rs_to_sk(rs));
@@ -945,6 +947,11 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			ret = rds_cmsg_rdma_map(rs, rm, cmsg);
 			if (!ret)
 				*allocated_mr = 1;
+			else if (ret == -ENODEV)
+				/* Accommodate the get_mr() case which can fail
+				 * if connection isn't established yet.
+				 */
+				ret = -EAGAIN;
 			break;
 		case RDS_CMSG_ATOMIC_CSWP:
 		case RDS_CMSG_ATOMIC_FADD:
@@ -987,6 +994,26 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
 	return hash;
 }
 
+static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
+{
+	struct rds_rdma_args *args;
+	struct cmsghdr *cmsg;
+
+	for_each_cmsghdr(cmsg, msg) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
+			args = CMSG_DATA(cmsg);
+			*rdma_bytes += args->remote_vec.bytes;
+		}
+	}
+	return 0;
+}
+
 int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 {
 	struct sock *sk = sock->sk;
@@ -1001,6 +1028,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	int nonblock = msg->msg_flags & MSG_DONTWAIT;
 	long timeo = sock_sndtimeo(sk, nonblock);
 	struct rds_conn_path *cpath;
+	size_t total_payload_len = payload_len, rdma_payload_len = 0;
 
 	/* Mirror Linux UDP mirror of BSD error message compatibility */
 	/* XXX: Perhaps MSG_MORE someday */
@@ -1033,6 +1061,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	}
 	release_sock(sk);
 
+	ret = rds_rdma_bytes(msg, &rdma_payload_len);
+	if (ret)
+		goto out;
+
+	total_payload_len += rdma_payload_len;
+	if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
 	if (payload_len > rds_sk_sndbuf(rs)) {
 		ret = -EMSGSIZE;
 		goto out;
@@ -1082,8 +1120,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 
 	/* Parse any control messages the user may have included. */
 	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
-	if (ret)
+	if (ret) {
+		/* Trigger connection so that its ready for the next retry */
+		if (ret ==  -EAGAIN)
+			rds_conn_connect_if_down(conn);
 		goto out;
+	}
 
 	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
 		printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
@@ -1169,7 +1211,7 @@ out:
  * or
  *   RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED
  */
-int
+static int
 rds_send_probe(struct rds_conn_path *cp, __be16 sport,
 	       __be16 dport, u8 h_flags)
 {
@@ -1238,7 +1280,7 @@ rds_send_pong(struct rds_conn_path *cp, __be16 dport)
 	return rds_send_probe(cp, 0, dport, 0);
 }
 
-void
+static void
 rds_send_ping(struct rds_connection *conn)
 {
 	unsigned long flags;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 57bb52361e0f..5438f6725092 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -641,12 +641,12 @@ static int rds_tcp_init(void)
 	ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
 	if (ret) {
 		pr_warn("could not register rds_tcp_dev_notifier\n");
-		goto out;
+		goto out_slab;
 	}
 
 	ret = register_pernet_subsys(&rds_tcp_net_ops);
 	if (ret)
-		goto out_slab;
+		goto out_notifier;
 
 	ret = rds_tcp_recv_init();
 	if (ret)
@@ -664,9 +664,10 @@ out_recv:
 	rds_tcp_recv_exit();
 out_pernet:
 	unregister_pernet_subsys(&rds_tcp_net_ops);
-out_slab:
+out_notifier:
 	if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
 		pr_warn("could not unregister rds_tcp_dev_notifier\n");
+out_slab:
 	kmem_cache_destroy(rds_tcp_conn_slab);
 out:
 	return ret;
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index f74bab3ecdca..67d0929c7d3d 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -79,6 +79,7 @@ bail:
  * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
  * by moving them to CONNECTING in this function.
  */
+static
 struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
 {
 	int i;
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index ad4892e97f91..e006ef8e6d40 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -180,6 +180,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 			rdsdebug("alloced tinc %p\n", tinc);
 			rds_inc_path_init(&tinc->ti_inc, cp,
 					  cp->cp_conn->c_faddr);
+			tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+					local_clock();
+
 			/*
 			 * XXX * we might be able to use the __ variants when
 			 * we've already serialized at a higher level.
@@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 				/* could be 0 for a 0 len message */
 				tc->t_tinc_data_rem =
 					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
+				tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+					local_clock();
 			}
 		}
 
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 868f1ad0415a..060600b03fad 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -23,17 +23,6 @@ config RFKILL_INPUT
 	depends on INPUT = y || RFKILL = INPUT
 	default y if !EXPERT
 
-config RFKILL_REGULATOR
-	tristate "Generic rfkill regulator driver"
-	depends on RFKILL || !RFKILL
-	depends on REGULATOR
-	help
-          This options enable controlling radio transmitters connected to
-          voltage regulator using the regulator framework.
-
-          To compile this driver as a module, choose M here: the module will
-          be called rfkill-regulator.
-
 config RFKILL_GPIO
 	tristate "GPIO RFKILL driver"
 	depends on RFKILL
diff --git a/net/rfkill/Makefile b/net/rfkill/Makefile
index 311768783f4a..87a80aded0b3 100644
--- a/net/rfkill/Makefile
+++ b/net/rfkill/Makefile
@@ -5,5 +5,4 @@
 rfkill-y			+= core.o
 rfkill-$(CONFIG_RFKILL_INPUT)	+= input.o
 obj-$(CONFIG_RFKILL)		+= rfkill.o
-obj-$(CONFIG_RFKILL_REGULATOR)	+= rfkill-regulator.o
 obj-$(CONFIG_RFKILL_GPIO)	+= rfkill-gpio.o
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 884027f62783..2064c3a35ef8 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -176,6 +176,50 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 	led_trigger_unregister(&rfkill->led_trigger);
 }
+
+static struct led_trigger rfkill_any_led_trigger;
+static struct work_struct rfkill_any_work;
+
+static void rfkill_any_led_trigger_worker(struct work_struct *work)
+{
+	enum led_brightness brightness = LED_OFF;
+	struct rfkill *rfkill;
+
+	mutex_lock(&rfkill_global_mutex);
+	list_for_each_entry(rfkill, &rfkill_list, node) {
+		if (!(rfkill->state & RFKILL_BLOCK_ANY)) {
+			brightness = LED_FULL;
+			break;
+		}
+	}
+	mutex_unlock(&rfkill_global_mutex);
+
+	led_trigger_event(&rfkill_any_led_trigger, brightness);
+}
+
+static void rfkill_any_led_trigger_event(void)
+{
+	schedule_work(&rfkill_any_work);
+}
+
+static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev)
+{
+	rfkill_any_led_trigger_event();
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+	INIT_WORK(&rfkill_any_work, rfkill_any_led_trigger_worker);
+	rfkill_any_led_trigger.name = "rfkill-any";
+	rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate;
+	return led_trigger_register(&rfkill_any_led_trigger);
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+	led_trigger_unregister(&rfkill_any_led_trigger);
+	cancel_work_sync(&rfkill_any_work);
+}
 #else
 static void rfkill_led_trigger_event(struct rfkill *rfkill)
 {
@@ -189,6 +233,19 @@ static inline int rfkill_led_trigger_register(struct rfkill *rfkill)
 static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 }
+
+static void rfkill_any_led_trigger_event(void)
+{
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+	return 0;
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+}
 #endif /* CONFIG_RFKILL_LEDS */
 
 static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
@@ -297,6 +354,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
+	rfkill_any_led_trigger_event();
 
 	if (prev != curr)
 		rfkill_event(rfkill);
@@ -477,11 +535,9 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
+	rfkill_any_led_trigger_event();
 
-	if (!rfkill->registered)
-		return ret;
-
-	if (prev != blocked)
+	if (rfkill->registered && prev != blocked)
 		schedule_work(&rfkill->uevent_work);
 
 	return ret;
@@ -523,6 +579,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
 		schedule_work(&rfkill->uevent_work);
 
 	rfkill_led_trigger_event(rfkill);
+	rfkill_any_led_trigger_event();
 
 	return blocked;
 }
@@ -572,6 +629,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 			schedule_work(&rfkill->uevent_work);
 
 		rfkill_led_trigger_event(rfkill);
+		rfkill_any_led_trigger_event();
 	}
 }
 EXPORT_SYMBOL(rfkill_set_states);
@@ -988,6 +1046,7 @@ int __must_check rfkill_register(struct rfkill *rfkill)
 #endif
 	}
 
+	rfkill_any_led_trigger_event();
 	rfkill_send_events(rfkill, RFKILL_OP_ADD);
 
 	mutex_unlock(&rfkill_global_mutex);
@@ -1020,6 +1079,7 @@ void rfkill_unregister(struct rfkill *rfkill)
 	mutex_lock(&rfkill_global_mutex);
 	rfkill_send_events(rfkill, RFKILL_OP_DEL);
 	list_del_init(&rfkill->node);
+	rfkill_any_led_trigger_event();
 	mutex_unlock(&rfkill_global_mutex);
 
 	rfkill_led_trigger_unregister(rfkill);
@@ -1266,24 +1326,33 @@ static int __init rfkill_init(void)
 
 	error = class_register(&rfkill_class);
 	if (error)
-		goto out;
+		goto error_class;
 
 	error = misc_register(&rfkill_miscdev);
-	if (error) {
-		class_unregister(&rfkill_class);
-		goto out;
-	}
+	if (error)
+		goto error_misc;
+
+	error = rfkill_any_led_trigger_register();
+	if (error)
+		goto error_led_trigger;
 
 #ifdef CONFIG_RFKILL_INPUT
 	error = rfkill_handler_init();
-	if (error) {
-		misc_deregister(&rfkill_miscdev);
-		class_unregister(&rfkill_class);
-		goto out;
-	}
+	if (error)
+		goto error_input;
 #endif
 
- out:
+	return 0;
+
+#ifdef CONFIG_RFKILL_INPUT
+error_input:
+	rfkill_any_led_trigger_unregister();
+#endif
+error_led_trigger:
+	misc_deregister(&rfkill_miscdev);
+error_misc:
+	class_unregister(&rfkill_class);
+error_class:
 	return error;
 }
 subsys_initcall(rfkill_init);
@@ -1293,6 +1362,7 @@ static void __exit rfkill_exit(void)
 #ifdef CONFIG_RFKILL_INPUT
 	rfkill_handler_exit();
 #endif
+	rfkill_any_led_trigger_unregister();
 	misc_deregister(&rfkill_miscdev);
 	class_unregister(&rfkill_class);
 }
diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c
deleted file mode 100644
index 50cd26a48e87..000000000000
--- a/net/rfkill/rfkill-regulator.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * rfkill-regulator.c - Regulator consumer driver for rfkill
- *
- * Copyright (C) 2009  Guiming Zhuo <gmzhuo@gmail.com>
- * Copyright (C) 2011  Antonio Ospite <ospite@studenti.unina.it>
- *
- * Implementation inspired by leds-regulator driver.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/platform_device.h>
-#include <linux/regulator/consumer.h>
-#include <linux/rfkill.h>
-#include <linux/rfkill-regulator.h>
-
-struct rfkill_regulator_data {
-	struct rfkill *rf_kill;
-	bool reg_enabled;
-
-	struct regulator *vcc;
-};
-
-static int rfkill_regulator_set_block(void *data, bool blocked)
-{
-	struct rfkill_regulator_data *rfkill_data = data;
-	int ret = 0;
-
-	pr_debug("%s: blocked: %d\n", __func__, blocked);
-
-	if (blocked) {
-		if (rfkill_data->reg_enabled) {
-			regulator_disable(rfkill_data->vcc);
-			rfkill_data->reg_enabled = false;
-		}
-	} else {
-		if (!rfkill_data->reg_enabled) {
-			ret = regulator_enable(rfkill_data->vcc);
-			if (!ret)
-				rfkill_data->reg_enabled = true;
-		}
-	}
-
-	pr_debug("%s: regulator_is_enabled after set_block: %d\n", __func__,
-		regulator_is_enabled(rfkill_data->vcc));
-
-	return ret;
-}
-
-static struct rfkill_ops rfkill_regulator_ops = {
-	.set_block = rfkill_regulator_set_block,
-};
-
-static int rfkill_regulator_probe(struct platform_device *pdev)
-{
-	struct rfkill_regulator_platform_data *pdata = pdev->dev.platform_data;
-	struct rfkill_regulator_data *rfkill_data;
-	struct regulator *vcc;
-	struct rfkill *rf_kill;
-	int ret = 0;
-
-	if (pdata == NULL) {
-		dev_err(&pdev->dev, "no platform data\n");
-		return -ENODEV;
-	}
-
-	if (pdata->name == NULL || pdata->type == 0) {
-		dev_err(&pdev->dev, "invalid name or type in platform data\n");
-		return -EINVAL;
-	}
-
-	vcc = regulator_get_exclusive(&pdev->dev, "vrfkill");
-	if (IS_ERR(vcc)) {
-		dev_err(&pdev->dev, "Cannot get vcc for %s\n", pdata->name);
-		ret = PTR_ERR(vcc);
-		goto out;
-	}
-
-	rfkill_data = kzalloc(sizeof(*rfkill_data), GFP_KERNEL);
-	if (rfkill_data == NULL) {
-		ret = -ENOMEM;
-		goto err_data_alloc;
-	}
-
-	rf_kill = rfkill_alloc(pdata->name, &pdev->dev,
-				pdata->type,
-				&rfkill_regulator_ops, rfkill_data);
-	if (rf_kill == NULL) {
-		ret = -ENOMEM;
-		goto err_rfkill_alloc;
-	}
-
-	if (regulator_is_enabled(vcc)) {
-		dev_dbg(&pdev->dev, "Regulator already enabled\n");
-		rfkill_data->reg_enabled = true;
-	}
-	rfkill_data->vcc = vcc;
-	rfkill_data->rf_kill = rf_kill;
-
-	ret = rfkill_register(rf_kill);
-	if (ret) {
-		dev_err(&pdev->dev, "Cannot register rfkill device\n");
-		goto err_rfkill_register;
-	}
-
-	platform_set_drvdata(pdev, rfkill_data);
-	dev_info(&pdev->dev, "%s initialized\n", pdata->name);
-
-	return 0;
-
-err_rfkill_register:
-	rfkill_destroy(rf_kill);
-err_rfkill_alloc:
-	kfree(rfkill_data);
-err_data_alloc:
-	regulator_put(vcc);
-out:
-	return ret;
-}
-
-static int rfkill_regulator_remove(struct platform_device *pdev)
-{
-	struct rfkill_regulator_data *rfkill_data = platform_get_drvdata(pdev);
-	struct rfkill *rf_kill = rfkill_data->rf_kill;
-
-	rfkill_unregister(rf_kill);
-	rfkill_destroy(rf_kill);
-	regulator_put(rfkill_data->vcc);
-	kfree(rfkill_data);
-
-	return 0;
-}
-
-static struct platform_driver rfkill_regulator_driver = {
-	.probe = rfkill_regulator_probe,
-	.remove = rfkill_regulator_remove,
-	.driver = {
-		.name = "rfkill-regulator",
-	},
-};
-
-module_platform_driver(rfkill_regulator_driver);
-
-MODULE_AUTHOR("Guiming Zhuo <gmzhuo@gmail.com>");
-MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>");
-MODULE_DESCRIPTION("Regulator consumer driver for rfkill");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:rfkill-regulator");
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index 8fc6ea347182..b9da4d6b914f 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -2,7 +2,9 @@
 # Makefile for Linux kernel RxRPC
 #
 
-af-rxrpc-y := \
+obj-$(CONFIG_AF_RXRPC) += rxrpc.o
+
+rxrpc-y := \
 	af_rxrpc.o \
 	call_accept.o \
 	call_event.o \
@@ -26,8 +28,6 @@ af-rxrpc-y := \
 	skbuff.o \
 	utils.o
 
-af-rxrpc-$(CONFIG_PROC_FS) += proc.o
-af-rxrpc-$(CONFIG_RXKAD) += rxkad.o
-af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o
-
-obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o
+rxrpc-$(CONFIG_PROC_FS) += proc.o
+rxrpc-$(CONFIG_RXKAD) += rxkad.o
+rxrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 5f63f6dcaabb..199b46e93e64 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -224,6 +224,14 @@ static int rxrpc_listen(struct socket *sock, int backlog)
 		else
 			sk->sk_max_ack_backlog = old;
 		break;
+	case RXRPC_SERVER_LISTENING:
+		if (backlog == 0) {
+			rx->sk.sk_state = RXRPC_SERVER_LISTEN_DISABLED;
+			sk->sk_max_ack_backlog = 0;
+			rxrpc_discard_prealloc(rx);
+			ret = 0;
+			break;
+		}
 	default:
 		ret = -EBUSY;
 		break;
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index f60e35576526..12be432be9b2 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -60,6 +60,7 @@ enum {
 	RXRPC_CLIENT_BOUND,		/* client local address bound */
 	RXRPC_SERVER_BOUND,		/* server local address bound */
 	RXRPC_SERVER_LISTENING,		/* server listening for connections */
+	RXRPC_SERVER_LISTEN_DISABLED,	/* server listening disabled */
 	RXRPC_CLOSE,			/* socket is being closed */
 };
 
@@ -593,200 +594,6 @@ struct rxrpc_ack_summary {
 	u8			cumulative_acks;
 };
 
-enum rxrpc_skb_trace {
-	rxrpc_skb_rx_cleaned,
-	rxrpc_skb_rx_freed,
-	rxrpc_skb_rx_got,
-	rxrpc_skb_rx_lost,
-	rxrpc_skb_rx_received,
-	rxrpc_skb_rx_rotated,
-	rxrpc_skb_rx_purged,
-	rxrpc_skb_rx_seen,
-	rxrpc_skb_tx_cleaned,
-	rxrpc_skb_tx_freed,
-	rxrpc_skb_tx_got,
-	rxrpc_skb_tx_new,
-	rxrpc_skb_tx_rotated,
-	rxrpc_skb_tx_seen,
-	rxrpc_skb__nr_trace
-};
-
-extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7];
-
-enum rxrpc_conn_trace {
-	rxrpc_conn_new_client,
-	rxrpc_conn_new_service,
-	rxrpc_conn_queued,
-	rxrpc_conn_seen,
-	rxrpc_conn_got,
-	rxrpc_conn_put_client,
-	rxrpc_conn_put_service,
-	rxrpc_conn__nr_trace
-};
-
-extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4];
-
-enum rxrpc_client_trace {
-	rxrpc_client_activate_chans,
-	rxrpc_client_alloc,
-	rxrpc_client_chan_activate,
-	rxrpc_client_chan_disconnect,
-	rxrpc_client_chan_pass,
-	rxrpc_client_chan_unstarted,
-	rxrpc_client_cleanup,
-	rxrpc_client_count,
-	rxrpc_client_discard,
-	rxrpc_client_duplicate,
-	rxrpc_client_exposed,
-	rxrpc_client_replace,
-	rxrpc_client_to_active,
-	rxrpc_client_to_culled,
-	rxrpc_client_to_idle,
-	rxrpc_client_to_inactive,
-	rxrpc_client_to_waiting,
-	rxrpc_client_uncount,
-	rxrpc_client__nr_trace
-};
-
-extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7];
-extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5];
-
-enum rxrpc_call_trace {
-	rxrpc_call_new_client,
-	rxrpc_call_new_service,
-	rxrpc_call_queued,
-	rxrpc_call_queued_ref,
-	rxrpc_call_seen,
-	rxrpc_call_connected,
-	rxrpc_call_release,
-	rxrpc_call_got,
-	rxrpc_call_got_userid,
-	rxrpc_call_got_kernel,
-	rxrpc_call_put,
-	rxrpc_call_put_userid,
-	rxrpc_call_put_kernel,
-	rxrpc_call_put_noqueue,
-	rxrpc_call_error,
-	rxrpc_call__nr_trace
-};
-
-extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4];
-
-enum rxrpc_transmit_trace {
-	rxrpc_transmit_wait,
-	rxrpc_transmit_queue,
-	rxrpc_transmit_queue_last,
-	rxrpc_transmit_rotate,
-	rxrpc_transmit_rotate_last,
-	rxrpc_transmit_await_reply,
-	rxrpc_transmit_end,
-	rxrpc_transmit__nr_trace
-};
-
-extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4];
-
-enum rxrpc_receive_trace {
-	rxrpc_receive_incoming,
-	rxrpc_receive_queue,
-	rxrpc_receive_queue_last,
-	rxrpc_receive_front,
-	rxrpc_receive_rotate,
-	rxrpc_receive_end,
-	rxrpc_receive__nr_trace
-};
-
-extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4];
-
-enum rxrpc_recvmsg_trace {
-	rxrpc_recvmsg_enter,
-	rxrpc_recvmsg_wait,
-	rxrpc_recvmsg_dequeue,
-	rxrpc_recvmsg_hole,
-	rxrpc_recvmsg_next,
-	rxrpc_recvmsg_cont,
-	rxrpc_recvmsg_full,
-	rxrpc_recvmsg_data_return,
-	rxrpc_recvmsg_terminal,
-	rxrpc_recvmsg_to_be_accepted,
-	rxrpc_recvmsg_return,
-	rxrpc_recvmsg__nr_trace
-};
-
-extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5];
-
-enum rxrpc_rtt_tx_trace {
-	rxrpc_rtt_tx_ping,
-	rxrpc_rtt_tx_data,
-	rxrpc_rtt_tx__nr_trace
-};
-
-extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5];
-
-enum rxrpc_rtt_rx_trace {
-	rxrpc_rtt_rx_ping_response,
-	rxrpc_rtt_rx_requested_ack,
-	rxrpc_rtt_rx__nr_trace
-};
-
-extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5];
-
-enum rxrpc_timer_trace {
-	rxrpc_timer_begin,
-	rxrpc_timer_init_for_reply,
-	rxrpc_timer_init_for_send_reply,
-	rxrpc_timer_expired,
-	rxrpc_timer_set_for_ack,
-	rxrpc_timer_set_for_ping,
-	rxrpc_timer_set_for_resend,
-	rxrpc_timer_set_for_send,
-	rxrpc_timer__nr_trace
-};
-
-extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8];
-
-enum rxrpc_propose_ack_trace {
-	rxrpc_propose_ack_client_tx_end,
-	rxrpc_propose_ack_input_data,
-	rxrpc_propose_ack_ping_for_lost_ack,
-	rxrpc_propose_ack_ping_for_lost_reply,
-	rxrpc_propose_ack_ping_for_params,
-	rxrpc_propose_ack_processing_op,
-	rxrpc_propose_ack_respond_to_ack,
-	rxrpc_propose_ack_respond_to_ping,
-	rxrpc_propose_ack_retry_tx,
-	rxrpc_propose_ack_rotate_rx,
-	rxrpc_propose_ack_terminal_ack,
-	rxrpc_propose_ack__nr_trace
-};
-
-enum rxrpc_propose_ack_outcome {
-	rxrpc_propose_ack_use,
-	rxrpc_propose_ack_update,
-	rxrpc_propose_ack_subsume,
-	rxrpc_propose_ack__nr_outcomes
-};
-
-extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8];
-extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes];
-
-enum rxrpc_congest_change {
-	rxrpc_cong_begin_retransmission,
-	rxrpc_cong_cleared_nacks,
-	rxrpc_cong_new_low_nack,
-	rxrpc_cong_no_change,
-	rxrpc_cong_progress,
-	rxrpc_cong_retransmit_again,
-	rxrpc_cong_rtt_window_end,
-	rxrpc_cong_saw_nack,
-	rxrpc_congest__nr_change
-};
-
-extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10];
-extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9];
-
-extern const char *const rxrpc_pkts[];
-extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4];
-
 #include <trace/events/rxrpc.h>
 
 /*
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 832d854c2d5c..7c4c64ab8da2 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -349,7 +349,8 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
 
 found_service:
 	spin_lock(&rx->incoming_lock);
-	if (rx->sk.sk_state == RXRPC_CLOSE) {
+	if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED ||
+	    rx->sk.sk_state == RXRPC_CLOSE) {
 		trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber,
 				  sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN);
 		skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 1ed18d8c9c9f..8b94db3c9b2e 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -43,24 +43,6 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
 	[RXRPC_CALL_NETWORK_ERROR]		= "NetError",
 };
 
-const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = {
-	[rxrpc_call_new_client]		= "NWc",
-	[rxrpc_call_new_service]	= "NWs",
-	[rxrpc_call_queued]		= "QUE",
-	[rxrpc_call_queued_ref]		= "QUR",
-	[rxrpc_call_connected]		= "CON",
-	[rxrpc_call_release]		= "RLS",
-	[rxrpc_call_seen]		= "SEE",
-	[rxrpc_call_got]		= "GOT",
-	[rxrpc_call_got_userid]		= "Gus",
-	[rxrpc_call_got_kernel]		= "Gke",
-	[rxrpc_call_put]		= "PUT",
-	[rxrpc_call_put_userid]		= "Pus",
-	[rxrpc_call_put_kernel]		= "Pke",
-	[rxrpc_call_put_noqueue]	= "PNQ",
-	[rxrpc_call_error]		= "*E*",
-};
-
 struct kmem_cache *rxrpc_call_jar;
 LIST_HEAD(rxrpc_calls);
 DEFINE_RWLOCK(rxrpc_call_lock);
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 6cbcdcc29853..40a1ef2adeb4 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -105,14 +105,6 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *);
 static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap,
 			    rxrpc_discard_expired_client_conns);
 
-const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = {
-	[RXRPC_CONN_CLIENT_INACTIVE]	= "Inac",
-	[RXRPC_CONN_CLIENT_WAITING]	= "Wait",
-	[RXRPC_CONN_CLIENT_ACTIVE]	= "Actv",
-	[RXRPC_CONN_CLIENT_CULLED]	= "Cull",
-	[RXRPC_CONN_CLIENT_IDLE]	= "Idle",
-};
-
 /*
  * Get a connection ID and epoch for a client connection from the global pool.
  * The connection struct pointer is then recorded in the idr radix tree.  The
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index e1e83af47866..b0ecb770fdce 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -173,6 +173,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn,
 		/* Save the result of the call so that we can repeat it if necessary
 		 * through the channel, whilst disposing of the actual call record.
 		 */
+		trace_rxrpc_disconnect_call(call);
 		chan->last_service_id = call->service_id;
 		if (call->abort_code) {
 			chan->last_abort = call->abort_code;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 1d87b5453ef7..78ec33477adf 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -481,6 +481,7 @@ next_subpacket:
 			return rxrpc_proto_abort("LSA", call, seq);
 	}
 
+	trace_rxrpc_rx_data(call, seq, serial, flags, annotation);
 	if (before_eq(seq, hard_ack)) {
 		ack = RXRPC_ACK_DUPLICATE;
 		ack_serial = serial;
@@ -765,16 +766,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
 	summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
 			      buf.ack.reason : RXRPC_ACK__INVALID);
 
-	trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks);
-
-	_proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
-	       sp->hdr.serial,
-	       ntohs(buf.ack.maxSkew),
-	       first_soft_ack,
-	       ntohl(buf.ack.previousPacket),
-	       acked_serial,
-	       rxrpc_ack_names[summary.ack_reason],
-	       buf.ack.nAcks);
+	trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial,
+			   first_soft_ack, ntohl(buf.ack.previousPacket),
+			   summary.ack_reason, nr_acks);
 
 	if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
 		rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
@@ -931,7 +925,6 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
 		break;
 
 	default:
-		_proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
 		break;
 	}
 
@@ -961,6 +954,7 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
 		break;
 	}
 
+	trace_rxrpc_improper_term(call);
 	__rxrpc_disconnect_call(conn, call);
 	rxrpc_notify_socket(call);
 }
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 18c737a61d80..0a4e28477ad9 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -1065,7 +1065,7 @@ static long rxrpc_read(const struct key *key,
 
 		switch (token->security_index) {
 		case RXRPC_SECURITY_RXKAD:
-			toksize += 8 * 4;	/* viceid, kvno, key*2, begin,
+			toksize += 9 * 4;	/* viceid, kvno, key*2 + len, begin,
 						 * end, primary, tktlen */
 			toksize += RND(token->kad->ticket_len);
 			break;
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 6dee55fad2d3..1a2d4b112064 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -77,12 +77,6 @@ unsigned int rxrpc_rx_jumbo_max = 4;
  */
 unsigned int rxrpc_resend_timeout = 4 * 1000;
 
-const char *const rxrpc_pkts[] = {
-	"?00",
-	"DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
-	"?09", "?10", "?11", "?12", "VERSION", "?14", "?15"
-};
-
 const s8 rxrpc_ack_priority[] = {
 	[0]				= 0,
 	[RXRPC_ACK_DELAY]		= 1,
@@ -94,148 +88,3 @@ const s8 rxrpc_ack_priority[] = {
 	[RXRPC_ACK_NOSPACE]		= 7,
 	[RXRPC_ACK_PING_RESPONSE]	= 8,
 };
-
-const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
-	"---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
-	"IDL", "-?-"
-};
-
-const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = {
-	[rxrpc_skb_rx_cleaned]		= "Rx CLN",
-	[rxrpc_skb_rx_freed]		= "Rx FRE",
-	[rxrpc_skb_rx_got]		= "Rx GOT",
-	[rxrpc_skb_rx_lost]		= "Rx *L*",
-	[rxrpc_skb_rx_received]		= "Rx RCV",
-	[rxrpc_skb_rx_purged]		= "Rx PUR",
-	[rxrpc_skb_rx_rotated]		= "Rx ROT",
-	[rxrpc_skb_rx_seen]		= "Rx SEE",
-	[rxrpc_skb_tx_cleaned]		= "Tx CLN",
-	[rxrpc_skb_tx_freed]		= "Tx FRE",
-	[rxrpc_skb_tx_got]		= "Tx GOT",
-	[rxrpc_skb_tx_new]		= "Tx NEW",
-	[rxrpc_skb_tx_rotated]		= "Tx ROT",
-	[rxrpc_skb_tx_seen]		= "Tx SEE",
-};
-
-const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = {
-	[rxrpc_conn_new_client]		= "NWc",
-	[rxrpc_conn_new_service]	= "NWs",
-	[rxrpc_conn_queued]		= "QUE",
-	[rxrpc_conn_seen]		= "SEE",
-	[rxrpc_conn_got]		= "GOT",
-	[rxrpc_conn_put_client]		= "PTc",
-	[rxrpc_conn_put_service]	= "PTs",
-};
-
-const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = {
-	[rxrpc_client_activate_chans]	= "Activa",
-	[rxrpc_client_alloc]		= "Alloc ",
-	[rxrpc_client_chan_activate]	= "ChActv",
-	[rxrpc_client_chan_disconnect]	= "ChDisc",
-	[rxrpc_client_chan_pass]	= "ChPass",
-	[rxrpc_client_chan_unstarted]	= "ChUnst",
-	[rxrpc_client_cleanup]		= "Clean ",
-	[rxrpc_client_count]		= "Count ",
-	[rxrpc_client_discard]		= "Discar",
-	[rxrpc_client_duplicate]	= "Duplic",
-	[rxrpc_client_exposed]		= "Expose",
-	[rxrpc_client_replace]		= "Replac",
-	[rxrpc_client_to_active]	= "->Actv",
-	[rxrpc_client_to_culled]	= "->Cull",
-	[rxrpc_client_to_idle]		= "->Idle",
-	[rxrpc_client_to_inactive]	= "->Inac",
-	[rxrpc_client_to_waiting]	= "->Wait",
-	[rxrpc_client_uncount]		= "Uncoun",
-};
-
-const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = {
-	[rxrpc_transmit_wait]		= "WAI",
-	[rxrpc_transmit_queue]		= "QUE",
-	[rxrpc_transmit_queue_last]	= "QLS",
-	[rxrpc_transmit_rotate]		= "ROT",
-	[rxrpc_transmit_rotate_last]	= "RLS",
-	[rxrpc_transmit_await_reply]	= "AWR",
-	[rxrpc_transmit_end]		= "END",
-};
-
-const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = {
-	[rxrpc_receive_incoming]	= "INC",
-	[rxrpc_receive_queue]		= "QUE",
-	[rxrpc_receive_queue_last]	= "QLS",
-	[rxrpc_receive_front]		= "FRN",
-	[rxrpc_receive_rotate]		= "ROT",
-	[rxrpc_receive_end]		= "END",
-};
-
-const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = {
-	[rxrpc_recvmsg_enter]		= "ENTR",
-	[rxrpc_recvmsg_wait]		= "WAIT",
-	[rxrpc_recvmsg_dequeue]		= "DEQU",
-	[rxrpc_recvmsg_hole]		= "HOLE",
-	[rxrpc_recvmsg_next]		= "NEXT",
-	[rxrpc_recvmsg_cont]		= "CONT",
-	[rxrpc_recvmsg_full]		= "FULL",
-	[rxrpc_recvmsg_data_return]	= "DATA",
-	[rxrpc_recvmsg_terminal]	= "TERM",
-	[rxrpc_recvmsg_to_be_accepted]	= "TBAC",
-	[rxrpc_recvmsg_return]		= "RETN",
-};
-
-const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = {
-	[rxrpc_rtt_tx_ping]		= "PING",
-	[rxrpc_rtt_tx_data]		= "DATA",
-};
-
-const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = {
-	[rxrpc_rtt_rx_ping_response]	= "PONG",
-	[rxrpc_rtt_rx_requested_ack]	= "RACK",
-};
-
-const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = {
-	[rxrpc_timer_begin]			= "Begin ",
-	[rxrpc_timer_expired]			= "*EXPR*",
-	[rxrpc_timer_init_for_reply]		= "IniRpl",
-	[rxrpc_timer_init_for_send_reply]	= "SndRpl",
-	[rxrpc_timer_set_for_ack]		= "SetAck",
-	[rxrpc_timer_set_for_ping]		= "SetPng",
-	[rxrpc_timer_set_for_send]		= "SetTx ",
-	[rxrpc_timer_set_for_resend]		= "SetRTx",
-};
-
-const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = {
-	[rxrpc_propose_ack_client_tx_end]	= "ClTxEnd",
-	[rxrpc_propose_ack_input_data]		= "DataIn ",
-	[rxrpc_propose_ack_ping_for_lost_ack]	= "LostAck",
-	[rxrpc_propose_ack_ping_for_lost_reply]	= "LostRpl",
-	[rxrpc_propose_ack_ping_for_params]	= "Params ",
-	[rxrpc_propose_ack_processing_op]	= "ProcOp ",
-	[rxrpc_propose_ack_respond_to_ack]	= "Rsp2Ack",
-	[rxrpc_propose_ack_respond_to_ping]	= "Rsp2Png",
-	[rxrpc_propose_ack_retry_tx]		= "RetryTx",
-	[rxrpc_propose_ack_rotate_rx]		= "RxAck  ",
-	[rxrpc_propose_ack_terminal_ack]	= "ClTerm ",
-};
-
-const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = {
-	[rxrpc_propose_ack_use]			= "",
-	[rxrpc_propose_ack_update]		= " Update",
-	[rxrpc_propose_ack_subsume]		= " Subsume",
-};
-
-const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = {
-	[RXRPC_CALL_SLOW_START]		= "SlowStart",
-	[RXRPC_CALL_CONGEST_AVOIDANCE]	= "CongAvoid",
-	[RXRPC_CALL_PACKET_LOSS]	= "PktLoss  ",
-	[RXRPC_CALL_FAST_RETRANSMIT]	= "FastReTx ",
-};
-
-const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = {
-	[rxrpc_cong_begin_retransmission]	= " Retrans",
-	[rxrpc_cong_cleared_nacks]		= " Cleared",
-	[rxrpc_cong_new_low_nack]		= " NewLowN",
-	[rxrpc_cong_no_change]			= "",
-	[rxrpc_cong_progress]			= " Progres",
-	[rxrpc_cong_retransmit_again]		= " ReTxAgn",
-	[rxrpc_cong_rtt_window_end]		= " RttWinE",
-	[rxrpc_cong_saw_nack]			= " SawNack",
-};
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 65cd980767fa..b9bcfbfb095c 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -52,6 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 	struct rxrpc_sock *rx;
 	struct rxrpc_peer *peer;
 	struct rxrpc_call *call;
+	rxrpc_seq_t tx_hard_ack, rx_hard_ack;
 	char lbuff[50], rbuff[50];
 
 	if (v == &rxrpc_calls) {
@@ -82,9 +83,11 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 	else
 		strcpy(rbuff, "no_connection");
 
+	tx_hard_ack = READ_ONCE(call->tx_hard_ack);
+	rx_hard_ack = READ_ONCE(call->rx_hard_ack);
 	seq_printf(seq,
 		   "UDP   %-47.47s %-47.47s %4x %08x %08x %s %3u"
-		   " %-8.8s %08x %lx\n",
+		   " %-8.8s %08x %lx %08x %02x %08x %02x\n",
 		   lbuff,
 		   rbuff,
 		   call->service_id,
@@ -94,7 +97,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 		   atomic_read(&call->usage),
 		   rxrpc_call_states[call->state],
 		   call->abort_code,
-		   call->user_call_ID);
+		   call->user_call_ID,
+		   tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack,
+		   rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack);
 
 	return 0;
 }
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index c29362d50a92..f3a688e10843 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -320,8 +320,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
 
 	/* Barriers against rxrpc_input_data(). */
 	hard_ack = call->rx_hard_ack;
-	top = smp_load_acquire(&call->rx_top);
-	for (seq = hard_ack + 1; before_eq(seq, top); seq++) {
+	seq = hard_ack + 1;
+	while (top = smp_load_acquire(&call->rx_top),
+	       before_eq(seq, top)
+	       ) {
 		ix = seq & RXRPC_RXTX_BUFF_MASK;
 		skb = call->rxtx_buffer[ix];
 		if (!skb) {
@@ -394,6 +396,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
 			ret = 1;
 			goto out;
 		}
+
+		seq++;
 	}
 
 out:
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index b214a4d4a641..0a6ef217aa8a 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -376,7 +376,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
 
-		len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+		len = cmsg->cmsg_len - sizeof(struct cmsghdr);
 		_debug("CMSG %d, %d, %d",
 		       cmsg->cmsg_level, cmsg->cmsg_type, len);
 
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 87956a768d1b..403790cce7d2 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -650,6 +650,18 @@ config NET_ACT_MIRRED
 	  To compile this code as a module, choose M here: the
 	  module will be called act_mirred.
 
+config NET_ACT_SAMPLE
+        tristate "Traffic Sampling"
+        depends on NET_CLS_ACT
+        select PSAMPLE
+        ---help---
+	  Say Y here to allow packet sampling tc action. The packet sample
+	  action consists of statistically choosing packets and sampling
+	  them using the psample module.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_sample.
+
 config NET_ACT_IPT
         tristate "IPtables targets"
         depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
@@ -707,6 +719,7 @@ config NET_ACT_SKBEDIT
 config NET_ACT_CSUM
         tristate "Checksum Updating"
         depends on NET_CLS_ACT && INET
+        select LIBCRC32C
         ---help---
 	  Say Y here to update some common checksum after some direct
 	  packet alterations.
@@ -763,6 +776,7 @@ config NET_ACT_SKBMOD
 config NET_ACT_IFE
         tristate "Inter-FE action based on IETF ForCES InterFE LFB"
         depends on NET_CLS_ACT
+        select NET_IFE
         ---help---
 	  Say Y here to allow for sourcing and terminating metadata
 	  For details refer to netdev01 paper:
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 4bdda3634e0b..7b915d226de7 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_NET_CLS_ACT)	+= act_api.o
 obj-$(CONFIG_NET_ACT_POLICE)	+= act_police.o
 obj-$(CONFIG_NET_ACT_GACT)	+= act_gact.o
 obj-$(CONFIG_NET_ACT_MIRRED)	+= act_mirred.o
+obj-$(CONFIG_NET_ACT_SAMPLE)	+= act_sample.o
 obj-$(CONFIG_NET_ACT_IPT)	+= act_ipt.o
 obj-$(CONFIG_NET_ACT_NAT)	+= act_nat.o
 obj-$(CONFIG_NET_ACT_PEDIT)	+= act_pedit.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index e10456ef6f7a..b70aa57319ea 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -24,6 +24,7 @@
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/sch_generic.h>
+#include <net/pkt_cls.h>
 #include <net/act_api.h>
 #include <net/netlink.h>
 
@@ -33,6 +34,12 @@ static void free_tcf(struct rcu_head *head)
 
 	free_percpu(p->cpu_bstats);
 	free_percpu(p->cpu_qstats);
+
+	if (p->act_cookie) {
+		kfree(p->act_cookie->data);
+		kfree(p->act_cookie);
+	}
+
 	kfree(p);
 }
 
@@ -426,11 +433,9 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
 {
 	int ret = -1, i;
 
-	if (skb->tc_verd & TC_NCLS) {
-		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-		ret = TC_ACT_OK;
-		goto exec_done;
-	}
+	if (skb_skip_tc_classify(skb))
+		return TC_ACT_OK;
+
 	for (i = 0; i < nr_actions; i++) {
 		const struct tc_action *a = actions[i];
 
@@ -439,9 +444,8 @@ repeat:
 		if (ret == TC_ACT_REPEAT)
 			goto repeat;	/* we need a ttl - JHS */
 		if (ret != TC_ACT_PIPE)
-			goto exec_done;
+			break;
 	}
-exec_done:
 	return ret;
 }
 EXPORT_SYMBOL(tcf_action_exec);
@@ -478,6 +482,12 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 		goto nla_put_failure;
 	if (tcf_action_copy_stats(skb, a, 0))
 		goto nla_put_failure;
+	if (a->act_cookie) {
+		if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len,
+			    a->act_cookie->data))
+			goto nla_put_failure;
+	}
+
 	nest = nla_nest_start(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
@@ -519,6 +529,22 @@ errout:
 	return err;
 }
 
+static int nla_memdup_cookie(struct tc_action *a, struct nlattr **tb)
+{
+	a->act_cookie = kzalloc(sizeof(*a->act_cookie), GFP_KERNEL);
+	if (!a->act_cookie)
+		return -ENOMEM;
+
+	a->act_cookie->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL);
+	if (!a->act_cookie->data) {
+		kfree(a->act_cookie);
+		return -ENOMEM;
+	}
+	a->act_cookie->len = nla_len(tb[TCA_ACT_COOKIE]);
+
+	return 0;
+}
+
 struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
 				    struct nlattr *est, char *name, int ovr,
 				    int bind)
@@ -578,6 +604,22 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
 	if (err < 0)
 		goto err_mod;
 
+	if (tb[TCA_ACT_COOKIE]) {
+		int cklen = nla_len(tb[TCA_ACT_COOKIE]);
+
+		if (cklen > TC_COOKIE_MAX_SIZE) {
+			err = -EINVAL;
+			tcf_hash_release(a, bind);
+			goto err_mod;
+		}
+
+		if (nla_memdup_cookie(a, tb) < 0) {
+			err = -ENOMEM;
+			tcf_hash_release(a, bind);
+			goto err_mod;
+		}
+	}
+
 	/* module count goes up only when brand new policy is created
 	 * if it exists and is only bound to in a_o->init() then
 	 * ACT_P_CREATED is not returned (a zero is).
@@ -817,10 +859,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 		goto out_module_put;
 
 	err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
-	if (err < 0)
+	if (err <= 0)
 		goto out_module_put;
-	if (err == 0)
-		goto noflush_out;
 
 	nla_nest_end(skb, nest);
 
@@ -837,7 +877,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 out_module_put:
 	module_put(ops->owner);
 err_out:
-noflush_out:
 	kfree_skb(skb);
 	return err;
 }
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index a0edd80a44db..e978ccd4402c 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -30,6 +30,7 @@
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/ip6_checksum.h>
+#include <net/sctp/checksum.h>
 
 #include <net/act_api.h>
 
@@ -322,6 +323,25 @@ ignore_obscure_skb:
 	return 1;
 }
 
+static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
+			 unsigned int ipl)
+{
+	struct sctphdr *sctph;
+
+	if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_SCTP)
+		return 1;
+
+	sctph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*sctph));
+	if (!sctph)
+		return 0;
+
+	sctph->checksum = sctp_compute_cksum(skb,
+					     skb_network_offset(skb) + ihl);
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return 1;
+}
+
 static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
 {
 	const struct iphdr *iph;
@@ -365,6 +385,11 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
 					       ntohs(iph->tot_len), 1))
 				goto fail;
 		break;
+	case IPPROTO_SCTP:
+		if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
+		    !tcf_csum_sctp(skb, iph->ihl * 4, ntohs(iph->tot_len)))
+			goto fail;
+		break;
 	}
 
 	if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
@@ -481,6 +506,11 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
 						       pl + sizeof(*ip6h), 1))
 					goto fail;
 			goto done;
+		case IPPROTO_SCTP:
+			if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
+			    !tcf_csum_sctp(skb, hl, pl + sizeof(*ip6h)))
+				goto fail;
+			goto done;
 		default:
 			goto ignore_skb;
 		}
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 80b848d3f096..71e7ff22f7c9 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -32,6 +32,7 @@
 #include <uapi/linux/tc_act/tc_ife.h>
 #include <net/tc_act/tc_ife.h>
 #include <linux/etherdevice.h>
+#include <net/ife.h>
 
 #define IFE_TAB_MASK 15
 
@@ -46,23 +47,6 @@ static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
 	[TCA_IFE_TYPE] = { .type = NLA_U16},
 };
 
-/* Caller takes care of presenting data in network order
-*/
-int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
-{
-	u32 *tlv = (u32 *)(skbdata);
-	u16 totlen = nla_total_size(dlen);	/*alignment + hdr */
-	char *dptr = (char *)tlv + NLA_HDRLEN;
-	u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
-
-	*tlv = htonl(htlv);
-	memset(dptr, 0, totlen - NLA_HDRLEN);
-	memcpy(dptr, dval, dlen);
-
-	return totlen;
-}
-EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
-
 int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
 {
 	u16 edata = 0;
@@ -637,69 +621,59 @@ int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
 	return 0;
 }
 
-struct ifeheadr {
-	__be16 metalen;
-	u8 tlv_data[];
-};
-
-struct meta_tlvhdr {
-	__be16 type;
-	__be16 len;
-};
-
 static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
 	struct tcf_ife_info *ife = to_ife(a);
 	int action = ife->tcf_action;
-	struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
-	int ifehdrln = (int)ifehdr->metalen;
-	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
+	u8 *ifehdr_end;
+	u8 *tlv_data;
+	u16 metalen;
 
 	spin_lock(&ife->tcf_lock);
 	bstats_update(&ife->tcf_bstats, skb);
 	tcf_lastuse_update(&ife->tcf_tm);
 	spin_unlock(&ife->tcf_lock);
 
-	ifehdrln = ntohs(ifehdrln);
-	if (unlikely(!pskb_may_pull(skb, ifehdrln))) {
+	if (skb_at_tc_ingress(skb))
+		skb_push(skb, skb->dev->hard_header_len);
+
+	tlv_data = ife_decode(skb, &metalen);
+	if (unlikely(!tlv_data)) {
 		spin_lock(&ife->tcf_lock);
 		ife->tcf_qstats.drops++;
 		spin_unlock(&ife->tcf_lock);
 		return TC_ACT_SHOT;
 	}
 
-	skb_set_mac_header(skb, ifehdrln);
-	__skb_pull(skb, ifehdrln);
-	skb->protocol = eth_type_trans(skb, skb->dev);
-	ifehdrln -= IFE_METAHDRLEN;
-
-	while (ifehdrln > 0) {
-		u8 *tlvdata = (u8 *)tlv;
-		u16 mtype = tlv->type;
-		u16 mlen = tlv->len;
-		u16 alen;
+	ifehdr_end = tlv_data + metalen;
+	for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) {
+		u8 *curr_data;
+		u16 mtype;
+		u16 dlen;
 
-		mtype = ntohs(mtype);
-		mlen = ntohs(mlen);
-		alen = NLA_ALIGN(mlen);
+		curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL);
 
-		if (find_decode_metaid(skb, ife, mtype, (mlen - NLA_HDRLEN),
-				       (void *)(tlvdata + NLA_HDRLEN))) {
+		if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
 			/* abuse overlimits to count when we receive metadata
 			 * but dont have an ops for it
 			 */
-			pr_info_ratelimited("Unknown metaid %d alnlen %d\n",
-					    mtype, mlen);
+			pr_info_ratelimited("Unknown metaid %d dlen %d\n",
+					    mtype, dlen);
 			ife->tcf_qstats.overlimits++;
 		}
+	}
 
-		tlvdata += alen;
-		ifehdrln -= alen;
-		tlv = (struct meta_tlvhdr *)tlvdata;
+	if (WARN_ON(tlv_data != ifehdr_end)) {
+		spin_lock(&ife->tcf_lock);
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
 	}
 
+	skb->protocol = eth_type_trans(skb, skb->dev);
 	skb_reset_network_header(skb);
+
 	return action;
 }
 
@@ -727,7 +701,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	struct tcf_ife_info *ife = to_ife(a);
 	int action = ife->tcf_action;
 	struct ethhdr *oethh;	/* outer ether header */
-	struct ethhdr *iethh;	/* inner eth header */
 	struct tcf_meta_info *e;
 	/*
 	   OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
@@ -735,13 +708,13 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	 */
 	u16 metalen = ife_get_sz(skb, ife);
 	int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
-	unsigned int skboff = skb->dev->hard_header_len;
-	u32 at = G_TC_AT(skb->tc_verd);
+	unsigned int skboff = 0;
 	int new_len = skb->len + hdrm;
 	bool exceed_mtu = false;
-	int err;
+	void *ife_meta;
+	int err = 0;
 
-	if (at & AT_EGRESS) {
+	if (!skb_at_tc_ingress(skb)) {
 		if (new_len > skb->dev->mtu)
 			exceed_mtu = true;
 	}
@@ -766,27 +739,10 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 		return TC_ACT_SHOT;
 	}
 
-	err = skb_cow_head(skb, hdrm);
-	if (unlikely(err)) {
-		ife->tcf_qstats.drops++;
-		spin_unlock(&ife->tcf_lock);
-		return TC_ACT_SHOT;
-	}
-
-	if (!(at & AT_EGRESS))
+	if (skb_at_tc_ingress(skb))
 		skb_push(skb, skb->dev->hard_header_len);
 
-	iethh = (struct ethhdr *)skb->data;
-	__skb_push(skb, hdrm);
-	memcpy(skb->data, iethh, skb->mac_len);
-	skb_reset_mac_header(skb);
-	oethh = eth_hdr(skb);
-
-	/*total metadata length */
-	metalen += IFE_METAHDRLEN;
-	metalen = htons(metalen);
-	memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
-	skboff += IFE_METAHDRLEN;
+	ife_meta = ife_encode(skb, metalen);
 
 	/* XXX: we dont have a clever way of telling encode to
 	 * not repeat some of the computations that are done by
@@ -794,7 +750,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	 */
 	list_for_each_entry(e, &ife->metalist, metalist) {
 		if (e->ops->encode) {
-			err = e->ops->encode(skb, (void *)(skb->data + skboff),
+			err = e->ops->encode(skb, (void *)(ife_meta + skboff),
 					     e);
 		}
 		if (err < 0) {
@@ -805,18 +761,15 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 		}
 		skboff += err;
 	}
+	oethh = (struct ethhdr *)skb->data;
 
 	if (!is_zero_ether_addr(ife->eth_src))
 		ether_addr_copy(oethh->h_source, ife->eth_src);
-	else
-		ether_addr_copy(oethh->h_source, iethh->h_source);
 	if (!is_zero_ether_addr(ife->eth_dst))
 		ether_addr_copy(oethh->h_dest, ife->eth_dst);
-	else
-		ether_addr_copy(oethh->h_dest, iethh->h_dest);
 	oethh->h_proto = htons(ife->eth_type);
 
-	if (!(at & AT_EGRESS))
+	if (skb_at_tc_ingress(skb))
 		skb_pull(skb, skb->dev->hard_header_len);
 
 	spin_unlock(&ife->tcf_lock);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 2d9fa6e0a1b4..af49c7dca860 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -28,8 +28,6 @@
 #include <linux/tc_act/tc_mirred.h>
 #include <net/tc_act/tc_mirred.h>
 
-#include <linux/if_arp.h>
-
 #define MIRRED_TAB_MASK     7
 static LIST_HEAD(mirred_list);
 static DEFINE_SPINLOCK(mirred_list_lock);
@@ -39,15 +37,15 @@ static bool tcf_mirred_is_act_redirect(int action)
 	return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
 }
 
-static u32 tcf_mirred_act_direction(int action)
+static bool tcf_mirred_act_wants_ingress(int action)
 {
 	switch (action) {
 	case TCA_EGRESS_REDIR:
 	case TCA_EGRESS_MIRROR:
-		return AT_EGRESS;
+		return false;
 	case TCA_INGRESS_REDIR:
 	case TCA_INGRESS_MIRROR:
-		return AT_INGRESS;
+		return true;
 	default:
 		BUG();
 	}
@@ -170,7 +168,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	int retval, err = 0;
 	int m_eaction;
 	int mac_len;
-	u32 at;
 
 	tcf_lastuse_update(&m->tcf_tm);
 	bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
@@ -191,7 +188,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 		goto out;
 	}
 
-	at = G_TC_AT(skb->tc_verd);
 	skb2 = skb_clone(skb, GFP_ATOMIC);
 	if (!skb2)
 		goto out;
@@ -200,8 +196,9 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	 * and devices expect a mac header on xmit, then mac push/pull is
 	 * needed.
 	 */
-	if (at != tcf_mirred_act_direction(m_eaction) && m_mac_header_xmit) {
-		if (at & AT_EGRESS) {
+	if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) &&
+	    m_mac_header_xmit) {
+		if (!skb_at_tc_ingress(skb)) {
 			/* caught at egress, act ingress: pull mac */
 			mac_len = skb_network_header(skb) - skb_mac_header(skb);
 			skb_pull_rcsum(skb2, mac_len);
@@ -212,12 +209,14 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	}
 
 	/* mirror is always swallowed */
-	if (tcf_mirred_is_act_redirect(m_eaction))
-		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
+	if (tcf_mirred_is_act_redirect(m_eaction)) {
+		skb2->tc_redirected = 1;
+		skb2->tc_from_ingress = skb2->tc_at_ingress;
+	}
 
 	skb2->skb_iif = skb->dev->ifindex;
 	skb2->dev = dev;
-	if (tcf_mirred_act_direction(m_eaction) & AT_EGRESS)
+	if (!tcf_mirred_act_wants_ingress(m_eaction))
 		err = dev_queue_xmit(skb2);
 	else
 		err = netif_receive_skb(skb2);
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index b27c4daec88f..c1310472f620 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -22,6 +22,7 @@
 #include <net/pkt_sched.h>
 #include <linux/tc_act/tc_pedit.h>
 #include <net/tc_act/tc_pedit.h>
+#include <uapi/linux/tc_act/tc_pedit.h>
 
 #define PEDIT_TAB_MASK	15
 
@@ -30,18 +31,117 @@ static struct tc_action_ops act_pedit_ops;
 
 static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
 	[TCA_PEDIT_PARMS]	= { .len = sizeof(struct tc_pedit) },
+	[TCA_PEDIT_KEYS_EX]   = { .type = NLA_NESTED },
 };
 
+static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = {
+	[TCA_PEDIT_KEY_EX_HTYPE]  = { .type = NLA_U16 },
+	[TCA_PEDIT_KEY_EX_CMD]	  = { .type = NLA_U16 },
+};
+
+static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
+							u8 n)
+{
+	struct tcf_pedit_key_ex *keys_ex;
+	struct tcf_pedit_key_ex *k;
+	const struct nlattr *ka;
+	int err = -EINVAL;
+	int rem;
+
+	if (!nla || !n)
+		return NULL;
+
+	keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
+	if (!keys_ex)
+		return ERR_PTR(-ENOMEM);
+
+	k = keys_ex;
+
+	nla_for_each_nested(ka, nla, rem) {
+		struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1];
+
+		if (!n) {
+			err = -EINVAL;
+			goto err_out;
+		}
+		n--;
+
+		if (nla_type(ka) != TCA_PEDIT_KEY_EX) {
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka,
+				       pedit_key_ex_policy);
+		if (err)
+			goto err_out;
+
+		if (!tb[TCA_PEDIT_KEY_EX_HTYPE] ||
+		    !tb[TCA_PEDIT_KEY_EX_CMD]) {
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]);
+		k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]);
+
+		if (k->htype > TCA_PEDIT_HDR_TYPE_MAX ||
+		    k->cmd > TCA_PEDIT_CMD_MAX) {
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		k++;
+	}
+
+	if (n)
+		goto err_out;
+
+	return keys_ex;
+
+err_out:
+	kfree(keys_ex);
+	return ERR_PTR(err);
+}
+
+static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
+				 struct tcf_pedit_key_ex *keys_ex, int n)
+{
+	struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX);
+
+	for (; n > 0; n--) {
+		struct nlattr *key_start;
+
+		key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
+
+		if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) ||
+		    nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) {
+			nlmsg_trim(skb, keys_start);
+			return -EINVAL;
+		}
+
+		nla_nest_end(skb, key_start);
+
+		keys_ex++;
+	}
+
+	nla_nest_end(skb, keys_start);
+
+	return 0;
+}
+
 static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 			  struct nlattr *est, struct tc_action **a,
 			  int ovr, int bind)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 	struct nlattr *tb[TCA_PEDIT_MAX + 1];
+	struct nlattr *pattr;
 	struct tc_pedit *parm;
 	int ret = 0, err;
 	struct tcf_pedit *p;
 	struct tc_pedit_key *keys = NULL;
+	struct tcf_pedit_key_ex *keys_ex;
 	int ksize;
 
 	if (nla == NULL)
@@ -51,13 +151,21 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	if (err < 0)
 		return err;
 
-	if (tb[TCA_PEDIT_PARMS] == NULL)
+	pattr = tb[TCA_PEDIT_PARMS];
+	if (!pattr)
+		pattr = tb[TCA_PEDIT_PARMS_EX];
+	if (!pattr)
 		return -EINVAL;
-	parm = nla_data(tb[TCA_PEDIT_PARMS]);
+
+	parm = nla_data(pattr);
 	ksize = parm->nkeys * sizeof(struct tc_pedit_key);
-	if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
+	if (nla_len(pattr) < sizeof(*parm) + ksize)
 		return -EINVAL;
 
+	keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
+	if (IS_ERR(keys_ex))
+		return PTR_ERR(keys_ex);
+
 	if (!tcf_hash_check(tn, parm->index, a, bind)) {
 		if (!parm->nkeys)
 			return -EINVAL;
@@ -69,6 +177,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 		keys = kmalloc(ksize, GFP_KERNEL);
 		if (keys == NULL) {
 			tcf_hash_cleanup(*a, est);
+			kfree(keys_ex);
 			return -ENOMEM;
 		}
 		ret = ACT_P_CREATED;
@@ -81,8 +190,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 		p = to_pedit(*a);
 		if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
 			keys = kmalloc(ksize, GFP_KERNEL);
-			if (keys == NULL)
+			if (!keys) {
+				kfree(keys_ex);
 				return -ENOMEM;
+			}
 		}
 	}
 
@@ -95,6 +206,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 		p->tcfp_nkeys = parm->nkeys;
 	}
 	memcpy(p->tcfp_keys, parm->keys, ksize);
+
+	kfree(p->tcfp_keys_ex);
+	p->tcfp_keys_ex = keys_ex;
+
 	spin_unlock_bh(&p->tcf_lock);
 	if (ret == ACT_P_CREATED)
 		tcf_hash_insert(tn, *a);
@@ -106,6 +221,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind)
 	struct tcf_pedit *p = to_pedit(a);
 	struct tc_pedit_key *keys = p->tcfp_keys;
 	kfree(keys);
+	kfree(p->tcfp_keys_ex);
 }
 
 static bool offset_valid(struct sk_buff *skb, int offset)
@@ -119,38 +235,88 @@ static bool offset_valid(struct sk_buff *skb, int offset)
 	return true;
 }
 
+static int pedit_skb_hdr_offset(struct sk_buff *skb,
+				enum pedit_header_type htype, int *hoffset)
+{
+	int ret = -EINVAL;
+
+	switch (htype) {
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH:
+		if (skb_mac_header_was_set(skb)) {
+			*hoffset = skb_mac_offset(skb);
+			ret = 0;
+		}
+		break;
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK:
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6:
+		*hoffset = skb_network_offset(skb);
+		ret = 0;
+		break;
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP:
+	case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP:
+		if (skb_transport_header_was_set(skb)) {
+			*hoffset = skb_transport_offset(skb);
+			ret = 0;
+		}
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	};
+
+	return ret;
+}
+
 static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 		     struct tcf_result *res)
 {
 	struct tcf_pedit *p = to_pedit(a);
 	int i;
-	unsigned int off;
 
 	if (skb_unclone(skb, GFP_ATOMIC))
 		return p->tcf_action;
 
-	off = skb_network_offset(skb);
-
 	spin_lock(&p->tcf_lock);
 
 	tcf_lastuse_update(&p->tcf_tm);
 
 	if (p->tcfp_nkeys > 0) {
 		struct tc_pedit_key *tkey = p->tcfp_keys;
+		struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
+		enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+		enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
 
 		for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
 			u32 *ptr, _data;
 			int offset = tkey->off;
+			int hoffset;
+			u32 val;
+			int rc;
+
+			if (tkey_ex) {
+				htype = tkey_ex->htype;
+				cmd = tkey_ex->cmd;
+
+				tkey_ex++;
+			}
+
+			rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
+			if (rc) {
+				pr_info("tc filter pedit bad header type specified (0x%x)\n",
+					htype);
+				goto bad;
+			}
 
 			if (tkey->offmask) {
 				char *d, _d;
 
-				if (!offset_valid(skb, off + tkey->at)) {
+				if (!offset_valid(skb, hoffset + tkey->at)) {
 					pr_info("tc filter pedit 'at' offset %d out of bounds\n",
-						off + tkey->at);
+						hoffset + tkey->at);
 					goto bad;
 				}
-				d = skb_header_pointer(skb, off + tkey->at, 1,
+				d = skb_header_pointer(skb, hoffset + tkey->at, 1,
 						       &_d);
 				if (!d)
 					goto bad;
@@ -163,19 +329,32 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 				goto bad;
 			}
 
-			if (!offset_valid(skb, off + offset)) {
+			if (!offset_valid(skb, hoffset + offset)) {
 				pr_info("tc filter pedit offset %d out of bounds\n",
-					offset);
+					hoffset + offset);
 				goto bad;
 			}
 
-			ptr = skb_header_pointer(skb, off + offset, 4, &_data);
+			ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data);
 			if (!ptr)
 				goto bad;
 			/* just do it, baby */
-			*ptr = ((*ptr & tkey->mask) ^ tkey->val);
+			switch (cmd) {
+			case TCA_PEDIT_KEY_EX_CMD_SET:
+				val = tkey->val;
+				break;
+			case TCA_PEDIT_KEY_EX_CMD_ADD:
+				val = (*ptr + tkey->val) & ~tkey->mask;
+				break;
+			default:
+				pr_info("tc filter pedit bad command (%d)\n",
+					cmd);
+				goto bad;
+			}
+
+			*ptr = ((*ptr & tkey->mask) ^ val);
 			if (ptr == &_data)
-				skb_store_bits(skb, off + offset, ptr, 4);
+				skb_store_bits(skb, hoffset + offset, ptr, 4);
 		}
 
 		goto done;
@@ -215,8 +394,15 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
 	opt->refcnt = p->tcf_refcnt - ref;
 	opt->bindcnt = p->tcf_bindcnt - bind;
 
-	if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
-		goto nla_put_failure;
+	if (p->tcfp_keys_ex) {
+		tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys);
+
+		if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt))
+			goto nla_put_failure;
+	} else {
+		if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
+			goto nla_put_failure;
+	}
 
 	tcf_tm_dump(&t, &p->tcf_tm);
 	if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
new file mode 100644
index 000000000000..0b8217b4763f
--- /dev/null
+++ b/net/sched/act_sample.c
@@ -0,0 +1,276 @@
+/*
+ * net/sched/act_sample.c - Packet sampling tc action
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_sample.h>
+#include <net/tc_act/tc_sample.h>
+#include <net/psample.h>
+
+#include <linux/if_arp.h>
+
+#define SAMPLE_TAB_MASK     7
+static unsigned int sample_net_id;
+static struct tc_action_ops act_sample_ops;
+
+static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
+	[TCA_SAMPLE_PARMS]		= { .len = sizeof(struct tc_sample) },
+	[TCA_SAMPLE_RATE]		= { .type = NLA_U32 },
+	[TCA_SAMPLE_TRUNC_SIZE]		= { .type = NLA_U32 },
+	[TCA_SAMPLE_PSAMPLE_GROUP]	= { .type = NLA_U32 },
+};
+
+static int tcf_sample_init(struct net *net, struct nlattr *nla,
+			   struct nlattr *est, struct tc_action **a, int ovr,
+			   int bind)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+	struct nlattr *tb[TCA_SAMPLE_MAX + 1];
+	struct psample_group *psample_group;
+	struct tc_sample *parm;
+	struct tcf_sample *s;
+	bool exists = false;
+	int ret;
+
+	if (!nla)
+		return -EINVAL;
+	ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy);
+	if (ret < 0)
+		return ret;
+	if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] ||
+	    !tb[TCA_SAMPLE_PSAMPLE_GROUP])
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_SAMPLE_PARMS]);
+
+	exists = tcf_hash_check(tn, parm->index, a, bind);
+	if (exists && bind)
+		return 0;
+
+	if (!exists) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      &act_sample_ops, bind, false);
+		if (ret)
+			return ret;
+		ret = ACT_P_CREATED;
+	} else {
+		tcf_hash_release(*a, bind);
+		if (!ovr)
+			return -EEXIST;
+	}
+	s = to_sample(*a);
+
+	s->tcf_action = parm->action;
+	s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
+	s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
+	psample_group = psample_group_get(net, s->psample_group_num);
+	if (!psample_group) {
+		if (ret == ACT_P_CREATED)
+			tcf_hash_release(*a, bind);
+		return -ENOMEM;
+	}
+	RCU_INIT_POINTER(s->psample_group, psample_group);
+
+	if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
+		s->truncate = true;
+		s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
+	}
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, *a);
+	return ret;
+}
+
+static void tcf_sample_cleanup_rcu(struct rcu_head *rcu)
+{
+	struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu);
+	struct psample_group *psample_group;
+
+	psample_group = rcu_dereference_protected(s->psample_group, 1);
+	RCU_INIT_POINTER(s->psample_group, NULL);
+	psample_group_put(psample_group);
+}
+
+static void tcf_sample_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_sample *s = to_sample(a);
+
+	call_rcu(&s->rcu, tcf_sample_cleanup_rcu);
+}
+
+static bool tcf_sample_dev_ok_push(struct net_device *dev)
+{
+	switch (dev->type) {
+	case ARPHRD_TUNNEL:
+	case ARPHRD_TUNNEL6:
+	case ARPHRD_SIT:
+	case ARPHRD_IPGRE:
+	case ARPHRD_VOID:
+	case ARPHRD_NONE:
+		return false;
+	default:
+		return true;
+	}
+}
+
+static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_sample *s = to_sample(a);
+	struct psample_group *psample_group;
+	int retval;
+	int size;
+	int iif;
+	int oif;
+
+	tcf_lastuse_update(&s->tcf_tm);
+	bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
+	retval = READ_ONCE(s->tcf_action);
+
+	rcu_read_lock();
+	psample_group = rcu_dereference(s->psample_group);
+
+	/* randomly sample packets according to rate */
+	if (psample_group && (prandom_u32() % s->rate == 0)) {
+		if (!skb_at_tc_ingress(skb)) {
+			iif = skb->skb_iif;
+			oif = skb->dev->ifindex;
+		} else {
+			iif = skb->dev->ifindex;
+			oif = 0;
+		}
+
+		/* on ingress, the mac header gets popped, so push it back */
+		if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
+			skb_push(skb, skb->mac_len);
+
+		size = s->truncate ? s->trunc_size : skb->len;
+		psample_sample_packet(psample_group, skb, size, iif, oif,
+				      s->rate);
+
+		if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
+			skb_pull(skb, skb->mac_len);
+	}
+
+	rcu_read_unlock();
+	return retval;
+}
+
+static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
+			   int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_sample *s = to_sample(a);
+	struct tc_sample opt = {
+		.index      = s->tcf_index,
+		.action     = s->tcf_action,
+		.refcnt     = s->tcf_refcnt - ref,
+		.bindcnt    = s->tcf_bindcnt - bind,
+	};
+	struct tcf_t t;
+
+	if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	tcf_tm_dump(&t, &s->tcf_tm);
+	if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate))
+		goto nla_put_failure;
+
+	if (s->truncate)
+		if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size))
+			goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
+		goto nla_put_failure;
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
+			     struct netlink_callback *cb, int type,
+			     const struct tc_action_ops *ops)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_sample_ops = {
+	.kind	  = "sample",
+	.type	  = TCA_ACT_SAMPLE,
+	.owner	  = THIS_MODULE,
+	.act	  = tcf_sample_act,
+	.dump	  = tcf_sample_dump,
+	.init	  = tcf_sample_init,
+	.cleanup  = tcf_sample_cleanup,
+	.walk	  = tcf_sample_walker,
+	.lookup	  = tcf_sample_search,
+	.size	  = sizeof(struct tcf_sample),
+};
+
+static __net_init int sample_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	return tc_action_net_init(tn, &act_sample_ops, SAMPLE_TAB_MASK);
+}
+
+static void __net_exit sample_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations sample_net_ops = {
+	.init = sample_init_net,
+	.exit = sample_exit_net,
+	.id   = &sample_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init sample_init_module(void)
+{
+	return tcf_register_action(&act_sample_ops, &sample_net_ops);
+}
+
+static void __exit sample_cleanup_module(void)
+{
+	tcf_unregister_action(&act_sample_ops, &sample_net_ops);
+}
+
+module_init(sample_init_module);
+module_exit(sample_cleanup_module);
+
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("Packet sampling action");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1ecdf809b5fa..732f7cae459d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/errno.h>
+#include <linux/err.h>
 #include <linux/skbuff.h>
 #include <linux/init.h>
 #include <linux/kmod.h>
@@ -38,14 +39,14 @@ static DEFINE_RWLOCK(cls_mod_lock);
 
 /* Find classifier type by string name */
 
-static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
+static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
 {
 	const struct tcf_proto_ops *t, *res = NULL;
 
 	if (kind) {
 		read_lock(&cls_mod_lock);
 		list_for_each_entry(t, &tcf_proto_base, head) {
-			if (nla_strcmp(kind, t->kind) == 0) {
+			if (strcmp(kind, t->kind) == 0) {
 				if (try_module_get(t->owner))
 					res = t;
 				break;
@@ -127,6 +128,77 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
 	return first;
 }
 
+static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
+					  u32 prio, u32 parent, struct Qdisc *q)
+{
+	struct tcf_proto *tp;
+	int err;
+
+	tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+	if (!tp)
+		return ERR_PTR(-ENOBUFS);
+
+	err = -ENOENT;
+	tp->ops = tcf_proto_lookup_ops(kind);
+	if (!tp->ops) {
+#ifdef CONFIG_MODULES
+		rtnl_unlock();
+		request_module("cls_%s", kind);
+		rtnl_lock();
+		tp->ops = tcf_proto_lookup_ops(kind);
+		/* We dropped the RTNL semaphore in order to perform
+		 * the module load. So, even if we succeeded in loading
+		 * the module we have to replay the request. We indicate
+		 * this using -EAGAIN.
+		 */
+		if (tp->ops) {
+			module_put(tp->ops->owner);
+			err = -EAGAIN;
+		} else {
+			err = -ENOENT;
+		}
+		goto errout;
+#endif
+	}
+	tp->classify = tp->ops->classify;
+	tp->protocol = protocol;
+	tp->prio = prio;
+	tp->classid = parent;
+	tp->q = q;
+
+	err = tp->ops->init(tp);
+	if (err) {
+		module_put(tp->ops->owner);
+		goto errout;
+	}
+	return tp;
+
+errout:
+	kfree(tp);
+	return ERR_PTR(err);
+}
+
+static bool tcf_proto_destroy(struct tcf_proto *tp, bool force)
+{
+	if (tp->ops->destroy(tp, force)) {
+		module_put(tp->ops->owner);
+		kfree_rcu(tp, rcu);
+		return true;
+	}
+	return false;
+}
+
+void tcf_destroy_chain(struct tcf_proto __rcu **fl)
+{
+	struct tcf_proto *tp;
+
+	while ((tp = rtnl_dereference(*fl)) != NULL) {
+		RCU_INIT_POINTER(*fl, tp->next);
+		tcf_proto_destroy(tp, true);
+	}
+}
+EXPORT_SYMBOL(tcf_destroy_chain);
+
 /* Add/change/delete/get a filter node */
 
 static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
@@ -142,8 +214,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 	struct Qdisc  *q;
 	struct tcf_proto __rcu **back;
 	struct tcf_proto __rcu **chain;
+	struct tcf_proto *next;
 	struct tcf_proto *tp;
-	const struct tcf_proto_ops *tp_ops;
 	const struct Qdisc_class_ops *cops;
 	unsigned long cl;
 	unsigned long fh;
@@ -222,9 +294,10 @@ replay:
 
 	/* And the last stroke */
 	chain = cops->tcf_chain(q, cl);
-	err = -EINVAL;
-	if (chain == NULL)
+	if (chain == NULL) {
+		err = -EINVAL;
 		goto errout;
+	}
 	if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
 		tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
 		tcf_destroy_chain(chain);
@@ -239,10 +312,13 @@ replay:
 		if (tp->prio >= prio) {
 			if (tp->prio == prio) {
 				if (!nprio ||
-				    (tp->protocol != protocol && protocol))
+				    (tp->protocol != protocol && protocol)) {
+					err = -EINVAL;
 					goto errout;
-			} else
+				}
+			} else {
 				tp = NULL;
+			}
 			break;
 		}
 	}
@@ -250,109 +326,69 @@ replay:
 	if (tp == NULL) {
 		/* Proto-tcf does not exist, create new one */
 
-		if (tca[TCA_KIND] == NULL || !protocol)
+		if (tca[TCA_KIND] == NULL || !protocol) {
+			err = -EINVAL;
 			goto errout;
+		}
 
-		err = -ENOENT;
 		if (n->nlmsg_type != RTM_NEWTFILTER ||
-		    !(n->nlmsg_flags & NLM_F_CREATE))
+		    !(n->nlmsg_flags & NLM_F_CREATE)) {
+			err = -ENOENT;
 			goto errout;
+		}
 
+		if (!nprio)
+			nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
 
-		/* Create new proto tcf */
-
-		err = -ENOBUFS;
-		tp = kzalloc(sizeof(*tp), GFP_KERNEL);
-		if (tp == NULL)
-			goto errout;
-		err = -ENOENT;
-		tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
-		if (tp_ops == NULL) {
-#ifdef CONFIG_MODULES
-			struct nlattr *kind = tca[TCA_KIND];
-			char name[IFNAMSIZ];
-
-			if (kind != NULL &&
-			    nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
-				rtnl_unlock();
-				request_module("cls_%s", name);
-				rtnl_lock();
-				tp_ops = tcf_proto_lookup_ops(kind);
-				/* We dropped the RTNL semaphore in order to
-				 * perform the module load.  So, even if we
-				 * succeeded in loading the module we have to
-				 * replay the request.  We indicate this using
-				 * -EAGAIN.
-				 */
-				if (tp_ops != NULL) {
-					module_put(tp_ops->owner);
-					err = -EAGAIN;
-				}
-			}
-#endif
-			kfree(tp);
-			goto errout;
-		}
-		tp->ops = tp_ops;
-		tp->protocol = protocol;
-		tp->prio = nprio ? :
-			       TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
-		tp->q = q;
-		tp->classify = tp_ops->classify;
-		tp->classid = parent;
-
-		err = tp_ops->init(tp);
-		if (err != 0) {
-			module_put(tp_ops->owner);
-			kfree(tp);
+		tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
+				      protocol, nprio, parent, q);
+		if (IS_ERR(tp)) {
+			err = PTR_ERR(tp);
 			goto errout;
 		}
-
 		tp_created = 1;
-
-	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
+	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+		err = -EINVAL;
 		goto errout;
+	}
 
 	fh = tp->ops->get(tp, t->tcm_handle);
 
 	if (fh == 0) {
 		if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
-			struct tcf_proto *next = rtnl_dereference(tp->next);
-
+			next = rtnl_dereference(tp->next);
 			RCU_INIT_POINTER(*back, next);
-
 			tfilter_notify(net, skb, n, tp, fh,
 				       RTM_DELTFILTER, false);
-			tcf_destroy(tp, true);
+			tcf_proto_destroy(tp, true);
 			err = 0;
 			goto errout;
 		}
 
-		err = -ENOENT;
 		if (n->nlmsg_type != RTM_NEWTFILTER ||
-		    !(n->nlmsg_flags & NLM_F_CREATE))
+		    !(n->nlmsg_flags & NLM_F_CREATE)) {
+			err = -ENOENT;
 			goto errout;
+		}
 	} else {
 		switch (n->nlmsg_type) {
 		case RTM_NEWTFILTER:
-			err = -EEXIST;
 			if (n->nlmsg_flags & NLM_F_EXCL) {
 				if (tp_created)
-					tcf_destroy(tp, true);
+					tcf_proto_destroy(tp, true);
+				err = -EEXIST;
 				goto errout;
 			}
 			break;
 		case RTM_DELTFILTER:
 			err = tp->ops->delete(tp, fh);
-			if (err == 0) {
-				struct tcf_proto *next = rtnl_dereference(tp->next);
-
-				tfilter_notify(net, skb, n, tp,
-					       t->tcm_handle,
-					       RTM_DELTFILTER, false);
-				if (tcf_destroy(tp, false))
-					RCU_INIT_POINTER(*back, next);
-			}
+			if (err)
+				goto errout;
+			next = rtnl_dereference(tp->next);
+			tfilter_notify(net, skb, n, tp, t->tcm_handle,
+				       RTM_DELTFILTER, false);
+			if (tcf_proto_destroy(tp, false))
+				RCU_INIT_POINTER(*back, next);
 			goto errout;
 		case RTM_GETTFILTER:
 			err = tfilter_notify(net, skb, n, tp, fh,
@@ -374,7 +410,7 @@ replay:
 		tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
 	} else {
 		if (tp_created)
-			tcf_destroy(tp, true);
+			tcf_proto_destroy(tp, true);
 	}
 
 errout:
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index d9c97018317d..80f688436dd7 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -148,6 +148,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_bpf_offload bpf_offload = {};
 	struct tc_to_netdev offload;
+	int err;
 
 	offload.type = TC_SETUP_CLSBPF;
 	offload.cls_bpf = &bpf_offload;
@@ -159,8 +160,13 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	bpf_offload.exts_integrated = prog->exts_integrated;
 	bpf_offload.gen_flags = prog->gen_flags;
 
-	return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
-					     tp->protocol, &offload);
+	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					    tp->protocol, &offload);
+
+	if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE))
+		prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;
+
+	return err;
 }
 
 static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
@@ -511,6 +517,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 		return ret;
 	}
 
+	if (!tc_in_hw(prog->gen_flags))
+		prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
 	if (oldprog) {
 		list_replace_rcu(&oldprog->link, &prog->link);
 		tcf_unbind_filter(tp, &oldprog->res);
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 6575aba87630..3d6b9286c203 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -129,7 +129,7 @@ static u32 flow_get_mark(const struct sk_buff *skb)
 static u32 flow_get_nfct(const struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	return addr_fold(skb->nfct);
+	return addr_fold(skb_nfct(skb));
 #else
 	return 0;
 #endif
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 5752789acc13..9d0c99d2e9fb 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -40,6 +40,7 @@ struct fl_flow_key {
 	};
 	struct flow_dissector_key_ports tp;
 	struct flow_dissector_key_icmp icmp;
+	struct flow_dissector_key_arp arp;
 	struct flow_dissector_key_keyid enc_key_id;
 	union {
 		struct flow_dissector_key_ipv4_addrs enc_ipv4;
@@ -133,6 +134,14 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
 	memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
 }
 
+static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head,
+				       struct fl_flow_key *mkey)
+{
+	return rhashtable_lookup_fast(&head->ht,
+				      fl_key_get_start(mkey, &head->mask),
+				      head->ht_params);
+}
+
 static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		       struct tcf_result *res)
 {
@@ -180,9 +189,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 	fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
 
-	f = rhashtable_lookup_fast(&head->ht,
-				   fl_key_get_start(&skb_mkey, &head->mask),
-				   head->ht_params);
+	f = fl_lookup(head, &skb_mkey);
 	if (f && !tc_skip_sw(f->flags)) {
 		*res = f->res;
 		return tcf_exts_exec(skb, &f->exts, res);
@@ -222,6 +229,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 		return;
 
 	offload.command = TC_CLSFLOWER_DESTROY;
+	offload.prio = tp->prio;
 	offload.cookie = (unsigned long)f;
 
 	tc->type = TC_SETUP_CLSFLOWER;
@@ -253,6 +261,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	}
 
 	offload.command = TC_CLSFLOWER_REPLACE;
+	offload.prio = tp->prio;
 	offload.cookie = (unsigned long)f;
 	offload.dissector = dissector;
 	offload.mask = mask;
@@ -264,6 +273,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 
 	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
 					    tc);
+	if (!err)
+		f->flags |= TCA_CLS_FLAGS_IN_HW;
 
 	if (tc_skip_sw(f->flags))
 		return err;
@@ -280,6 +291,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 		return;
 
 	offload.command = TC_CLSFLOWER_STATS;
+	offload.prio = tp->prio;
 	offload.cookie = (unsigned long)f;
 	offload.exts = &f->exts;
 
@@ -401,6 +413,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 	[TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_ICMPV6_CODE]	= { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ARP_SIP]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ARP_SIP_MASK]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ARP_TIP]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ARP_TIP_MASK]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ARP_OP]		= { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ARP_OP_MASK]	= { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ARP_SHA]	= { .len = ETH_ALEN },
+	[TCA_FLOWER_KEY_ARP_SHA_MASK]	= { .len = ETH_ALEN },
+	[TCA_FLOWER_KEY_ARP_THA]	= { .len = ETH_ALEN },
+	[TCA_FLOWER_KEY_ARP_THA_MASK]	= { .len = ETH_ALEN },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -572,6 +594,23 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 			       &mask->icmp.code,
 			       TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
 			       sizeof(key->icmp.code));
+	} else if (key->basic.n_proto == htons(ETH_P_ARP) ||
+		   key->basic.n_proto == htons(ETH_P_RARP)) {
+		fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP,
+			       &mask->arp.sip, TCA_FLOWER_KEY_ARP_SIP_MASK,
+			       sizeof(key->arp.sip));
+		fl_set_key_val(tb, &key->arp.tip, TCA_FLOWER_KEY_ARP_TIP,
+			       &mask->arp.tip, TCA_FLOWER_KEY_ARP_TIP_MASK,
+			       sizeof(key->arp.tip));
+		fl_set_key_val(tb, &key->arp.op, TCA_FLOWER_KEY_ARP_OP,
+			       &mask->arp.op, TCA_FLOWER_KEY_ARP_OP_MASK,
+			       sizeof(key->arp.op));
+		fl_set_key_val(tb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
+			       mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
+			       sizeof(key->arp.sha));
+		fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
+			       mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
+			       sizeof(key->arp.tha));
 	}
 
 	if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
@@ -689,6 +728,8 @@ static void fl_init_dissector(struct cls_fl_head *head,
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
 			     FLOW_DISSECTOR_KEY_ICMP, icmp);
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+			     FLOW_DISSECTOR_KEY_ARP, arp);
+	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
 			     FLOW_DISSECTOR_KEY_VLAN, vlan);
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
 			     FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
@@ -796,23 +837,31 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
 	struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg;
 	struct cls_fl_filter *fnew;
-	struct nlattr *tb[TCA_FLOWER_MAX + 1];
+	struct nlattr **tb;
 	struct fl_flow_mask mask = {};
 	int err;
 
 	if (!tca[TCA_OPTIONS])
 		return -EINVAL;
 
+	tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
+	if (!tb)
+		return -ENOBUFS;
+
 	err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy);
 	if (err < 0)
-		return err;
+		goto errout_tb;
 
-	if (fold && handle && fold->handle != handle)
-		return -EINVAL;
+	if (fold && handle && fold->handle != handle) {
+		err = -EINVAL;
+		goto errout_tb;
+	}
 
 	fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
-	if (!fnew)
-		return -ENOBUFS;
+	if (!fnew) {
+		err = -ENOBUFS;
+		goto errout_tb;
+	}
 
 	err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
 	if (err < 0)
@@ -845,6 +894,11 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		goto errout;
 
 	if (!tc_skip_sw(fnew->flags)) {
+		if (!fold && fl_lookup(head, &fnew->mkey)) {
+			err = -EEXIST;
+			goto errout;
+		}
+
 		err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
 					     head->ht_params);
 		if (err)
@@ -860,6 +914,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 			goto errout;
 	}
 
+	if (!tc_in_hw(fnew->flags))
+		fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
 	if (fold) {
 		if (!tc_skip_sw(fold->flags))
 			rhashtable_remove_fast(&head->ht, &fold->ht_node,
@@ -878,11 +935,14 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		list_add_tail_rcu(&fnew->list, &head->filters);
 	}
 
+	kfree(tb);
 	return 0;
 
 errout:
 	tcf_exts_destroy(&fnew->exts);
 	kfree(fnew);
+errout_tb:
+	kfree(tb);
 	return err;
 }
 
@@ -1112,6 +1172,27 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 				  TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
 				  sizeof(key->icmp.code))))
 		goto nla_put_failure;
+	else if ((key->basic.n_proto == htons(ETH_P_ARP) ||
+		  key->basic.n_proto == htons(ETH_P_RARP)) &&
+		 (fl_dump_key_val(skb, &key->arp.sip,
+				  TCA_FLOWER_KEY_ARP_SIP, &mask->arp.sip,
+				  TCA_FLOWER_KEY_ARP_SIP_MASK,
+				  sizeof(key->arp.sip)) ||
+		  fl_dump_key_val(skb, &key->arp.tip,
+				  TCA_FLOWER_KEY_ARP_TIP, &mask->arp.tip,
+				  TCA_FLOWER_KEY_ARP_TIP_MASK,
+				  sizeof(key->arp.tip)) ||
+		  fl_dump_key_val(skb, &key->arp.op,
+				  TCA_FLOWER_KEY_ARP_OP, &mask->arp.op,
+				  TCA_FLOWER_KEY_ARP_OP_MASK,
+				  sizeof(key->arp.op)) ||
+		  fl_dump_key_val(skb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
+				  mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
+				  sizeof(key->arp.sha)) ||
+		  fl_dump_key_val(skb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
+				  mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
+				  sizeof(key->arp.tha))))
+		goto nla_put_failure;
 
 	if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
 	    (fl_dump_key_val(skb, &key->enc_ipv4.src,
@@ -1153,7 +1234,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 	if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
 		goto nla_put_failure;
 
-	nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags);
+	if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
+		goto nla_put_failure;
 
 	if (tcf_exts_dump(skb, &f->exts))
 		goto nla_put_failure;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index b12bc2abea93..224eb2c14346 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -56,6 +56,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_to_netdev offload;
 	struct tc_cls_matchall_offload mall_offload = {0};
+	int err;
 
 	offload.type = TC_SETUP_MATCHALL;
 	offload.cls_mall = &mall_offload;
@@ -63,8 +64,12 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 	offload.cls_mall->exts = &head->exts;
 	offload.cls_mall->cookie = cookie;
 
-	return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
-					     &offload);
+	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
+					    &offload);
+	if (!err)
+		head->flags |= TCA_CLS_FLAGS_IN_HW;
+
+	return err;
 }
 
 static void mall_destroy_hw_filter(struct tcf_proto *tp,
@@ -118,10 +123,12 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp,
 	struct tcf_exts e;
 	int err;
 
-	tcf_exts_init(&e, TCA_MATCHALL_ACT, 0);
+	err = tcf_exts_init(&e, TCA_MATCHALL_ACT, 0);
+	if (err)
+		return err;
 	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
 	if (err < 0)
-		return err;
+		goto errout;
 
 	if (tb[TCA_MATCHALL_CLASSID]) {
 		head->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
@@ -131,6 +138,9 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp,
 	tcf_exts_change(tp, &head->exts, &e);
 
 	return 0;
+errout:
+	tcf_exts_destroy(&e);
+	return err;
 }
 
 static int mall_change(struct net *net, struct sk_buff *in_skb,
@@ -166,7 +176,9 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 	if (!new)
 		return -ENOBUFS;
 
-	tcf_exts_init(&new->exts, TCA_MATCHALL_ACT, 0);
+	err = tcf_exts_init(&new->exts, TCA_MATCHALL_ACT, 0);
+	if (err)
+		goto err_exts_init;
 
 	if (!handle)
 		handle = 1;
@@ -175,25 +187,31 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 
 	err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr);
 	if (err)
-		goto errout;
+		goto err_set_parms;
 
 	if (tc_should_offload(dev, tp, flags)) {
 		err = mall_replace_hw_filter(tp, new, (unsigned long) new);
 		if (err) {
 			if (tc_skip_sw(flags))
-				goto errout;
+				goto err_replace_hw_filter;
 			else
 				err = 0;
 		}
 	}
 
+	if (!tc_in_hw(new->flags))
+		new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
 	*arg = (unsigned long) head;
 	rcu_assign_pointer(tp->root, new);
 	if (head)
 		call_rcu(&head->rcu, mall_destroy_rcu);
 	return 0;
 
-errout:
+err_replace_hw_filter:
+err_set_parms:
+	tcf_exts_destroy(&new->exts);
+err_exts_init:
 	kfree(new);
 	return err;
 }
@@ -234,6 +252,9 @@ static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 	    nla_put_u32(skb, TCA_MATCHALL_CLASSID, head->res.classid))
 		goto nla_put_failure;
 
+	if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags))
+		goto nla_put_failure;
+
 	if (tcf_exts_dump(skb, &head->exts))
 		goto nla_put_failure;
 
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index ae83c3aec308..4dbe0c680fe6 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -334,7 +334,6 @@ static int u32_init(struct tcf_proto *tp)
 	if (root_ht == NULL)
 		return -ENOBUFS;
 
-	root_ht->divisor = 0;
 	root_ht->refcnt++;
 	root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
 	root_ht->prio = tp->prio;
@@ -524,6 +523,10 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 
 	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
 					    tp->protocol, &offload);
+
+	if (!err)
+		n->flags |= TCA_CLS_FLAGS_IN_HW;
+
 	if (tc_skip_sw(flags))
 		return err;
 
@@ -896,6 +899,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			return err;
 		}
 
+		if (!tc_in_hw(new->flags))
+			new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
 		u32_replace_knode(tp, tp_c, new);
 		tcf_unbind_filter(tp, &n->res);
 		call_rcu(&n->rcu, u32_delete_key_rcu);
@@ -1015,6 +1021,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		if (err)
 			goto errhw;
 
+		if (!tc_in_hw(n->flags))
+			n->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
 		ins = &ht->ht[TC_U32_HASH(handle)];
 		for (pins = rtnl_dereference(*ins); pins;
 		     ins = &pins->next, pins = rtnl_dereference(*ins))
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d7b93429f0cc..bcf49cd22786 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -440,7 +440,6 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab)
 EXPORT_SYMBOL(qdisc_put_rtab);
 
 static LIST_HEAD(qdisc_stab_list);
-static DEFINE_SPINLOCK(qdisc_stab_lock);
 
 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
@@ -474,20 +473,15 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 	if (tsize != s->tsize || (!tab && tsize > 0))
 		return ERR_PTR(-EINVAL);
 
-	spin_lock(&qdisc_stab_lock);
-
 	list_for_each_entry(stab, &qdisc_stab_list, list) {
 		if (memcmp(&stab->szopts, s, sizeof(*s)))
 			continue;
 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 			continue;
 		stab->refcnt++;
-		spin_unlock(&qdisc_stab_lock);
 		return stab;
 	}
 
-	spin_unlock(&qdisc_stab_lock);
-
 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 	if (!stab)
 		return ERR_PTR(-ENOMEM);
@@ -497,9 +491,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 	if (tsize > 0)
 		memcpy(stab->data, tab, tsize * sizeof(u16));
 
-	spin_lock(&qdisc_stab_lock);
 	list_add_tail(&stab->list, &qdisc_stab_list);
-	spin_unlock(&qdisc_stab_lock);
 
 	return stab;
 }
@@ -514,14 +506,10 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
 	if (!tab)
 		return;
 
-	spin_lock(&qdisc_stab_lock);
-
 	if (--tab->refcnt == 0) {
 		list_del(&tab->list);
 		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 	}
-
-	spin_unlock(&qdisc_stab_lock);
 }
 EXPORT_SYMBOL(qdisc_put_stab);
 
@@ -1019,6 +1007,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 
 		return sch;
 	}
+	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
+	ops->destroy(sch);
 err_out3:
 	dev_put(dev);
 	kfree((char *) sch - sch->padded);
@@ -1861,6 +1851,7 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 {
 	__be16 protocol = tc_skb_protocol(skb);
 #ifdef CONFIG_NET_CLS_ACT
+	const int max_reclassify_loop = 4;
 	const struct tcf_proto *old_tp = tp;
 	int limit = 0;
 
@@ -1885,7 +1876,7 @@ reclassify:
 	return TC_ACT_UNSPEC; /* signal: continue lookup */
 #ifdef CONFIG_NET_CLS_ACT
 reset:
-	if (unlikely(limit++ >= MAX_REC_LOOP)) {
+	if (unlikely(limit++ >= max_reclassify_loop)) {
 		net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
 				       tp->q->ops->id, tp->prio & 0xffff,
 				       ntohs(tp->protocol));
@@ -1899,28 +1890,6 @@ reset:
 }
 EXPORT_SYMBOL(tc_classify);
 
-bool tcf_destroy(struct tcf_proto *tp, bool force)
-{
-	if (tp->ops->destroy(tp, force)) {
-		module_put(tp->ops->owner);
-		kfree_rcu(tp, rcu);
-		return true;
-	}
-
-	return false;
-}
-
-void tcf_destroy_chain(struct tcf_proto __rcu **fl)
-{
-	struct tcf_proto *tp;
-
-	while ((tp = rtnl_dereference(*fl)) != NULL) {
-		RCU_INIT_POINTER(*fl, tp->next);
-		tcf_destroy(tp, true);
-	}
-}
-EXPORT_SYMBOL(tcf_destroy_chain);
-
 #ifdef CONFIG_PROC_FS
 static int psched_show(struct seq_file *seq, void *v)
 {
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 481e4f12aeb4..2209c2ddacbf 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -15,6 +15,7 @@
 #include <linux/file.h>		/* for fput */
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 
 /*
  * The ATM queuing discipline provides a framework for invoking classifiers
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index f1207582cbf3..d6ca18dc04c3 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -19,6 +19,7 @@
 #include <linux/skbuff.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 
 
 /*	Class-Based Queueing (CBQ) algorithm.
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 3b6d5bd69101..3b86a97bc67c 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -16,6 +16,7 @@
 #include <linux/skbuff.h>
 #include <linux/vmalloc.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/inet_ecn.h>
 #include <net/red.h>
 #include <net/flow_dissector.h>
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 1308bbf460f7..802ac7c2e5e8 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -13,6 +13,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/bitops.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/dsfield.h>
 #include <net/inet_ecn.h>
 #include <asm/byteorder.h>
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index a5ea0e9b6be4..9f3a884d1590 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -23,6 +23,7 @@
 #include <linux/vmalloc.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/codel.h>
 #include <net/codel_impl.h>
 #include <net/codel_qdisc.h>
@@ -57,7 +58,6 @@ struct fq_codel_sched_data {
 	struct fq_codel_flow *flows;	/* Flows table [flows_cnt] */
 	u32		*backlogs;	/* backlog table [flows_cnt] */
 	u32		flows_cnt;	/* number of flows */
-	u32		perturbation;	/* hash perturbation */
 	u32		quantum;	/* psched_mtu(qdisc_dev(sch)); */
 	u32		drop_batch_size;
 	u32		memory_limit;
@@ -75,9 +75,7 @@ struct fq_codel_sched_data {
 static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
 				  struct sk_buff *skb)
 {
-	u32 hash = skb_get_hash_perturb(skb, q->perturbation);
-
-	return reciprocal_scale(hash, q->flows_cnt);
+	return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
 }
 
 static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -482,7 +480,6 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
 	q->memory_limit = 32 << 20; /* 32 MBytes */
 	q->drop_batch_size = 64;
 	q->quantum = psched_mtu(qdisc_dev(sch));
-	q->perturbation = prandom_u32();
 	INIT_LIST_HEAD(&q->new_flows);
 	INIT_LIST_HEAD(&q->old_flows);
 	codel_params_init(&q->cparams);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e88519..b052b27a984e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 
 void __qdisc_run(struct Qdisc *q)
 {
-	int quota = weight_p;
+	int quota = dev_tx_weight;
 	int packets;
 
 	while (qdisc_restart(q, &packets)) {
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index e3d0458af17b..2fae8b5f1b80 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -627,7 +627,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
 			q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
 						      sizeof(u32));
 			if (!q->hhf_arrays[i]) {
-				hhf_destroy(sch);
+				/* Note: hhf_destroy() will be called
+				 * by our caller.
+				 */
 				return -ENOMEM;
 			}
 		}
@@ -638,7 +640,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
 			q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
 							  BITS_PER_BYTE);
 			if (!q->hhf_valid_bits[i]) {
-				hhf_destroy(sch);
+				/* Note: hhf_destroy() will be called
+				 * by our caller.
+				 */
 				return -ENOMEM;
 			}
 		}
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 760f39e7caee..4cd5fb134bc9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -40,6 +40,7 @@
 #include <net/netlink.h>
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 
 /* HTB algorithm.
     Author: devik@cdi.cz
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 8fe6999b642a..3bab5f66c392 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -16,6 +16,7 @@
 
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 
 static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
 {
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 2bc8d7f8df16..20b7f1646f69 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -52,7 +52,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 	/* pre-allocate qdiscs, attachment can't fail */
 	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
 			       GFP_KERNEL);
-	if (priv->qdiscs == NULL)
+	if (!priv->qdiscs)
 		return -ENOMEM;
 
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
@@ -60,18 +60,14 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 		qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(ntx + 1)));
-		if (qdisc == NULL)
-			goto err;
+		if (!qdisc)
+			return -ENOMEM;
 		priv->qdiscs[ntx] = qdisc;
 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
 	}
 
 	sch->flags |= TCQ_F_MQROOT;
 	return 0;
-
-err:
-	mq_destroy(sch);
-	return -ENOMEM;
 }
 
 static void mq_attach(struct Qdisc *sch)
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index b5c502c78143..922683418e53 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -118,10 +118,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	/* pre-allocate qdisc, attachment can't fail */
 	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
 			       GFP_KERNEL);
-	if (priv->qdiscs == NULL) {
-		err = -ENOMEM;
-		goto err;
-	}
+	if (!priv->qdiscs)
+		return -ENOMEM;
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		dev_queue = netdev_get_tx_queue(dev, i);
@@ -129,10 +127,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 					  get_default_qdisc_ops(dev, i),
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(i + 1)));
-		if (qdisc == NULL) {
-			err = -ENOMEM;
-			goto err;
-		}
+		if (!qdisc)
+			return -ENOMEM;
+
 		priv->qdiscs[i] = qdisc;
 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
 	}
@@ -148,7 +145,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 		priv->hw_owned = 1;
 		err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
 		if (err)
-			goto err;
+			return err;
 	} else {
 		netdev_set_num_tc(dev, qopt->num_tc);
 		for (i = 0; i < qopt->num_tc; i++)
@@ -162,10 +159,6 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 
 	sch->flags |= TCQ_F_MQROOT;
 	return 0;
-
-err:
-	mqprio_destroy(sch);
-	return err;
 }
 
 static void mqprio_attach(struct Qdisc *sch)
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 9ffbb025b37e..e7839a0d0eaa 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -25,7 +25,7 @@
 #include <linux/skbuff.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
-
+#include <net/pkt_cls.h>
 
 struct multiq_sched_data {
 	u16 bands;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bcfadfdea8e0..c8bb62a1e744 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -626,7 +626,7 @@ deliver:
 			 * If it's at ingress let's pretend the delay is
 			 * from the network (tstamp will be updated).
 			 */
-			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
+			if (skb->tc_redirected && skb->tc_from_ingress)
 				skb->tstamp = 0;
 #endif
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 8f575899adfa..d4d7db267b6e 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -20,7 +20,7 @@
 #include <linux/skbuff.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
-
+#include <net/pkt_cls.h>
 
 struct prio_sched_data {
 	int bands;
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 20a350bd1b1d..fe6963d21519 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -25,6 +25,7 @@
 #include <linux/jhash.h>
 #include <net/ip.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/inet_ecn.h>
 
 /*
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 7f195ed4d568..42e8c8615e65 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -23,6 +23,7 @@
 #include <linux/vmalloc.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/red.h>
 
 
@@ -742,9 +743,10 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
 	if (!q->ht || !q->slots) {
-		sfq_destroy(sch);
+		/* Note: sfq_destroy() will be called by our caller */
 		return -ENOMEM;
 	}
+
 	for (i = 0; i < q->divisor; i++)
 		q->ht[i] = SFQ_EMPTY_SLOT;
 
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index b0196366d58d..9fe6b427afed 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -401,8 +401,8 @@ static int teql_master_close(struct net_device *dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
-						     struct rtnl_link_stats64 *stats)
+static void teql_master_stats64(struct net_device *dev,
+				struct rtnl_link_stats64 *stats)
 {
 	struct teql_master *m = netdev_priv(dev);
 
@@ -410,7 +410,6 @@ static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
 	stats->tx_bytes		= m->tx_bytes;
 	stats->tx_errors	= m->tx_errors;
 	stats->tx_dropped	= m->tx_dropped;
-	return stats;
 }
 
 static int teql_master_mtu(struct net_device *dev, int new_mtu)
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 6c4f7496cec6..70f1b570bab9 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -11,7 +11,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  transport.o chunk.o sm_make_chunk.o ulpevent.o \
 	  inqueue.o outqueue.o ulpqueue.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
-	  output.o input.o debug.o ssnmap.o auth.o \
+	  output.o input.o debug.o stream.o auth.o \
 	  offload.o
 
 sctp_probe-y := probe.o
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index d3cc30c25c41..2a6835b4562b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -207,6 +207,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * association to the same value as the initial TSN.
 	 */
 	asoc->addip_serial = asoc->c.initial_tsn;
+	asoc->strreset_outseq = asoc->c.initial_tsn;
 
 	INIT_LIST_HEAD(&asoc->addip_chunk_list);
 	INIT_LIST_HEAD(&asoc->asconf_ack_list);
@@ -269,6 +270,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 
 	asoc->active_key_id = ep->active_key_id;
 	asoc->prsctp_enable = ep->prsctp_enable;
+	asoc->reconf_enable = ep->reconf_enable;
+	asoc->strreset_enable = ep->strreset_enable;
 
 	/* Save the hmacs and chunks list into this association */
 	if (ep->auth_hmacs_list)
@@ -358,8 +361,11 @@ void sctp_association_free(struct sctp_association *asoc)
 
 	sctp_tsnmap_free(&asoc->peer.tsn_map);
 
-	/* Free ssnmap storage. */
-	sctp_ssnmap_free(asoc->ssnmap);
+	/* Free stream information. */
+	sctp_stream_free(asoc->stream);
+
+	if (asoc->strreset_chunk)
+		sctp_chunk_free(asoc->strreset_chunk);
 
 	/* Clean up the bound address list. */
 	sctp_bind_addr_free(&asoc->base.bind_addr);
@@ -519,6 +525,12 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
 	if (asoc->peer.last_data_from == peer)
 		asoc->peer.last_data_from = transport;
 
+	if (asoc->strreset_chunk &&
+	    asoc->strreset_chunk->transport == peer) {
+		asoc->strreset_chunk->transport = transport;
+		sctp_transport_reset_reconf_timer(transport);
+	}
+
 	/* If we remove the transport an INIT was last sent to, set it to
 	 * NULL. Combined with the update of the retran path above, this
 	 * will cause the next INIT to be sent to the next available
@@ -820,8 +832,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		if (transport->state != SCTP_UNCONFIRMED)
 			transport->state = SCTP_INACTIVE;
 		else {
-			dst_release(transport->dst);
-			transport->dst = NULL;
+			sctp_transport_dst_release(transport);
 			ulp_notify = false;
 		}
 
@@ -1137,7 +1148,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
 		/* Reinitialize SSN for both local streams
 		 * and peer's streams.
 		 */
-		sctp_ssnmap_clear(asoc->ssnmap);
+		sctp_stream_clear(asoc->stream);
 
 		/* Flush the ULP reassembly and ordered queue.
 		 * Any data there will now be stale and will
@@ -1162,10 +1173,9 @@ void sctp_assoc_update(struct sctp_association *asoc,
 
 		asoc->ctsn_ack_point = asoc->next_tsn - 1;
 		asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
-		if (!asoc->ssnmap) {
-			/* Move the ssnmap. */
-			asoc->ssnmap = new->ssnmap;
-			new->ssnmap = NULL;
+		if (!asoc->stream) {
+			asoc->stream = new->stream;
+			new->stream = NULL;
 		}
 
 		if (!asoc->assoc_id) {
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 615f0ddd41df..e3621cb4827f 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -165,14 +165,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 					    struct sctp_sndrcvinfo *sinfo,
 					    struct iov_iter *from)
 {
-	int max, whole, i, offset, over, err;
-	int len, first_len;
-	int max_data;
+	size_t len, first_len, max_data, remaining;
+	size_t msg_len = iov_iter_count(from);
+	struct list_head *pos, *temp;
 	struct sctp_chunk *chunk;
 	struct sctp_datamsg *msg;
-	struct list_head *pos, *temp;
-	size_t msg_len = iov_iter_count(from);
-	__u8 frag;
+	int err;
 
 	msg = sctp_datamsg_new(GFP_KERNEL);
 	if (!msg)
@@ -185,7 +183,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 	    (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) ||
 	     !SCTP_PR_POLICY(sinfo->sinfo_flags)))
 		msg->expires_at = jiffies +
-				    msecs_to_jiffies(sinfo->sinfo_timetolive);
+				  msecs_to_jiffies(sinfo->sinfo_timetolive);
 
 	/* This is the biggest possible DATA chunk that can fit into
 	 * the packet
@@ -195,7 +193,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 		   sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
 	max_data = SCTP_TRUNC4(max_data);
 
-	max = asoc->frag_point;
 	/* If the the peer requested that we authenticate DATA chunks
 	 * we need to account for bundling of the AUTH chunks along with
 	 * DATA.
@@ -208,12 +205,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 					      hmac_desc->hmac_len);
 	}
 
-	/* Now, check if we need to reduce our max */
-	if (max > max_data)
-		max = max_data;
+	/* Check what's our max considering the above */
+	max_data = min_t(size_t, max_data, asoc->frag_point);
 
-	whole = 0;
-	first_len = max;
+	/* Set first_len and then account for possible bundles on first frag */
+	first_len = max_data;
 
 	/* Check to see if we have a pending SACK and try to let it be bundled
 	 * with this message.  Do this if we don't have any data queued already.
@@ -224,40 +220,38 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 	if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) &&
 	    asoc->outqueue.out_qlen == 0 &&
 	    list_empty(&asoc->outqueue.retransmit) &&
-	    msg_len > max)
-		max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
+	    msg_len > max_data)
+		first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
 
 	/* Encourage Cookie-ECHO bundling. */
 	if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
-		max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
-
-	/* Now that we adjusted completely, reset first_len */
-	if (first_len > max_data)
-		first_len = max_data;
+		first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
 
 	/* Account for a different sized first fragment */
 	if (msg_len >= first_len) {
-		msg_len -= first_len;
-		whole = 1;
 		msg->can_delay = 0;
-	}
-
-	/* How many full sized?  How many bytes leftover? */
-	whole += msg_len / max;
-	over = msg_len % max;
-	offset = 0;
-
-	if ((whole > 1) || (whole && over))
 		SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
+	} else {
+		/* Which may be the only one... */
+		first_len = msg_len;
+	}
 
-	/* Create chunks for all the full sized DATA chunks. */
-	for (i = 0, len = first_len; i < whole; i++) {
-		frag = SCTP_DATA_MIDDLE_FRAG;
+	/* Create chunks for all DATA chunks. */
+	for (remaining = msg_len; remaining; remaining -= len) {
+		u8 frag = SCTP_DATA_MIDDLE_FRAG;
 
-		if (0 == i)
+		if (remaining == msg_len) {
+			/* First frag, which may also be the last */
 			frag |= SCTP_DATA_FIRST_FRAG;
+			len = first_len;
+		} else {
+			/* Middle frags */
+			len = max_data;
+		}
 
-		if ((i == (whole - 1)) && !over) {
+		if (len >= remaining) {
+			/* Last frag, which may also be the first */
+			len = remaining;
 			frag |= SCTP_DATA_LAST_FRAG;
 
 			/* The application requests to set the I-bit of the
@@ -271,7 +265,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 
 		chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
 						 0, GFP_KERNEL);
-
 		if (!chunk) {
 			err = -ENOMEM;
 			goto errout;
@@ -282,45 +275,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 			goto errout_chunk_free;
 
 		/* Put the chunk->skb back into the form expected by send.  */
-		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
-			   - (__u8 *)chunk->skb->data);
-
-		sctp_datamsg_assign(msg, chunk);
-		list_add_tail(&chunk->frag_list, &msg->chunks);
-
-		/* The first chunk, the first chunk was likely short
-		 * to allow bundling, so reset to full size.
-		 */
-		if (0 == i)
-			len = max;
-	}
-
-	/* .. now the leftover bytes. */
-	if (over) {
-		if (!whole)
-			frag = SCTP_DATA_NOT_FRAG;
-		else
-			frag = SCTP_DATA_LAST_FRAG;
-
-		if ((sinfo->sinfo_flags & SCTP_EOF) ||
-		    (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
-			frag |= SCTP_DATA_SACK_IMM;
-
-		chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag,
-						 0, GFP_KERNEL);
-
-		if (!chunk) {
-			err = -ENOMEM;
-			goto errout;
-		}
-
-		err = sctp_user_addto_chunk(chunk, over, from);
-
-		/* Put the chunk->skb back into the form expected by send.  */
-		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
-			   - (__u8 *)chunk->skb->data);
-		if (err < 0)
-			goto errout_chunk_free;
+		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr -
+				       chunk->skb->data);
 
 		sctp_datamsg_assign(msg, chunk);
 		list_add_tail(&chunk->frag_list, &msg->chunks);
@@ -338,6 +294,7 @@ errout:
 		sctp_chunk_free(chunk);
 	}
 	sctp_datamsg_put(msg);
+
 	return ERR_PTR(err);
 }
 
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index 95d7b15dad21..2e47eb2f05cb 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -159,6 +159,7 @@ static const char *const sctp_timer_tbl[] = {
 	"TIMEOUT_T4_RTO",
 	"TIMEOUT_T5_SHUTDOWN_GUARD",
 	"TIMEOUT_HEARTBEAT",
+	"TIMEOUT_RECONF",
 	"TIMEOUT_SACK",
 	"TIMEOUT_AUTOCLOSE",
 };
@@ -166,7 +167,9 @@ static const char *const sctp_timer_tbl[] = {
 /* Lookup timer debug name. */
 const char *sctp_tname(const sctp_subtype_t id)
 {
-	if (id.timeout <= SCTP_EVENT_TIMEOUT_MAX)
+	BUILD_BUG_ON(SCTP_EVENT_TIMEOUT_MAX + 1 != ARRAY_SIZE(sctp_timer_tbl));
+
+	if (id.timeout < ARRAY_SIZE(sctp_timer_tbl))
 		return sctp_timer_tbl[id.timeout];
 	return "unknown_timer";
 }
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 410ddc1e3443..8c589230794f 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -164,6 +164,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 	ep->auth_hmacs_list = auth_hmacs;
 	ep->auth_chunk_list = auth_chunks;
 	ep->prsctp_enable = net->sctp.prsctp_enable;
+	ep->reconf_enable = net->sctp.reconf_enable;
 
 	return ep;
 
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 458e506ef84b..fc458968fe4b 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -872,6 +872,8 @@ void sctp_transport_hashtable_destroy(void)
 
 int sctp_hash_transport(struct sctp_transport *t)
 {
+	struct sctp_transport *transport;
+	struct rhlist_head *tmp, *list;
 	struct sctp_hash_cmp_arg arg;
 	int err;
 
@@ -882,8 +884,19 @@ int sctp_hash_transport(struct sctp_transport *t)
 	arg.paddr = &t->ipaddr;
 	arg.lport = htons(t->asoc->base.bind_addr.port);
 
+	list = rhltable_lookup(&sctp_transport_hashtable, &arg,
+			       sctp_hash_params);
+
+	rhl_for_each_entry_rcu(transport, tmp, list, node)
+		if (transport->asoc->ep == t->asoc->ep) {
+			err = -EEXIST;
+			goto out;
+		}
+
 	err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
 				  &t->node, sctp_hash_params);
+
+out:
 	if (err)
 		pr_err_once("insert transport fail, errno %d\n", err);
 
@@ -1229,13 +1242,26 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
 	struct sctp_association *asoc;
 
 	asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
+	if (asoc)
+		goto out;
 
 	/* Further lookup for INIT/INIT-ACK packets.
 	 * SCTP Implementors Guide, 2.18 Handling of address
 	 * parameters within the INIT or INIT-ACK.
 	 */
-	if (!asoc)
-		asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp);
+	asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp);
+	if (asoc)
+		goto out;
+
+	if (paddr->sa.sa_family == AF_INET)
+		pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n",
+			 &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port),
+			 &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port));
+	else
+		pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n",
+			 &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port),
+			 &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port));
 
+out:
 	return asoc;
 }
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 64dfd35ccdcc..063baac5b9fe 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -413,22 +413,20 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist,
 static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb,
 			     int is_saddr)
 {
-	__be16 *port;
-	struct sctphdr *sh;
+	/* Always called on head skb, so this is safe */
+	struct sctphdr *sh = sctp_hdr(skb);
+	struct sockaddr_in6 *sa = &addr->v6;
 
-	port = &addr->v6.sin6_port;
 	addr->v6.sin6_family = AF_INET6;
 	addr->v6.sin6_flowinfo = 0; /* FIXME */
 	addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif;
 
-	/* Always called on head skb, so this is safe */
-	sh = sctp_hdr(skb);
 	if (is_saddr) {
-		*port  = sh->source;
-		addr->v6.sin6_addr = ipv6_hdr(skb)->saddr;
+		sa->sin6_port = sh->source;
+		sa->sin6_addr = ipv6_hdr(skb)->saddr;
 	} else {
-		*port = sh->dest;
-		addr->v6.sin6_addr = ipv6_hdr(skb)->daddr;
+		sa->sin6_port = sh->dest;
+		sa->sin6_addr = ipv6_hdr(skb)->daddr;
 	}
 }
 
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 40e7fac96c41..105ac3327b28 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -51,7 +51,6 @@ SCTP_DBG_OBJCNT(bind_addr);
 SCTP_DBG_OBJCNT(bind_bucket);
 SCTP_DBG_OBJCNT(chunk);
 SCTP_DBG_OBJCNT(addr);
-SCTP_DBG_OBJCNT(ssnmap);
 SCTP_DBG_OBJCNT(datamsg);
 SCTP_DBG_OBJCNT(keys);
 
@@ -67,7 +66,6 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
 	SCTP_DBG_OBJCNT_ENTRY(bind_addr),
 	SCTP_DBG_OBJCNT_ENTRY(bind_bucket),
 	SCTP_DBG_OBJCNT_ENTRY(addr),
-	SCTP_DBG_OBJCNT_ENTRY(ssnmap),
 	SCTP_DBG_OBJCNT_ENTRY(datamsg),
 	SCTP_DBG_OBJCNT_ENTRY(keys),
 };
diff --git a/net/sctp/output.c b/net/sctp/output.c
index f5320a87341e..71ce6b945dcb 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -81,8 +81,8 @@ static void sctp_packet_reset(struct sctp_packet *packet)
 /* Config a packet.
  * This appears to be a followup set of initializations.
  */
-struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
-				       __u32 vtag, int ecn_capable)
+void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
+			int ecn_capable)
 {
 	struct sctp_transport *tp = packet->transport;
 	struct sctp_association *asoc = tp->asoc;
@@ -123,14 +123,12 @@ struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
 		if (chunk)
 			sctp_packet_append_chunk(packet, chunk);
 	}
-
-	return packet;
 }
 
 /* Initialize the packet structure. */
-struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
-				     struct sctp_transport *transport,
-				     __u16 sport, __u16 dport)
+void sctp_packet_init(struct sctp_packet *packet,
+		      struct sctp_transport *transport,
+		      __u16 sport, __u16 dport)
 {
 	struct sctp_association *asoc = transport->asoc;
 	size_t overhead;
@@ -151,8 +149,6 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
 	packet->overhead = overhead;
 	sctp_packet_reset(packet);
 	packet->vtag = 0;
-
-	return packet;
 }
 
 /* Free a packet.  */
@@ -181,7 +177,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
 {
 	sctp_xmit_t retval;
 
-	pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__,
+	pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__,
 		 packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1);
 
 	switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
@@ -550,6 +546,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 	struct sctp_association *asoc = tp->asoc;
 	struct sctp_chunk *chunk, *tmp;
 	int pkt_count, gso = 0;
+	int confirm;
 	struct dst_entry *dst;
 	struct sk_buff *head;
 	struct sctphdr *sh;
@@ -628,7 +625,14 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 			asoc->peer.last_sent_to = tp;
 	}
 	head->ignore_df = packet->ipfragok;
-	tp->af_specific->sctp_xmit(head, tp);
+	confirm = tp->dst_pending_confirm;
+	if (confirm)
+		skb_set_dst_pending_confirm(head, 1);
+	/* neighbour should be confirmed on successful transmission or
+	 * positive error
+	 */
+	if (tp->af_specific->sctp_xmit(head, tp) >= 0 && confirm)
+		tp->dst_pending_confirm = 0;
 
 out:
 	list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
@@ -700,18 +704,15 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
 	 * unacknowledged.
 	 */
 
-	if (sctp_sk(asoc->base.sk)->nodelay)
-		/* Nagle disabled */
+	if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) &&
+	    !chunk->msg->force_delay)
+		/* Nothing unacked */
 		return SCTP_XMIT_OK;
 
 	if (!sctp_packet_empty(packet))
 		/* Append to packet */
 		return SCTP_XMIT_OK;
 
-	if (inflight == 0)
-		/* Nothing unacked */
-		return SCTP_XMIT_OK;
-
 	if (!sctp_state(asoc, ESTABLISHED))
 		return SCTP_XMIT_OK;
 
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 34efaa4ef2f6..db352e5d61f8 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -915,22 +915,28 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		case SCTP_CID_ECN_ECNE:
 		case SCTP_CID_ASCONF:
 		case SCTP_CID_FWD_TSN:
+		case SCTP_CID_RECONF:
 			status = sctp_packet_transmit_chunk(packet, chunk,
 							    one_packet, gfp);
 			if (status  != SCTP_XMIT_OK) {
 				/* put the chunk back */
 				list_add(&chunk->list, &q->control_chunk_list);
-			} else {
-				asoc->stats.octrlchunks++;
-				/* PR-SCTP C5) If a FORWARD TSN is sent, the
-				 * sender MUST assure that at least one T3-rtx
-				 * timer is running.
-				 */
-				if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
-					sctp_transport_reset_t3_rtx(transport);
-					transport->last_time_sent = jiffies;
-				}
+				break;
+			}
+
+			asoc->stats.octrlchunks++;
+			/* PR-SCTP C5) If a FORWARD TSN is sent, the
+			 * sender MUST assure that at least one T3-rtx
+			 * timer is running.
+			 */
+			if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
+				sctp_transport_reset_t3_rtx(transport);
+				transport->last_time_sent = jiffies;
 			}
+
+			if (chunk == asoc->strreset_chunk)
+				sctp_transport_reset_reconf_timer(transport);
+
 			break;
 
 		default:
@@ -1016,6 +1022,8 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 
 		/* Finally, transmit new packets.  */
 		while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+			__u32 sid = ntohs(chunk->subh.data_hdr->stream);
+
 			/* RFC 2960 6.5 Every DATA chunk MUST carry a valid
 			 * stream identifier.
 			 */
@@ -1038,6 +1046,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 				continue;
 			}
 
+			if (asoc->stream->out[sid].state == SCTP_STREAM_CLOSED) {
+				sctp_outq_head_data(q, chunk);
+				goto sctp_flush_out;
+			}
+
 			/* If there is a specified transport, use it.
 			 * Otherwise, we want to use the active path.
 			 */
@@ -1641,7 +1654,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 
 		if (forward_progress) {
 			if (transport->dst)
-				dst_confirm(transport->dst);
+				sctp_transport_dst_confirm(transport);
 		}
 	}
 
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index ab8d9f96a177..f0553a022859 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -211,3 +211,6 @@ DECLARE_PRIMITIVE(REQUESTHEARTBEAT);
 */
 
 DECLARE_PRIMITIVE(ASCONF);
+
+/* RE-CONFIG 5.1 */
+DECLARE_PRIMITIVE(RECONF);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 616a9428e0c4..1b6d4574d2b0 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -199,6 +199,7 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
 			      sctp_scope_t scope, gfp_t gfp, int copy_flags)
 {
 	struct sctp_sockaddr_entry *addr;
+	union sctp_addr laddr;
 	int error = 0;
 
 	rcu_read_lock();
@@ -220,7 +221,10 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
 		     !(copy_flags & SCTP_ADDR6_PEERSUPP)))
 			continue;
 
-		if (sctp_bind_addr_state(bp, &addr->a) != -1)
+		laddr = addr->a;
+		/* also works for setting ipv6 address port */
+		laddr.v4.sin_port = htons(bp->port);
+		if (sctp_bind_addr_state(bp, &laddr) != -1)
 			continue;
 
 		error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a),
@@ -237,23 +241,19 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
 static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
 			     int is_saddr)
 {
-	void *from;
-	__be16 *port;
-	struct sctphdr *sh;
+	/* Always called on head skb, so this is safe */
+	struct sctphdr *sh = sctp_hdr(skb);
+	struct sockaddr_in *sa = &addr->v4;
 
-	port = &addr->v4.sin_port;
 	addr->v4.sin_family = AF_INET;
 
-	/* Always called on head skb, so this is safe */
-	sh = sctp_hdr(skb);
 	if (is_saddr) {
-		*port  = sh->source;
-		from = &ip_hdr(skb)->saddr;
+		sa->sin_port = sh->source;
+		sa->sin_addr.s_addr = ip_hdr(skb)->saddr;
 	} else {
-		*port = sh->dest;
-		from = &ip_hdr(skb)->daddr;
+		sa->sin_port = sh->dest;
+		sa->sin_addr.s_addr = ip_hdr(skb)->daddr;
 	}
-	memcpy(&addr->v4.sin_addr.s_addr, from, sizeof(struct in_addr));
 }
 
 /* Initialize an sctp_addr from a socket. */
@@ -1262,6 +1262,9 @@ static int __net_init sctp_defaults_init(struct net *net)
 	/* Enable PR-SCTP by default. */
 	net->sctp.prsctp_enable = 1;
 
+	/* Disable RECONF by default. */
+	net->sctp.reconf_enable = 0;
+
 	/* Disable AUTH by default. */
 	net->sctp.auth_enable = 0;
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9e9690b7afe1..969a30c7bb54 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -270,6 +270,11 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 		num_ext += 2;
 	}
 
+	if (asoc->reconf_enable) {
+		extensions[num_ext] = SCTP_CID_RECONF;
+		num_ext += 1;
+	}
+
 	if (sp->adaptation_ind)
 		chunksize += sizeof(aiparam);
 
@@ -434,6 +439,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 		num_ext += 2;
 	}
 
+	if (asoc->peer.reconf_capable) {
+		extensions[num_ext] = SCTP_CID_RECONF;
+		num_ext += 1;
+	}
+
 	if (sp->adaptation_ind)
 		chunksize += sizeof(aiparam);
 
@@ -1536,7 +1546,7 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
 
 	/* All fragments will be on the same stream */
 	sid = ntohs(chunk->subh.data_hdr->stream);
-	stream = &chunk->asoc->ssnmap->out;
+	stream = chunk->asoc->stream;
 
 	/* Now assign the sequence number to the entire message.
 	 * All fragments must have the same stream sequence number.
@@ -1547,9 +1557,9 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
 			ssn = 0;
 		} else {
 			if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
-				ssn = sctp_ssn_next(stream, sid);
+				ssn = sctp_ssn_next(stream, out, sid);
 			else
-				ssn = sctp_ssn_peek(stream, sid);
+				ssn = sctp_ssn_peek(stream, out, sid);
 		}
 
 		lchunk->subh.data_hdr->ssn = htons(ssn);
@@ -1844,6 +1854,7 @@ no_hmac:
 	retval->next_tsn = retval->c.initial_tsn;
 	retval->ctsn_ack_point = retval->next_tsn - 1;
 	retval->addip_serial = retval->c.initial_tsn;
+	retval->strreset_outseq = retval->c.initial_tsn;
 	retval->adv_peer_ack_point = retval->ctsn_ack_point;
 	retval->peer.prsctp_capable = retval->c.prsctp_capable;
 	retval->peer.adaptation_ind = retval->c.adaptation_ind;
@@ -2011,6 +2022,11 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
 
 	for (i = 0; i < num_ext; i++) {
 		switch (param.ext->chunks[i]) {
+		case SCTP_CID_RECONF:
+			if (asoc->reconf_enable &&
+			    !asoc->peer.reconf_capable)
+				asoc->peer.reconf_capable = 1;
+			break;
 		case SCTP_CID_FWD_TSN:
 			if (asoc->prsctp_enable && !asoc->peer.prsctp_capable)
 				asoc->peer.prsctp_capable = 1;
@@ -2387,6 +2403,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
 	asoc->peer.i.initial_tsn =
 		ntohl(peer_init->init_hdr.initial_tsn);
 
+	asoc->strreset_inseq = asoc->peer.i.initial_tsn;
+
 	/* Apply the upper bounds for output streams based on peer's
 	 * number of inbound streams.
 	 */
@@ -2444,9 +2462,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
 	if (!asoc->temp) {
 		int error;
 
-		asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams,
+		asoc->stream = sctp_stream_new(asoc->c.sinit_max_instreams,
 					       asoc->c.sinit_num_ostreams, gfp);
-		if (!asoc->ssnmap)
+		if (!asoc->stream)
 			goto clean_up;
 
 		error = sctp_assoc_set_id(asoc, gfp);
@@ -3210,7 +3228,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
 	union sctp_params param;
 	sctp_addiphdr_t		*hdr;
 	union sctp_addr_param	*addr_param;
-	sctp_addip_param_t	*asconf_param;
 	struct sctp_chunk	*asconf_ack;
 	__be16	err_code;
 	int	length = 0;
@@ -3230,7 +3247,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
 	 * asconf parameter.
 	 */
 	length = ntohs(addr_param->p.length);
-	asconf_param = (void *)addr_param + length;
 	chunk_len -= length;
 
 	/* create an ASCONF_ACK chunk.
@@ -3317,8 +3333,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
 		local_bh_enable();
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 				transports) {
-			dst_release(transport->dst);
-			transport->dst = NULL;
+			sctp_transport_dst_release(transport);
 		}
 		break;
 	case SCTP_PARAM_DEL_IP:
@@ -3332,8 +3347,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
 		local_bh_enable();
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 				transports) {
-			dst_release(transport->dst);
-			transport->dst = NULL;
+			sctp_transport_dst_release(transport);
 		}
 		break;
 	default:
@@ -3526,3 +3540,323 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 
 	return retval;
 }
+
+/* RE-CONFIG 3.1 (RE-CONFIG chunk)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  | Type = 130    |  Chunk Flags  |      Chunk Length             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  \                                                               \
+ *  /                  Re-configuration Parameter                   /
+ *  \                                                               \
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  \                                                               \
+ *  /             Re-configuration Parameter (optional)             /
+ *  \                                                               \
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+static struct sctp_chunk *sctp_make_reconf(
+				const struct sctp_association *asoc,
+				int length)
+{
+	struct sctp_reconf_chunk *reconf;
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length,
+				   GFP_ATOMIC);
+	if (!retval)
+		return NULL;
+
+	reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr;
+	retval->param_hdr.v = reconf->params;
+
+	return retval;
+}
+
+/* RE-CONFIG 4.1 (STREAM OUT RESET)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 13       | Parameter Length = 16 + 2 * N |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |           Re-configuration Request Sequence Number            |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |           Re-configuration Response Sequence Number           |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                Sender's Last Assigned TSN                     |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number 1 (optional)   |    Stream Number 2 (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  /                            ......                             /
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number N-1 (optional) |    Stream Number N (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * RE-CONFIG 4.2 (STREAM IN RESET)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 14       |  Parameter Length = 8 + 2 * N |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |          Re-configuration Request Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number 1 (optional)   |    Stream Number 2 (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  /                            ......                             /
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number N-1 (optional) |    Stream Number N (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_req(
+				const struct sctp_association *asoc,
+				__u16 stream_num, __u16 *stream_list,
+				bool out, bool in)
+{
+	struct sctp_strreset_outreq outreq;
+	__u16 stream_len = stream_num * 2;
+	struct sctp_strreset_inreq inreq;
+	struct sctp_chunk *retval;
+	__u16 outlen, inlen;
+
+	outlen = (sizeof(outreq) + stream_len) * out;
+	inlen = (sizeof(inreq) + stream_len) * in;
+
+	retval = sctp_make_reconf(asoc, outlen + inlen);
+	if (!retval)
+		return NULL;
+
+	if (outlen) {
+		outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST;
+		outreq.param_hdr.length = htons(outlen);
+		outreq.request_seq = htonl(asoc->strreset_outseq);
+		outreq.response_seq = htonl(asoc->strreset_inseq - 1);
+		outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1);
+
+		sctp_addto_chunk(retval, sizeof(outreq), &outreq);
+
+		if (stream_len)
+			sctp_addto_chunk(retval, stream_len, stream_list);
+	}
+
+	if (inlen) {
+		inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST;
+		inreq.param_hdr.length = htons(inlen);
+		inreq.request_seq = htonl(asoc->strreset_outseq + out);
+
+		sctp_addto_chunk(retval, sizeof(inreq), &inreq);
+
+		if (stream_len)
+			sctp_addto_chunk(retval, stream_len, stream_list);
+	}
+
+	return retval;
+}
+
+/* RE-CONFIG 4.3 (SSN/TSN RESET ALL)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 15       |      Parameter Length = 8     |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |         Re-configuration Request Sequence Number              |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_tsnreq(
+				const struct sctp_association *asoc)
+{
+	struct sctp_strreset_tsnreq tsnreq;
+	__u16 length = sizeof(tsnreq);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, length);
+	if (!retval)
+		return NULL;
+
+	tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST;
+	tsnreq.param_hdr.length = htons(length);
+	tsnreq.request_seq = htonl(asoc->strreset_outseq);
+
+	sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq);
+
+	return retval;
+}
+
+/* RE-CONFIG 4.5/4.6 (ADD STREAM)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 17       |      Parameter Length = 12    |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |          Re-configuration Request Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |      Number of new streams    |         Reserved              |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_addstrm(
+				const struct sctp_association *asoc,
+				__u16 out, __u16 in)
+{
+	struct sctp_strreset_addstrm addstrm;
+	__u16 size = sizeof(addstrm);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, (!!out + !!in) * size);
+	if (!retval)
+		return NULL;
+
+	if (out) {
+		addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS;
+		addstrm.param_hdr.length = htons(size);
+		addstrm.number_of_streams = htons(out);
+		addstrm.request_seq = htonl(asoc->strreset_outseq);
+		addstrm.reserved = 0;
+
+		sctp_addto_chunk(retval, size, &addstrm);
+	}
+
+	if (in) {
+		addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS;
+		addstrm.param_hdr.length = htons(size);
+		addstrm.number_of_streams = htons(in);
+		addstrm.request_seq = htonl(asoc->strreset_outseq + !!out);
+		addstrm.reserved = 0;
+
+		sctp_addto_chunk(retval, size, &addstrm);
+	}
+
+	return retval;
+}
+
+/* RE-CONFIG 4.4 (RESP)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 16       |      Parameter Length         |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |         Re-configuration Response Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                            Result                             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_resp(
+				const struct sctp_association *asoc,
+				__u32 result, __u32 sn)
+{
+	struct sctp_strreset_resp resp;
+	__u16 length = sizeof(resp);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, length);
+	if (!retval)
+		return NULL;
+
+	resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
+	resp.param_hdr.length = htons(length);
+	resp.response_seq = htonl(sn);
+	resp.result = htonl(result);
+
+	sctp_addto_chunk(retval, sizeof(resp), &resp);
+
+	return retval;
+}
+
+/* RE-CONFIG 4.4 OPTIONAL (TSNRESP)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 16       |      Parameter Length         |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |         Re-configuration Response Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                            Result                             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                   Sender's Next TSN (optional)                |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                  Receiver's Next TSN (optional)               |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_tsnresp(
+					struct sctp_association *asoc,
+					__u32 result, __u32 sn,
+					__u32 sender_tsn, __u32 receiver_tsn)
+{
+	struct sctp_strreset_resptsn tsnresp;
+	__u16 length = sizeof(tsnresp);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, length);
+	if (!retval)
+		return NULL;
+
+	tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
+	tsnresp.param_hdr.length = htons(length);
+
+	tsnresp.response_seq = htonl(sn);
+	tsnresp.result = htonl(result);
+	tsnresp.senders_next_tsn = htonl(sender_tsn);
+	tsnresp.receivers_next_tsn = htonl(receiver_tsn);
+
+	sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp);
+
+	return retval;
+}
+
+bool sctp_verify_reconf(const struct sctp_association *asoc,
+			struct sctp_chunk *chunk,
+			struct sctp_paramhdr **errp)
+{
+	struct sctp_reconf_chunk *hdr;
+	union sctp_params param;
+	__u16 last = 0, cnt = 0;
+
+	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
+	sctp_walk_params(param, hdr, params) {
+		__u16 length = ntohs(param.p->length);
+
+		*errp = param.p;
+		if (cnt++ > 2)
+			return false;
+		switch (param.p->type) {
+		case SCTP_PARAM_RESET_OUT_REQUEST:
+			if (length < sizeof(struct sctp_strreset_outreq) ||
+			    (last && last != SCTP_PARAM_RESET_RESPONSE &&
+			     last != SCTP_PARAM_RESET_IN_REQUEST))
+				return false;
+			break;
+		case SCTP_PARAM_RESET_IN_REQUEST:
+			if (length < sizeof(struct sctp_strreset_inreq) ||
+			    (last && last != SCTP_PARAM_RESET_OUT_REQUEST))
+				return false;
+			break;
+		case SCTP_PARAM_RESET_RESPONSE:
+			if ((length != sizeof(struct sctp_strreset_resp) &&
+			     length != sizeof(struct sctp_strreset_resptsn)) ||
+			    (last && last != SCTP_PARAM_RESET_RESPONSE &&
+			     last != SCTP_PARAM_RESET_OUT_REQUEST))
+				return false;
+			break;
+		case SCTP_PARAM_RESET_TSN_REQUEST:
+			if (length !=
+			    sizeof(struct sctp_strreset_tsnreq) || last)
+				return false;
+			break;
+		case SCTP_PARAM_RESET_ADD_IN_STREAMS:
+			if (length != sizeof(struct sctp_strreset_addstrm) ||
+			    (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS))
+				return false;
+			break;
+		case SCTP_PARAM_RESET_ADD_OUT_STREAMS:
+			if (length != sizeof(struct sctp_strreset_addstrm) ||
+			    (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS))
+				return false;
+			break;
+		default:
+			return false;
+		}
+
+		last = param.p->type;
+	}
+
+	return true;
+}
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c345bf153bed..25384fa82ba9 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -436,6 +436,37 @@ out_unlock:
 	sctp_association_put(asoc);
 }
 
+ /* Handle the timeout of the RE-CONFIG timer. */
+void sctp_generate_reconf_event(unsigned long data)
+{
+	struct sctp_transport *transport = (struct sctp_transport *)data;
+	struct sctp_association *asoc = transport->asoc;
+	struct sock *sk = asoc->base.sk;
+	struct net *net = sock_net(sk);
+	int error = 0;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		pr_debug("%s: sock is busy\n", __func__);
+
+		/* Try again later.  */
+		if (!mod_timer(&transport->reconf_timer, jiffies + (HZ / 20)))
+			sctp_transport_hold(transport);
+		goto out_unlock;
+	}
+
+	error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
+			   SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF),
+			   asoc->state, asoc->ep, asoc,
+			   transport, GFP_ATOMIC);
+
+	if (error)
+		sk->sk_err = -error;
+
+out_unlock:
+	bh_unlock_sock(sk);
+	sctp_transport_put(transport);
+}
 
 /* Inject a SACK Timeout event into the state machine.  */
 static void sctp_generate_sack_event(unsigned long data)
@@ -453,6 +484,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
 	sctp_generate_t4_rto_event,
 	sctp_generate_t5_shutdown_guard_event,
 	NULL,
+	NULL,
 	sctp_generate_sack_event,
 	sctp_generate_autoclose_event,
 };
@@ -723,7 +755,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 	 * forward progress.
 	 */
 	if (t->dst)
-		dst_confirm(t->dst);
+		sctp_transport_dst_confirm(t);
 
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
@@ -840,6 +872,10 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
 		if (!sctp_style(sk, UDP))
 			sk->sk_state_change(sk);
 	}
+
+	if (sctp_state(asoc, SHUTDOWN_PENDING) &&
+	    !sctp_outq_is_empty(&asoc->outqueue))
+		sctp_outq_uncork(&asoc->outqueue, GFP_ATOMIC);
 }
 
 /* Helper function to delete an association. */
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8ec20a64a3f8..e03bb1aab4d0 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -160,23 +160,22 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net,
 /* Small helper function that checks if the chunk length
  * is of the appropriate length.  The 'required_length' argument
  * is set to be the size of a specific chunk we are testing.
- * Return Values:  1 = Valid length
- * 		   0 = Invalid length
+ * Return Values:  true  = Valid length
+ * 		   false = Invalid length
  *
  */
-static inline int
-sctp_chunk_length_valid(struct sctp_chunk *chunk,
-			   __u16 required_length)
+static inline bool
+sctp_chunk_length_valid(struct sctp_chunk *chunk, __u16 required_length)
 {
 	__u16 chunk_length = ntohs(chunk->chunk_hdr->length);
 
 	/* Previously already marked? */
 	if (unlikely(chunk->pdiscard))
-		return 0;
+		return false;
 	if (unlikely(chunk_length < required_length))
-		return 0;
+		return false;
 
-	return 1;
+	return true;
 }
 
 /**********************************************************
@@ -1022,6 +1021,34 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net,
 	return SCTP_DISPOSITION_CONSUME;
 }
 
+/* resend asoc strreset_chunk.  */
+sctp_disposition_t sctp_sf_send_reconf(struct net *net,
+				       const struct sctp_endpoint *ep,
+				       const struct sctp_association *asoc,
+				       const sctp_subtype_t type, void *arg,
+				       sctp_cmd_seq_t *commands)
+{
+	struct sctp_transport *transport = arg;
+
+	if (asoc->overall_error_count >= asoc->max_retrans) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ETIMEDOUT));
+		/* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+				SCTP_PERR(SCTP_ERROR_NO_ERROR));
+		SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+		SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+		return SCTP_DISPOSITION_DELETE_TCB;
+	}
+
+	sctp_chunk_hold(asoc->strreset_chunk);
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+			SCTP_CHUNK(asoc->strreset_chunk));
+	sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(transport));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
 /*
  * Process an heartbeat request.
  *
@@ -3237,36 +3264,34 @@ static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net,
 	struct sctp_chunk *abort;
 
 	packet = sctp_ootb_pkt_new(net, asoc, chunk);
+	if (!packet)
+		return SCTP_DISPOSITION_NOMEM;
 
-	if (packet) {
-		/* Make an ABORT. The T bit will be set if the asoc
-		 * is NULL.
-		 */
-		abort = sctp_make_abort(asoc, chunk, 0);
-		if (!abort) {
-			sctp_ootb_pkt_free(packet);
-			return SCTP_DISPOSITION_NOMEM;
-		}
-
-		/* Reflect vtag if T-Bit is set */
-		if (sctp_test_T_bit(abort))
-			packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+	/* Make an ABORT. The T bit will be set if the asoc
+	 * is NULL.
+	 */
+	abort = sctp_make_abort(asoc, chunk, 0);
+	if (!abort) {
+		sctp_ootb_pkt_free(packet);
+		return SCTP_DISPOSITION_NOMEM;
+	}
 
-		/* Set the skb to the belonging sock for accounting.  */
-		abort->skb->sk = ep->base.sk;
+	/* Reflect vtag if T-Bit is set */
+	if (sctp_test_T_bit(abort))
+		packet->vtag = ntohl(chunk->sctp_hdr->vtag);
 
-		sctp_packet_append_chunk(packet, abort);
+	/* Set the skb to the belonging sock for accounting.  */
+	abort->skb->sk = ep->base.sk;
 
-		sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
-				SCTP_PACKET(packet));
+	sctp_packet_append_chunk(packet, abort);
 
-		SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+	sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+			SCTP_PACKET(packet));
 
-		sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
-		return SCTP_DISPOSITION_CONSUME;
-	}
+	SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
 
-	return SCTP_DISPOSITION_NOMEM;
+	sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+	return SCTP_DISPOSITION_CONSUME;
 }
 
 /*
@@ -3503,45 +3528,43 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net,
 	struct sctp_chunk *shut;
 
 	packet = sctp_ootb_pkt_new(net, asoc, chunk);
+	if (!packet)
+		return SCTP_DISPOSITION_NOMEM;
 
-	if (packet) {
-		/* Make an SHUTDOWN_COMPLETE.
-		 * The T bit will be set if the asoc is NULL.
-		 */
-		shut = sctp_make_shutdown_complete(asoc, chunk);
-		if (!shut) {
-			sctp_ootb_pkt_free(packet);
-			return SCTP_DISPOSITION_NOMEM;
-		}
-
-		/* Reflect vtag if T-Bit is set */
-		if (sctp_test_T_bit(shut))
-			packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+	/* Make an SHUTDOWN_COMPLETE.
+	 * The T bit will be set if the asoc is NULL.
+	 */
+	shut = sctp_make_shutdown_complete(asoc, chunk);
+	if (!shut) {
+		sctp_ootb_pkt_free(packet);
+		return SCTP_DISPOSITION_NOMEM;
+	}
 
-		/* Set the skb to the belonging sock for accounting.  */
-		shut->skb->sk = ep->base.sk;
+	/* Reflect vtag if T-Bit is set */
+	if (sctp_test_T_bit(shut))
+		packet->vtag = ntohl(chunk->sctp_hdr->vtag);
 
-		sctp_packet_append_chunk(packet, shut);
+	/* Set the skb to the belonging sock for accounting.  */
+	shut->skb->sk = ep->base.sk;
 
-		sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
-				SCTP_PACKET(packet));
+	sctp_packet_append_chunk(packet, shut);
 
-		SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+	sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+			SCTP_PACKET(packet));
 
-		/* If the chunk length is invalid, we don't want to process
-		 * the reset of the packet.
-		 */
-		if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
-			return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+	SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
 
-		/* We need to discard the rest of the packet to prevent
-		 * potential bomming attacks from additional bundled chunks.
-		 * This is documented in SCTP Threats ID.
-		 */
+	/* If the chunk length is invalid, we don't want to process
+	 * the reset of the packet.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
-	}
 
-	return SCTP_DISPOSITION_NOMEM;
+	/* We need to discard the rest of the packet to prevent
+	 * potential bomming attacks from additional bundled chunks.
+	 * This is documented in SCTP Threats ID.
+	 */
+	return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 }
 
 /*
@@ -3811,6 +3834,60 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net,
 	return SCTP_DISPOSITION_DISCARD;
 }
 
+/* RE-CONFIG Section 5.2 Upon reception of an RECONF Chunk. */
+sctp_disposition_t sctp_sf_do_reconf(struct net *net,
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type, void *arg,
+				     sctp_cmd_seq_t *commands)
+{
+	struct sctp_paramhdr *err_param = NULL;
+	struct sctp_chunk *chunk = arg;
+	struct sctp_reconf_chunk *hdr;
+	union sctp_params param;
+
+	if (!sctp_vtag_verify(chunk, asoc)) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+				SCTP_NULL());
+		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+	}
+
+	/* Make sure that the RECONF chunk has a valid length.  */
+	if (!sctp_chunk_length_valid(chunk, sizeof(*hdr)))
+		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+						  commands);
+
+	if (!sctp_verify_reconf(asoc, chunk, &err_param))
+		return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
+						  (void *)err_param, commands);
+
+	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
+	sctp_walk_params(param, hdr, params) {
+		struct sctp_chunk *reply = NULL;
+		struct sctp_ulpevent *ev = NULL;
+
+		if (param.p->type == SCTP_PARAM_RESET_OUT_REQUEST)
+			reply = sctp_process_strreset_outreq(
+				(struct sctp_association *)asoc, param, &ev);
+		else if (param.p->type == SCTP_PARAM_RESET_IN_REQUEST)
+			reply = sctp_process_strreset_inreq(
+				(struct sctp_association *)asoc, param, &ev);
+		/* More handles for other types will be added here, by now it
+		 * just ignores other types.
+		 */
+
+		if (ev)
+			sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+					SCTP_ULPEVENT(ev));
+
+		if (reply)
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(reply));
+	}
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
 /*
  * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP
  *
@@ -3844,6 +3921,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net,
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 	}
 
+	if (!asoc->peer.prsctp_capable)
+		return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
+
 	/* Make sure that the FORWARD_TSN chunk has valid length.  */
 	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -3912,6 +3992,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast(
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 	}
 
+	if (!asoc->peer.prsctp_capable)
+		return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
+
 	/* Make sure that the FORWARD_TSN chunk has a valid length.  */
 	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -5162,6 +5245,19 @@ sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net,
 	return SCTP_DISPOSITION_CONSUME;
 }
 
+/* RE-CONFIG Section 5.1 RECONF Chunk Procedures */
+sctp_disposition_t sctp_sf_do_prm_reconf(struct net *net,
+					 const struct sctp_endpoint *ep,
+					 const struct sctp_association *asoc,
+					 const sctp_subtype_t type,
+					 void *arg, sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk));
+	return SCTP_DISPOSITION_CONSUME;
+}
+
 /*
  * Ignore the primitive event
  *
@@ -6036,8 +6132,9 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net,
 	sctp_transport_route(transport, (union sctp_addr *)&chunk->dest,
 			     sctp_sk(net->sctp.ctl_sock));
 
-	packet = sctp_packet_init(&transport->packet, transport, sport, dport);
-	packet = sctp_packet_config(packet, vtag, 0);
+	packet = &transport->packet;
+	sctp_packet_init(packet, transport, sport, dport);
+	sctp_packet_config(packet, vtag, 0);
 
 	return packet;
 
@@ -6278,9 +6375,8 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 	 * and is invalid.
 	 */
 	ssn = ntohs(data_hdr->ssn);
-	if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->ssnmap->in, sid))) {
+	if (ordered && SSN_lt(ssn, sctp_ssn_peek(asoc->stream, in, sid)))
 		return SCTP_IERROR_PROTO_VIOLATION;
-	}
 
 	/* Send the data up to the user.  Note:  Schedule  the
 	 * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index a987d54b379c..419b18ebb056 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -482,6 +482,32 @@ static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUN
 	TYPE_SCTP_FWD_TSN,
 }; /*state_fn_t prsctp_chunk_event_table[][] */
 
+#define TYPE_SCTP_RECONF { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_RECONF */
+
+/* The primary index for this table is the chunk type.
+ * The secondary index for this table is the state.
+ */
+static const sctp_sm_table_entry_t reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
+	TYPE_SCTP_RECONF,
+}; /*state_fn_t reconf_chunk_event_table[][] */
+
 #define TYPE_SCTP_AUTH { \
 	/* SCTP_STATE_CLOSED */ \
 	TYPE_SCTP_FUNC(sctp_sf_ootb), \
@@ -643,6 +669,25 @@ chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = {
 	TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
 } /* TYPE_SCTP_PRIMITIVE_ASCONF */
 
+#define TYPE_SCTP_PRIMITIVE_RECONF { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+} /* TYPE_SCTP_PRIMITIVE_RECONF */
+
 /* The primary index for this table is the primitive type.
  * The secondary index for this table is the state.
  */
@@ -653,6 +698,7 @@ static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPE
 	TYPE_SCTP_PRIMITIVE_SEND,
 	TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT,
 	TYPE_SCTP_PRIMITIVE_ASCONF,
+	TYPE_SCTP_PRIMITIVE_RECONF,
 };
 
 #define TYPE_SCTP_OTHER_NO_PENDING_TSN  { \
@@ -888,6 +934,25 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
 	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
 }
 
+#define TYPE_SCTP_EVENT_TIMEOUT_RECONF { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_send_reconf), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+}
+
 static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = {
 	TYPE_SCTP_EVENT_TIMEOUT_NONE,
 	TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE,
@@ -897,6 +962,7 @@ static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][S
 	TYPE_SCTP_EVENT_TIMEOUT_T4_RTO,
 	TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD,
 	TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT,
+	TYPE_SCTP_EVENT_TIMEOUT_RECONF,
 	TYPE_SCTP_EVENT_TIMEOUT_SACK,
 	TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE,
 };
@@ -924,6 +990,10 @@ static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(struct net *net,
 			return &addip_chunk_event_table[1][state];
 	}
 
+	if (net->sctp.reconf_enable)
+		if (cid == SCTP_CID_RECONF)
+			return &reconf_chunk_event_table[0][state];
+
 	if (net->sctp.auth_enable) {
 		if (cid == SCTP_CID_AUTH)
 			return &auth_chunk_event_table[0][state];
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 37eeab7899fc..465a9c8464f9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -239,7 +239,7 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
 	union sctp_addr *laddr = (union sctp_addr *)addr;
 	struct sctp_transport *transport;
 
-	if (sctp_verify_addr(sk, laddr, af->sockaddr_len))
+	if (!af || sctp_verify_addr(sk, laddr, af->sockaddr_len))
 		return NULL;
 
 	addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep,
@@ -364,7 +364,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
 		}
 	}
 
-	if (snum && snum < PROT_SOCK &&
+	if (snum && snum < inet_prot_sock(net) &&
 	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
 		return -EACCES;
 
@@ -592,7 +592,7 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
 			list_for_each_entry(trans,
 			    &asoc->peer.transport_addr_list, transports) {
 				/* Clear the source and route cache */
-				dst_release(trans->dst);
+				sctp_transport_dst_release(trans);
 				trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
 				    2*asoc->pathmtu, 4380));
 				trans->ssthresh = asoc->peer.i.a_rwnd;
@@ -843,7 +843,7 @@ skip_mkasconf:
 		 */
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 					transports) {
-			dst_release(transport->dst);
+			sctp_transport_dst_release(transport);
 			sctp_transport_route(transport, NULL,
 					     sctp_sk(asoc->base.sk));
 		}
@@ -1156,8 +1156,10 @@ static int __sctp_connect(struct sock *sk,
 				 * accept new associations, but it SHOULD NOT
 				 * be permitted to open new associations.
 				 */
-				if (ep->base.bind_addr.port < PROT_SOCK &&
-				    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) {
+				if (ep->base.bind_addr.port <
+				    inet_prot_sock(net) &&
+				    !ns_capable(net->user_ns,
+				    CAP_NET_BIND_SERVICE)) {
 					err = -EACCES;
 					goto out_free;
 				}
@@ -1822,7 +1824,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 			 * but it SHOULD NOT be permitted to open new
 			 * associations.
 			 */
-			if (ep->base.bind_addr.port < PROT_SOCK &&
+			if (ep->base.bind_addr.port < inet_prot_sock(net) &&
 			    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) {
 				err = -EACCES;
 				goto out_unlock;
@@ -1962,6 +1964,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		err = PTR_ERR(datamsg);
 		goto out_free;
 	}
+	datamsg->force_delay = !!(msg->msg_flags & MSG_MORE);
 
 	/* Now send the (possibly) fragmented message. */
 	list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
@@ -2434,7 +2437,6 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
 			sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
 		} else if (asoc) {
 			asoc->pathmtu = params->spp_pathmtu;
-			sctp_frag_point(asoc, params->spp_pathmtu);
 		} else {
 			sp->pathmtu = params->spp_pathmtu;
 		}
@@ -3755,6 +3757,120 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_enable_strreset(struct sock *sk,
+					   char __user *optval,
+					   unsigned int optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EINVAL;
+
+	if (optlen != sizeof(params))
+		goto out;
+
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (params.assoc_value & (~SCTP_ENABLE_STRRESET_MASK))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (asoc) {
+		asoc->strreset_enable = params.assoc_value;
+	} else if (!params.assoc_id) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		sp->ep->strreset_enable = params.assoc_value;
+	} else {
+		goto out;
+	}
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
+static int sctp_setsockopt_reset_streams(struct sock *sk,
+					 char __user *optval,
+					 unsigned int optlen)
+{
+	struct sctp_reset_streams *params;
+	struct sctp_association *asoc;
+	int retval = -EINVAL;
+
+	if (optlen < sizeof(struct sctp_reset_streams))
+		return -EINVAL;
+
+	params = memdup_user(optval, optlen);
+	if (IS_ERR(params))
+		return PTR_ERR(params);
+
+	asoc = sctp_id2assoc(sk, params->srs_assoc_id);
+	if (!asoc)
+		goto out;
+
+	retval = sctp_send_reset_streams(asoc, params);
+
+out:
+	kfree(params);
+	return retval;
+}
+
+static int sctp_setsockopt_reset_assoc(struct sock *sk,
+				       char __user *optval,
+				       unsigned int optlen)
+{
+	struct sctp_association *asoc;
+	sctp_assoc_t associd;
+	int retval = -EINVAL;
+
+	if (optlen != sizeof(associd))
+		goto out;
+
+	if (copy_from_user(&associd, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	asoc = sctp_id2assoc(sk, associd);
+	if (!asoc)
+		goto out;
+
+	retval = sctp_send_reset_assoc(asoc);
+
+out:
+	return retval;
+}
+
+static int sctp_setsockopt_add_streams(struct sock *sk,
+				       char __user *optval,
+				       unsigned int optlen)
+{
+	struct sctp_association *asoc;
+	struct sctp_add_streams params;
+	int retval = -EINVAL;
+
+	if (optlen != sizeof(params))
+		goto out;
+
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	asoc = sctp_id2assoc(sk, params.sas_assoc_id);
+	if (!asoc)
+		goto out;
+
+	retval = sctp_send_add_streams(asoc, &params);
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3921,6 +4037,18 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_DEFAULT_PRINFO:
 		retval = sctp_setsockopt_default_prinfo(sk, optval, optlen);
 		break;
+	case SCTP_ENABLE_STREAM_RESET:
+		retval = sctp_setsockopt_enable_strreset(sk, optval, optlen);
+		break;
+	case SCTP_RESET_STREAMS:
+		retval = sctp_setsockopt_reset_streams(sk, optval, optlen);
+		break;
+	case SCTP_RESET_ASSOC:
+		retval = sctp_setsockopt_reset_assoc(sk, optval, optlen);
+		break;
+	case SCTP_ADD_STREAMS:
+		retval = sctp_setsockopt_add_streams(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -4734,6 +4862,12 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
 	if (!asoc)
 		return -EINVAL;
 
+	/* If there is a thread waiting on more sndbuf space for
+	 * sending on this asoc, it cannot be peeled.
+	 */
+	if (waitqueue_active(&asoc->wait))
+		return -EBUSY;
+
 	/* An association cannot be branched off from an already peeled-off
 	 * socket, nor is this supported for tcp style sockets.
 	 */
@@ -6405,6 +6539,47 @@ out:
 	return retval;
 }
 
+static int sctp_getsockopt_enable_strreset(struct sock *sk, int len,
+					   char __user *optval,
+					   int __user *optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EFAULT;
+
+	if (len < sizeof(params)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	len = sizeof(params);
+	if (copy_from_user(&params, optval, len))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (asoc) {
+		params.assoc_value = asoc->strreset_enable;
+	} else if (!params.assoc_id) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		params.assoc_value = sp->ep->strreset_enable;
+	} else {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	if (put_user(len, optlen))
+		goto out;
+
+	if (copy_to_user(optval, &params, len))
+		goto out;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -6572,6 +6747,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_pr_assocstatus(sk, len, optval,
 							optlen);
 		break;
+	case SCTP_ENABLE_STREAM_RESET:
+		retval = sctp_getsockopt_enable_strreset(sk, len, optval,
+							 optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -7426,7 +7605,6 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
 		 */
 		release_sock(sk);
 		current_timeo = schedule_timeout(current_timeo);
-		BUG_ON(sk != asoc->base.sk);
 		lock_sock(sk);
 
 		*timeo_p = current_timeo;
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
deleted file mode 100644
index b9c8521c1a98..000000000000
--- a/net/sctp/ssnmap.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/* SCTP kernel implementation
- * Copyright (c) 2003 International Business Machines, Corp.
- *
- * This file is part of the SCTP kernel implementation
- *
- * These functions manipulate sctp SSN tracker.
- *
- * This SCTP implementation is free software;
- * you can redistribute it and/or modify it under the terms of
- * the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This SCTP implementation is distributed in the hope that it
- * will be useful, but WITHOUT ANY WARRANTY; without even the implied
- *                 ************************
- * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- * See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNU CC; see the file COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- * Please send any bug reports or fixes you make to the
- * email address(es):
- *    lksctp developers <linux-sctp@vger.kernel.org>
- *
- * Written or modified by:
- *    Jon Grimm             <jgrimm@us.ibm.com>
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <net/sctp/sctp.h>
-#include <net/sctp/sm.h>
-
-static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
-					    __u16 out);
-
-/* Storage size needed for map includes 2 headers and then the
- * specific needs of in or out streams.
- */
-static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
-{
-	return sizeof(struct sctp_ssnmap) + (in + out) * sizeof(__u16);
-}
-
-
-/* Create a new sctp_ssnmap.
- * Allocate room to store at least 'len' contiguous TSNs.
- */
-struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
-				    gfp_t gfp)
-{
-	struct sctp_ssnmap *retval;
-	int size;
-
-	size = sctp_ssnmap_size(in, out);
-	if (size <= KMALLOC_MAX_SIZE)
-		retval = kmalloc(size, gfp);
-	else
-		retval = (struct sctp_ssnmap *)
-			  __get_free_pages(gfp, get_order(size));
-	if (!retval)
-		goto fail;
-
-	if (!sctp_ssnmap_init(retval, in, out))
-		goto fail_map;
-
-	SCTP_DBG_OBJCNT_INC(ssnmap);
-
-	return retval;
-
-fail_map:
-	if (size <= KMALLOC_MAX_SIZE)
-		kfree(retval);
-	else
-		free_pages((unsigned long)retval, get_order(size));
-fail:
-	return NULL;
-}
-
-
-/* Initialize a block of memory as a ssnmap.  */
-static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
-					    __u16 out)
-{
-	memset(map, 0x00, sctp_ssnmap_size(in, out));
-
-	/* Start 'in' stream just after the map header. */
-	map->in.ssn = (__u16 *)&map[1];
-	map->in.len = in;
-
-	/* Start 'out' stream just after 'in'. */
-	map->out.ssn = &map->in.ssn[in];
-	map->out.len = out;
-
-	return map;
-}
-
-/* Clear out the ssnmap streams.  */
-void sctp_ssnmap_clear(struct sctp_ssnmap *map)
-{
-	size_t size;
-
-	size = (map->in.len + map->out.len) * sizeof(__u16);
-	memset(map->in.ssn, 0x00, size);
-}
-
-/* Dispose of a ssnmap.  */
-void sctp_ssnmap_free(struct sctp_ssnmap *map)
-{
-	int size;
-
-	if (unlikely(!map))
-		return;
-
-	size = sctp_ssnmap_size(map->in.len, map->out.len);
-	if (size <= KMALLOC_MAX_SIZE)
-		kfree(map);
-	else
-		free_pages((unsigned long)map, get_order(size));
-
-	SCTP_DBG_OBJCNT_DEC(ssnmap);
-}
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
new file mode 100644
index 000000000000..1c6cc04fa3a4
--- /dev/null
+++ b/net/sctp/stream.c
@@ -0,0 +1,479 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp tsn mapping array.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Xin Long <lucien.xin@gmail.com>
+ */
+
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp)
+{
+	struct sctp_stream *stream;
+	int i;
+
+	stream = kzalloc(sizeof(*stream), gfp);
+	if (!stream)
+		return NULL;
+
+	stream->outcnt = outcnt;
+	stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp);
+	if (!stream->out) {
+		kfree(stream);
+		return NULL;
+	}
+	for (i = 0; i < stream->outcnt; i++)
+		stream->out[i].state = SCTP_STREAM_OPEN;
+
+	stream->incnt = incnt;
+	stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp);
+	if (!stream->in) {
+		kfree(stream->out);
+		kfree(stream);
+		return NULL;
+	}
+
+	return stream;
+}
+
+void sctp_stream_free(struct sctp_stream *stream)
+{
+	if (unlikely(!stream))
+		return;
+
+	kfree(stream->out);
+	kfree(stream->in);
+	kfree(stream);
+}
+
+void sctp_stream_clear(struct sctp_stream *stream)
+{
+	int i;
+
+	for (i = 0; i < stream->outcnt; i++)
+		stream->out[i].ssn = 0;
+
+	for (i = 0; i < stream->incnt; i++)
+		stream->in[i].ssn = 0;
+}
+
+static int sctp_send_reconf(struct sctp_association *asoc,
+			    struct sctp_chunk *chunk)
+{
+	struct net *net = sock_net(asoc->base.sk);
+	int retval = 0;
+
+	retval = sctp_primitive_RECONF(net, asoc, chunk);
+	if (retval)
+		sctp_chunk_free(chunk);
+
+	return retval;
+}
+
+int sctp_send_reset_streams(struct sctp_association *asoc,
+			    struct sctp_reset_streams *params)
+{
+	struct sctp_stream *stream = asoc->stream;
+	__u16 i, str_nums, *str_list;
+	struct sctp_chunk *chunk;
+	int retval = -EINVAL;
+	bool out, in;
+
+	if (!asoc->peer.reconf_capable ||
+	    !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) {
+		retval = -ENOPROTOOPT;
+		goto out;
+	}
+
+	if (asoc->strreset_outstanding) {
+		retval = -EINPROGRESS;
+		goto out;
+	}
+
+	out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING;
+	in  = params->srs_flags & SCTP_STREAM_RESET_INCOMING;
+	if (!out && !in)
+		goto out;
+
+	str_nums = params->srs_number_streams;
+	str_list = params->srs_stream_list;
+	if (out && str_nums)
+		for (i = 0; i < str_nums; i++)
+			if (str_list[i] >= stream->outcnt)
+				goto out;
+
+	if (in && str_nums)
+		for (i = 0; i < str_nums; i++)
+			if (str_list[i] >= stream->incnt)
+				goto out;
+
+	for (i = 0; i < str_nums; i++)
+		str_list[i] = htons(str_list[i]);
+
+	chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in);
+
+	for (i = 0; i < str_nums; i++)
+		str_list[i] = ntohs(str_list[i]);
+
+	if (!chunk) {
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	if (out) {
+		if (str_nums)
+			for (i = 0; i < str_nums; i++)
+				stream->out[str_list[i]].state =
+						       SCTP_STREAM_CLOSED;
+		else
+			for (i = 0; i < stream->outcnt; i++)
+				stream->out[i].state = SCTP_STREAM_CLOSED;
+	}
+
+	asoc->strreset_chunk = chunk;
+	sctp_chunk_hold(asoc->strreset_chunk);
+
+	retval = sctp_send_reconf(asoc, chunk);
+	if (retval) {
+		sctp_chunk_put(asoc->strreset_chunk);
+		asoc->strreset_chunk = NULL;
+		if (!out)
+			goto out;
+
+		if (str_nums)
+			for (i = 0; i < str_nums; i++)
+				stream->out[str_list[i]].state =
+						       SCTP_STREAM_OPEN;
+		else
+			for (i = 0; i < stream->outcnt; i++)
+				stream->out[i].state = SCTP_STREAM_OPEN;
+
+		goto out;
+	}
+
+	asoc->strreset_outstanding = out + in;
+
+out:
+	return retval;
+}
+
+int sctp_send_reset_assoc(struct sctp_association *asoc)
+{
+	struct sctp_chunk *chunk = NULL;
+	int retval;
+	__u16 i;
+
+	if (!asoc->peer.reconf_capable ||
+	    !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ))
+		return -ENOPROTOOPT;
+
+	if (asoc->strreset_outstanding)
+		return -EINPROGRESS;
+
+	chunk = sctp_make_strreset_tsnreq(asoc);
+	if (!chunk)
+		return -ENOMEM;
+
+	/* Block further xmit of data until this request is completed */
+	for (i = 0; i < asoc->stream->outcnt; i++)
+		asoc->stream->out[i].state = SCTP_STREAM_CLOSED;
+
+	asoc->strreset_chunk = chunk;
+	sctp_chunk_hold(asoc->strreset_chunk);
+
+	retval = sctp_send_reconf(asoc, chunk);
+	if (retval) {
+		sctp_chunk_put(asoc->strreset_chunk);
+		asoc->strreset_chunk = NULL;
+
+		for (i = 0; i < asoc->stream->outcnt; i++)
+			asoc->stream->out[i].state = SCTP_STREAM_OPEN;
+
+		return retval;
+	}
+
+	asoc->strreset_outstanding = 1;
+
+	return 0;
+}
+
+int sctp_send_add_streams(struct sctp_association *asoc,
+			  struct sctp_add_streams *params)
+{
+	struct sctp_stream *stream = asoc->stream;
+	struct sctp_chunk *chunk = NULL;
+	int retval = -ENOMEM;
+	__u32 outcnt, incnt;
+	__u16 out, in;
+
+	if (!asoc->peer.reconf_capable ||
+	    !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) {
+		retval = -ENOPROTOOPT;
+		goto out;
+	}
+
+	if (asoc->strreset_outstanding) {
+		retval = -EINPROGRESS;
+		goto out;
+	}
+
+	out = params->sas_outstrms;
+	in  = params->sas_instrms;
+	outcnt = stream->outcnt + out;
+	incnt = stream->incnt + in;
+	if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM ||
+	    (!out && !in)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	if (out) {
+		struct sctp_stream_out *streamout;
+
+		streamout = krealloc(stream->out, outcnt * sizeof(*streamout),
+				     GFP_KERNEL);
+		if (!streamout)
+			goto out;
+
+		memset(streamout + stream->outcnt, 0, out * sizeof(*streamout));
+		stream->out = streamout;
+	}
+
+	if (in) {
+		struct sctp_stream_in *streamin;
+
+		streamin = krealloc(stream->in, incnt * sizeof(*streamin),
+				    GFP_KERNEL);
+		if (!streamin)
+			goto out;
+
+		memset(streamin + stream->incnt, 0, in * sizeof(*streamin));
+		stream->in = streamin;
+	}
+
+	chunk = sctp_make_strreset_addstrm(asoc, out, in);
+	if (!chunk)
+		goto out;
+
+	asoc->strreset_chunk = chunk;
+	sctp_chunk_hold(asoc->strreset_chunk);
+
+	retval = sctp_send_reconf(asoc, chunk);
+	if (retval) {
+		sctp_chunk_put(asoc->strreset_chunk);
+		asoc->strreset_chunk = NULL;
+		goto out;
+	}
+
+	stream->incnt = incnt;
+	stream->outcnt = outcnt;
+
+	asoc->strreset_outstanding = !!out + !!in;
+
+out:
+	return retval;
+}
+
+static sctp_paramhdr_t *sctp_chunk_lookup_strreset_param(
+			struct sctp_association *asoc, __u32 resp_seq)
+{
+	struct sctp_chunk *chunk = asoc->strreset_chunk;
+	struct sctp_reconf_chunk *hdr;
+	union sctp_params param;
+
+	if (ntohl(resp_seq) != asoc->strreset_outseq || !chunk)
+		return NULL;
+
+	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
+	sctp_walk_params(param, hdr, params) {
+		/* sctp_strreset_tsnreq is actually the basic structure
+		 * of all stream reconf params, so it's safe to use it
+		 * to access request_seq.
+		 */
+		struct sctp_strreset_tsnreq *req = param.v;
+
+		if (req->request_seq == resp_seq)
+			return param.v;
+	}
+
+	return NULL;
+}
+
+struct sctp_chunk *sctp_process_strreset_outreq(
+				struct sctp_association *asoc,
+				union sctp_params param,
+				struct sctp_ulpevent **evp)
+{
+	struct sctp_strreset_outreq *outreq = param.v;
+	struct sctp_stream *stream = asoc->stream;
+	__u16 i, nums, flags = 0, *str_p = NULL;
+	__u32 result = SCTP_STRRESET_DENIED;
+	__u32 request_seq;
+
+	request_seq = ntohl(outreq->request_seq);
+
+	if (ntohl(outreq->send_reset_at_tsn) >
+	    sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) {
+		result = SCTP_STRRESET_IN_PROGRESS;
+		goto out;
+	}
+
+	if (request_seq > asoc->strreset_inseq) {
+		result = SCTP_STRRESET_ERR_BAD_SEQNO;
+		goto out;
+	} else if (request_seq == asoc->strreset_inseq) {
+		asoc->strreset_inseq++;
+	}
+
+	/* Check strreset_enable after inseq inc, as sender cannot tell
+	 * the peer doesn't enable strreset after receiving response with
+	 * result denied, as well as to keep consistent with bsd.
+	 */
+	if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
+		goto out;
+
+	if (asoc->strreset_chunk) {
+		sctp_paramhdr_t *param_hdr;
+		struct sctp_transport *t;
+
+		param_hdr = sctp_chunk_lookup_strreset_param(
+					asoc, outreq->response_seq);
+		if (!param_hdr || param_hdr->type !=
+					SCTP_PARAM_RESET_IN_REQUEST) {
+			/* same process with outstanding isn't 0 */
+			result = SCTP_STRRESET_ERR_IN_PROGRESS;
+			goto out;
+		}
+
+		asoc->strreset_outstanding--;
+		asoc->strreset_outseq++;
+
+		if (!asoc->strreset_outstanding) {
+			t = asoc->strreset_chunk->transport;
+			if (del_timer(&t->reconf_timer))
+				sctp_transport_put(t);
+
+			sctp_chunk_put(asoc->strreset_chunk);
+			asoc->strreset_chunk = NULL;
+		}
+
+		flags = SCTP_STREAM_RESET_INCOMING_SSN;
+	}
+
+	nums = (ntohs(param.p->length) - sizeof(*outreq)) / 2;
+	if (nums) {
+		str_p = outreq->list_of_streams;
+		for (i = 0; i < nums; i++) {
+			if (ntohs(str_p[i]) >= stream->incnt) {
+				result = SCTP_STRRESET_ERR_WRONG_SSN;
+				goto out;
+			}
+		}
+
+		for (i = 0; i < nums; i++)
+			stream->in[ntohs(str_p[i])].ssn = 0;
+	} else {
+		for (i = 0; i < stream->incnt; i++)
+			stream->in[i].ssn = 0;
+	}
+
+	result = SCTP_STRRESET_PERFORMED;
+
+	*evp = sctp_ulpevent_make_stream_reset_event(asoc,
+		flags | SCTP_STREAM_RESET_OUTGOING_SSN, nums, str_p,
+		GFP_ATOMIC);
+
+out:
+	return sctp_make_strreset_resp(asoc, result, request_seq);
+}
+
+struct sctp_chunk *sctp_process_strreset_inreq(
+				struct sctp_association *asoc,
+				union sctp_params param,
+				struct sctp_ulpevent **evp)
+{
+	struct sctp_strreset_inreq *inreq = param.v;
+	struct sctp_stream *stream = asoc->stream;
+	__u32 result = SCTP_STRRESET_DENIED;
+	struct sctp_chunk *chunk = NULL;
+	__u16 i, nums, *str_p;
+	__u32 request_seq;
+
+	request_seq = ntohl(inreq->request_seq);
+	if (request_seq > asoc->strreset_inseq) {
+		result = SCTP_STRRESET_ERR_BAD_SEQNO;
+		goto out;
+	} else if (request_seq == asoc->strreset_inseq) {
+		asoc->strreset_inseq++;
+	}
+
+	if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
+		goto out;
+
+	if (asoc->strreset_outstanding) {
+		result = SCTP_STRRESET_ERR_IN_PROGRESS;
+		goto out;
+	}
+
+	nums = (ntohs(param.p->length) - sizeof(*inreq)) / 2;
+	str_p = inreq->list_of_streams;
+	for (i = 0; i < nums; i++) {
+		if (ntohs(str_p[i]) >= stream->outcnt) {
+			result = SCTP_STRRESET_ERR_WRONG_SSN;
+			goto out;
+		}
+	}
+
+	chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0);
+	if (!chunk)
+		goto out;
+
+	if (nums)
+		for (i = 0; i < nums; i++)
+			stream->out[ntohs(str_p[i])].state =
+					       SCTP_STREAM_CLOSED;
+	else
+		for (i = 0; i < stream->outcnt; i++)
+			stream->out[i].state = SCTP_STREAM_CLOSED;
+
+	asoc->strreset_chunk = chunk;
+	asoc->strreset_outstanding = 1;
+	sctp_chunk_hold(asoc->strreset_chunk);
+
+	*evp = sctp_ulpevent_make_stream_reset_event(asoc,
+		SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC);
+
+out:
+	if (!chunk)
+		chunk =  sctp_make_strreset_resp(asoc, result, request_seq);
+
+	return chunk;
+}
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index a1652ab63918..3379668af368 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -88,9 +88,11 @@ static struct sctp_transport *sctp_transport_init(struct net *net,
 	INIT_LIST_HEAD(&peer->transports);
 
 	setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event,
-			(unsigned long)peer);
+		    (unsigned long)peer);
 	setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event,
-			(unsigned long)peer);
+		    (unsigned long)peer);
+	setup_timer(&peer->reconf_timer, sctp_generate_reconf_event,
+		    (unsigned long)peer);
 	setup_timer(&peer->proto_unreach_timer,
 		    sctp_generate_proto_unreach_event, (unsigned long)peer);
 
@@ -144,6 +146,9 @@ void sctp_transport_free(struct sctp_transport *transport)
 	if (del_timer(&transport->T3_rtx_timer))
 		sctp_transport_put(transport);
 
+	if (del_timer(&transport->reconf_timer))
+		sctp_transport_put(transport);
+
 	/* Delete the ICMP proto unreachable timer if it's active. */
 	if (del_timer(&transport->proto_unreach_timer))
 		sctp_association_put(transport->asoc);
@@ -211,6 +216,14 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport)
 		sctp_transport_hold(transport);
 }
 
+void sctp_transport_reset_reconf_timer(struct sctp_transport *transport)
+{
+	if (!timer_pending(&transport->reconf_timer))
+		if (!mod_timer(&transport->reconf_timer,
+			       jiffies + transport->rto))
+			sctp_transport_hold(transport);
+}
+
 /* This transport has been assigned to an association.
  * Initialize fields from the association or from the sock itself.
  * Register the reference count in the association.
@@ -227,7 +240,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
 {
 	/* If we don't have a fresh route, look one up */
 	if (!transport->dst || transport->dst->obsolete) {
-		dst_release(transport->dst);
+		sctp_transport_dst_release(transport);
 		transport->af_specific->get_dst(transport, &transport->saddr,
 						&transport->fl, sk);
 	}
@@ -630,9 +643,7 @@ void sctp_transport_reset(struct sctp_transport *t)
 	t->srtt = 0;
 	t->rttvar = 0;
 
-	/* Reset these additional varibles so that we have a clean
-	 * slate.
-	 */
+	/* Reset these additional variables so that we have a clean slate. */
 	t->partial_bytes_acked = 0;
 	t->flight_size = 0;
 	t->error_count = 0;
@@ -659,3 +670,17 @@ void sctp_transport_immediate_rtx(struct sctp_transport *t)
 			sctp_transport_hold(t);
 	}
 }
+
+/* Drop dst */
+void sctp_transport_dst_release(struct sctp_transport *t)
+{
+	dst_release(t->dst);
+	t->dst = NULL;
+	t->dst_pending_confirm = 0;
+}
+
+/* Schedule neighbour confirm */
+void sctp_transport_dst_confirm(struct sctp_transport *t)
+{
+	t->dst_pending_confirm = 1;
+}
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index bea00058ce35..c8881bc542a0 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -854,6 +854,35 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
 	return event;
 }
 
+struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event(
+	const struct sctp_association *asoc, __u16 flags, __u16 stream_num,
+	__u16 *stream_list, gfp_t gfp)
+{
+	struct sctp_stream_reset_event *sreset;
+	struct sctp_ulpevent *event;
+	struct sk_buff *skb;
+	int length, i;
+
+	length = sizeof(struct sctp_stream_reset_event) + 2 * stream_num;
+	event = sctp_ulpevent_new(length, MSG_NOTIFICATION, gfp);
+	if (!event)
+		return NULL;
+
+	skb = sctp_event2skb(event);
+	sreset = (struct sctp_stream_reset_event *)skb_put(skb, length);
+
+	sreset->strreset_type = SCTP_STREAM_RESET_EVENT;
+	sreset->strreset_flags = flags;
+	sreset->strreset_length = length;
+	sctp_ulpevent_set_owner(event, asoc);
+	sreset->strreset_assoc_id = sctp_assoc2id(asoc);
+
+	for (i = 0; i < stream_num; i++)
+		sreset->strreset_stream_list[i] = ntohs(stream_list[i]);
+
+	return event;
+}
+
 /* Return the notification type, assuming this is a notification
  * event.
  */
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 84d0fdaf7de9..aa3624d50278 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -760,11 +760,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
 	struct sk_buff_head *event_list;
 	struct sk_buff *pos, *tmp;
 	struct sctp_ulpevent *cevent;
-	struct sctp_stream *in;
+	struct sctp_stream *stream;
 	__u16 sid, csid, cssn;
 
 	sid = event->stream;
-	in  = &ulpq->asoc->ssnmap->in;
+	stream  = ulpq->asoc->stream;
 
 	event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
 
@@ -782,11 +782,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
 		if (csid < sid)
 			continue;
 
-		if (cssn != sctp_ssn_peek(in, sid))
+		if (cssn != sctp_ssn_peek(stream, in, sid))
 			break;
 
-		/* Found it, so mark in the ssnmap. */
-		sctp_ssn_next(in, sid);
+		/* Found it, so mark in the stream. */
+		sctp_ssn_next(stream, in, sid);
 
 		__skb_unlink(pos, &ulpq->lobby);
 
@@ -849,7 +849,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
 					     struct sctp_ulpevent *event)
 {
 	__u16 sid, ssn;
-	struct sctp_stream *in;
+	struct sctp_stream *stream;
 
 	/* Check if this message needs ordering.  */
 	if (SCTP_DATA_UNORDERED & event->msg_flags)
@@ -858,10 +858,10 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
 	/* Note: The stream ID must be verified before this routine.  */
 	sid = event->stream;
 	ssn = event->ssn;
-	in  = &ulpq->asoc->ssnmap->in;
+	stream  = ulpq->asoc->stream;
 
 	/* Is this the expected SSN for this stream ID?  */
-	if (ssn != sctp_ssn_peek(in, sid)) {
+	if (ssn != sctp_ssn_peek(stream, in, sid)) {
 		/* We've received something out of order, so find where it
 		 * needs to be placed.  We order by stream and then by SSN.
 		 */
@@ -870,7 +870,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
 	}
 
 	/* Mark that the next chunk has been found.  */
-	sctp_ssn_next(in, sid);
+	sctp_ssn_next(stream, in, sid);
 
 	/* Go find any other chunks that were waiting for
 	 * ordering.
@@ -888,12 +888,12 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
 	struct sk_buff *pos, *tmp;
 	struct sctp_ulpevent *cevent;
 	struct sctp_ulpevent *event;
-	struct sctp_stream *in;
+	struct sctp_stream *stream;
 	struct sk_buff_head temp;
 	struct sk_buff_head *lobby = &ulpq->lobby;
 	__u16 csid, cssn;
 
-	in  = &ulpq->asoc->ssnmap->in;
+	stream = ulpq->asoc->stream;
 
 	/* We are holding the chunks by stream, by SSN.  */
 	skb_queue_head_init(&temp);
@@ -912,7 +912,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
 			continue;
 
 		/* see if this ssn has been marked by skipping */
-		if (!SSN_lt(cssn, sctp_ssn_peek(in, csid)))
+		if (!SSN_lt(cssn, sctp_ssn_peek(stream, in, csid)))
 			break;
 
 		__skb_unlink(pos, lobby);
@@ -932,8 +932,8 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
 		csid = cevent->stream;
 		cssn = cevent->ssn;
 
-		if (csid == sid && cssn == sctp_ssn_peek(in, csid)) {
-			sctp_ssn_next(in, csid);
+		if (csid == sid && cssn == sctp_ssn_peek(stream, in, csid)) {
+			sctp_ssn_next(stream, in, csid);
 			__skb_unlink(pos, lobby);
 			__skb_queue_tail(&temp, pos);
 			event = sctp_skb2event(pos);
@@ -955,17 +955,17 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
  */
 void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
 {
-	struct sctp_stream *in;
+	struct sctp_stream *stream;
 
 	/* Note: The stream ID must be verified before this routine.  */
-	in  = &ulpq->asoc->ssnmap->in;
+	stream  = ulpq->asoc->stream;
 
 	/* Is this an old SSN?  If so ignore. */
-	if (SSN_lt(ssn, sctp_ssn_peek(in, sid)))
+	if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)))
 		return;
 
 	/* Mark that we are no longer expecting this SSN or lower. */
-	sctp_ssn_skip(in, sid, ssn);
+	sctp_ssn_skip(stream, in, sid, ssn);
 
 	/* Go find any other chunks that were waiting for
 	 * ordering and deliver them if needed.
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 000000000000..c717ef0896aa
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,20 @@
+config SMC
+	tristate "SMC socket protocol family"
+	depends on INET && INFINIBAND
+	---help---
+	  SMC-R provides a "sockets over RDMA" solution making use of
+	  RDMA over Converged Ethernet (RoCE) technology to upgrade
+	  AF_INET TCP connections transparently.
+	  The Linux implementation of the SMC-R solution is designed as
+	  a separate socket family SMC.
+
+	  Select this option if you want to run SMC socket applications
+
+config SMC_DIAG
+	tristate "SMC: socket monitoring interface"
+	depends on SMC
+	---help---
+	  Support for SMC socket monitoring interface used by tools such as
+	  smcss.
+
+	  if unsure, say Y.
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 000000000000..188104654b54
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_SMC)	+= smc.o
+obj-$(CONFIG_SMC_DIAG)	+= smc_diag.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 000000000000..5d4208ad029e
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,1407 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
+ *  applies to SOCK_STREAM sockets only
+ *  offers an alternative communication option for TCP-protocol sockets
+ *  applicable with RoCE-cards only
+ *
+ *  Initial restrictions:
+ *    - non-blocking connect postponed
+ *    - IPv6 support postponed
+ *    - support for alternate links postponed
+ *    - partial support for non-blocking sockets only
+ *    - support for urgent data postponed
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ *              based on prototype from Frank Blaschka
+ */
+
+#define KMSG_COMPONENT "smc"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/inetdevice.h>
+#include <linux/workqueue.h>
+#include <linux/in.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/smc.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_pnet.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+#include "smc_close.h"
+
+static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
+						 * creation
+						 */
+
+struct smc_lgr_list smc_lgr_list = {		/* established link groups */
+	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
+	.list = LIST_HEAD_INIT(smc_lgr_list.list),
+};
+
+static void smc_tcp_listen_work(struct work_struct *);
+
+static void smc_set_keepalive(struct sock *sk, int val)
+{
+	struct smc_sock *smc = smc_sk(sk);
+
+	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
+}
+
+static struct smc_hashinfo smc_v4_hashinfo = {
+	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
+};
+
+int smc_hash_sk(struct sock *sk)
+{
+	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+	struct hlist_head *head;
+
+	head = &h->ht;
+
+	write_lock_bh(&h->lock);
+	sk_add_node(sk, head);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	write_unlock_bh(&h->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(smc_hash_sk);
+
+void smc_unhash_sk(struct sock *sk)
+{
+	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+
+	write_lock_bh(&h->lock);
+	if (sk_del_node_init(sk))
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	write_unlock_bh(&h->lock);
+}
+EXPORT_SYMBOL_GPL(smc_unhash_sk);
+
+struct proto smc_proto = {
+	.name		= "SMC",
+	.owner		= THIS_MODULE,
+	.keepalive	= smc_set_keepalive,
+	.hash		= smc_hash_sk,
+	.unhash		= smc_unhash_sk,
+	.obj_size	= sizeof(struct smc_sock),
+	.h.smc_hash	= &smc_v4_hashinfo,
+	.slab_flags	= SLAB_DESTROY_BY_RCU,
+};
+EXPORT_SYMBOL_GPL(smc_proto);
+
+static int smc_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = 0;
+
+	if (!sk)
+		goto out;
+
+	smc = smc_sk(sk);
+	sock_hold(sk);
+	if (sk->sk_state == SMC_LISTEN)
+		/* smc_close_non_accepted() is called and acquires
+		 * sock lock for child sockets again
+		 */
+		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+	else
+		lock_sock(sk);
+
+	if (smc->use_fallback) {
+		sk->sk_state = SMC_CLOSED;
+		sk->sk_state_change(sk);
+	} else {
+		rc = smc_close_active(smc);
+		sock_set_flag(sk, SOCK_DEAD);
+		sk->sk_shutdown |= SHUTDOWN_MASK;
+	}
+	if (smc->clcsock) {
+		sock_release(smc->clcsock);
+		smc->clcsock = NULL;
+	}
+
+	/* detach socket */
+	sock_orphan(sk);
+	sock->sk = NULL;
+	if (smc->use_fallback) {
+		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
+	} else if (sk->sk_state == SMC_CLOSED) {
+		smc_conn_free(&smc->conn);
+		schedule_delayed_work(&smc->sock_put_work,
+				      SMC_CLOSE_SOCK_PUT_DELAY);
+	}
+	sk->sk_prot->unhash(sk);
+	release_sock(sk);
+
+	sock_put(sk);
+out:
+	return rc;
+}
+
+static void smc_destruct(struct sock *sk)
+{
+	if (sk->sk_state != SMC_CLOSED)
+		return;
+	if (!sock_flag(sk, SOCK_DEAD))
+		return;
+
+	sk_refcnt_debug_dec(sk);
+}
+
+static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
+{
+	struct smc_sock *smc;
+	struct sock *sk;
+
+	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
+	if (!sk)
+		return NULL;
+
+	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
+	sk->sk_state = SMC_INIT;
+	sk->sk_destruct = smc_destruct;
+	sk->sk_protocol = SMCPROTO_SMC;
+	smc = smc_sk(sk);
+	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+	INIT_LIST_HEAD(&smc->accept_q);
+	spin_lock_init(&smc->accept_q_lock);
+	INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
+	sk->sk_prot->hash(sk);
+	sk_refcnt_debug_inc(sk);
+
+	return sk;
+}
+
+static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
+		    int addr_len)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc;
+
+	smc = smc_sk(sk);
+
+	/* replicate tests from inet_bind(), to be safe wrt. future changes */
+	rc = -EINVAL;
+	if (addr_len < sizeof(struct sockaddr_in))
+		goto out;
+
+	rc = -EAFNOSUPPORT;
+	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
+	if ((addr->sin_family != AF_INET) &&
+	    ((addr->sin_family != AF_UNSPEC) ||
+	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
+		goto out;
+
+	lock_sock(sk);
+
+	/* Check if socket is already active */
+	rc = -EINVAL;
+	if (sk->sk_state != SMC_INIT)
+		goto out_rel;
+
+	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
+	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
+
+out_rel:
+	release_sock(sk);
+out:
+	return rc;
+}
+
+static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
+				   unsigned long mask)
+{
+	/* options we don't get control via setsockopt for */
+	nsk->sk_type = osk->sk_type;
+	nsk->sk_sndbuf = osk->sk_sndbuf;
+	nsk->sk_rcvbuf = osk->sk_rcvbuf;
+	nsk->sk_sndtimeo = osk->sk_sndtimeo;
+	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
+	nsk->sk_mark = osk->sk_mark;
+	nsk->sk_priority = osk->sk_priority;
+	nsk->sk_rcvlowat = osk->sk_rcvlowat;
+	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
+	nsk->sk_err = osk->sk_err;
+
+	nsk->sk_flags &= ~mask;
+	nsk->sk_flags |= osk->sk_flags & mask;
+}
+
+#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
+			     (1UL << SOCK_KEEPOPEN) | \
+			     (1UL << SOCK_LINGER) | \
+			     (1UL << SOCK_BROADCAST) | \
+			     (1UL << SOCK_TIMESTAMP) | \
+			     (1UL << SOCK_DBG) | \
+			     (1UL << SOCK_RCVTSTAMP) | \
+			     (1UL << SOCK_RCVTSTAMPNS) | \
+			     (1UL << SOCK_LOCALROUTE) | \
+			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
+			     (1UL << SOCK_RXQ_OVFL) | \
+			     (1UL << SOCK_WIFI_STATUS) | \
+			     (1UL << SOCK_NOFCS) | \
+			     (1UL << SOCK_FILTER_LOCKED))
+/* copy only relevant settings and flags of SOL_SOCKET level from smc to
+ * clc socket (since smc is not called for these options from net/core)
+ */
+static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
+{
+	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
+}
+
+#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
+			     (1UL << SOCK_KEEPOPEN) | \
+			     (1UL << SOCK_LINGER) | \
+			     (1UL << SOCK_DBG))
+/* copy only settings and flags relevant for smc from clc to smc socket */
+static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
+{
+	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
+}
+
+/* determine subnet and mask of internal TCP socket */
+int smc_netinfo_by_tcpsk(struct socket *clcsock,
+			 __be32 *subnet, u8 *prefix_len)
+{
+	struct dst_entry *dst = sk_dst_get(clcsock->sk);
+	struct sockaddr_in addr;
+	int rc = -ENOENT;
+	int len;
+
+	if (!dst) {
+		rc = -ENOTCONN;
+		goto out;
+	}
+	if (!dst->dev) {
+		rc = -ENODEV;
+		goto out_rel;
+	}
+
+	/* get address to which the internal TCP socket is bound */
+	kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
+	/* analyze IPv4 specific data of net_device belonging to TCP socket */
+	for_ifa(dst->dev->ip_ptr) {
+		if (ifa->ifa_address != addr.sin_addr.s_addr)
+			continue;
+		*prefix_len = inet_mask_len(ifa->ifa_mask);
+		*subnet = ifa->ifa_address & ifa->ifa_mask;
+		rc = 0;
+		break;
+	} endfor_ifa(dst->dev->ip_ptr);
+
+out_rel:
+	dst_release(dst);
+out:
+	return rc;
+}
+
+static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
+{
+	struct smc_link_group *lgr = smc->conn.lgr;
+	struct smc_link *link;
+	int rest;
+	int rc;
+
+	link = &lgr->lnk[SMC_SINGLE_LINK];
+	/* receive CONFIRM LINK request from server over RoCE fabric */
+	rest = wait_for_completion_interruptible_timeout(
+		&link->llc_confirm,
+		SMC_LLC_WAIT_FIRST_TIME);
+	if (rest <= 0) {
+		struct smc_clc_msg_decline dclc;
+
+		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+				      SMC_CLC_DECLINE);
+		return rc;
+	}
+
+	rc = smc_ib_modify_qp_rts(link);
+	if (rc)
+		return SMC_CLC_DECL_INTERR;
+
+	smc_wr_remember_qp_attr(link);
+	/* send CONFIRM LINK response over RoCE fabric */
+	rc = smc_llc_send_confirm_link(link,
+				       link->smcibdev->mac[link->ibport - 1],
+				       gid, SMC_LLC_RESP);
+	if (rc < 0)
+		return SMC_CLC_DECL_TCL;
+
+	return rc;
+}
+
+static void smc_conn_save_peer_info(struct smc_sock *smc,
+				    struct smc_clc_msg_accept_confirm *clc)
+{
+	smc->conn.peer_conn_idx = clc->conn_idx;
+	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
+	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
+	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
+}
+
+static void smc_link_save_peer_info(struct smc_link *link,
+				    struct smc_clc_msg_accept_confirm *clc)
+{
+	link->peer_qpn = ntoh24(clc->qpn);
+	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
+	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
+	link->peer_psn = ntoh24(clc->psn);
+	link->peer_mtu = clc->qp_mtu;
+}
+
+/* setup for RDMA connection of client */
+static int smc_connect_rdma(struct smc_sock *smc)
+{
+	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
+	struct smc_clc_msg_accept_confirm aclc;
+	int local_contact = SMC_FIRST_CONTACT;
+	struct smc_ib_device *smcibdev;
+	struct smc_link *link;
+	u8 srv_first_contact;
+	int reason_code = 0;
+	int rc = 0;
+	u8 ibport;
+
+	/* IPSec connections opt out of SMC-R optimizations */
+	if (using_ipsec(smc)) {
+		reason_code = SMC_CLC_DECL_IPSEC;
+		goto decline_rdma;
+	}
+
+	/* PNET table look up: search active ib_device and port
+	 * within same PNETID that also contains the ethernet device
+	 * used for the internal TCP socket
+	 */
+	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
+	if (!smcibdev) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+
+	/* do inband token exchange */
+	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
+	if (reason_code < 0) {
+		rc = reason_code;
+		goto out_err;
+	}
+	if (reason_code > 0) /* configuration error */
+		goto decline_rdma;
+	/* receive SMC Accept CLC message */
+	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
+				       SMC_CLC_ACCEPT);
+	if (reason_code < 0) {
+		rc = reason_code;
+		goto out_err;
+	}
+	if (reason_code > 0)
+		goto decline_rdma;
+
+	srv_first_contact = aclc.hdr.flag;
+	mutex_lock(&smc_create_lgr_pending);
+	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
+					ibport, &aclc.lcl, srv_first_contact);
+	if (local_contact < 0) {
+		rc = local_contact;
+		if (rc == -ENOMEM)
+			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
+		else if (rc == -ENOLINK)
+			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
+		goto decline_rdma_unlock;
+	}
+	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+
+	smc_conn_save_peer_info(smc, &aclc);
+
+	rc = smc_sndbuf_create(smc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_MEM;
+		goto decline_rdma_unlock;
+	}
+	rc = smc_rmb_create(smc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_MEM;
+		goto decline_rdma_unlock;
+	}
+
+	if (local_contact == SMC_FIRST_CONTACT)
+		smc_link_save_peer_info(link, &aclc);
+
+	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_INTERR;
+		goto decline_rdma_unlock;
+	}
+
+	if (local_contact == SMC_FIRST_CONTACT) {
+		rc = smc_ib_ready_link(link);
+		if (rc) {
+			reason_code = SMC_CLC_DECL_INTERR;
+			goto decline_rdma_unlock;
+		}
+	}
+
+	rc = smc_clc_send_confirm(smc);
+	if (rc)
+		goto out_err_unlock;
+
+	if (local_contact == SMC_FIRST_CONTACT) {
+		/* QP confirmation over RoCE fabric */
+		reason_code = smc_clnt_conf_first_link(
+			smc, &smcibdev->gid[ibport - 1]);
+		if (reason_code < 0) {
+			rc = reason_code;
+			goto out_err_unlock;
+		}
+		if (reason_code > 0)
+			goto decline_rdma_unlock;
+	}
+
+	mutex_unlock(&smc_create_lgr_pending);
+	smc_tx_init(smc);
+	smc_rx_init(smc);
+
+out_connected:
+	smc_copy_sock_settings_to_clc(smc);
+	if (smc->sk.sk_state == SMC_INIT)
+		smc->sk.sk_state = SMC_ACTIVE;
+
+	return rc ? rc : local_contact;
+
+decline_rdma_unlock:
+	mutex_unlock(&smc_create_lgr_pending);
+	smc_conn_free(&smc->conn);
+decline_rdma:
+	/* RDMA setup failed, switch back to TCP */
+	smc->use_fallback = true;
+	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+		rc = smc_clc_send_decline(smc, reason_code, 0);
+		if (rc < sizeof(struct smc_clc_msg_decline))
+			goto out_err;
+	}
+	goto out_connected;
+
+out_err_unlock:
+	mutex_unlock(&smc_create_lgr_pending);
+	smc_conn_free(&smc->conn);
+out_err:
+	return rc;
+}
+
+static int smc_connect(struct socket *sock, struct sockaddr *addr,
+		       int alen, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EINVAL;
+
+	smc = smc_sk(sk);
+
+	/* separate smc parameter checking to be safe */
+	if (alen < sizeof(addr->sa_family))
+		goto out_err;
+	if (addr->sa_family != AF_INET)
+		goto out_err;
+	smc->addr = addr;	/* needed for nonblocking connect */
+
+	lock_sock(sk);
+	switch (sk->sk_state) {
+	default:
+		goto out;
+	case SMC_ACTIVE:
+		rc = -EISCONN;
+		goto out;
+	case SMC_INIT:
+		rc = 0;
+		break;
+	}
+
+	smc_copy_sock_settings_to_clc(smc);
+	rc = kernel_connect(smc->clcsock, addr, alen, flags);
+	if (rc)
+		goto out;
+
+	/* setup RDMA connection */
+	rc = smc_connect_rdma(smc);
+	if (rc < 0)
+		goto out;
+	else
+		rc = 0; /* success cases including fallback */
+
+out:
+	release_sock(sk);
+out_err:
+	return rc;
+}
+
+static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
+{
+	struct sock *sk = &lsmc->sk;
+	struct socket *new_clcsock;
+	struct sock *new_sk;
+	int rc;
+
+	release_sock(&lsmc->sk);
+	new_sk = smc_sock_alloc(sock_net(sk), NULL);
+	if (!new_sk) {
+		rc = -ENOMEM;
+		lsmc->sk.sk_err = ENOMEM;
+		*new_smc = NULL;
+		lock_sock(&lsmc->sk);
+		goto out;
+	}
+	*new_smc = smc_sk(new_sk);
+
+	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+	lock_sock(&lsmc->sk);
+	if  (rc < 0) {
+		lsmc->sk.sk_err = -rc;
+		new_sk->sk_state = SMC_CLOSED;
+		sock_set_flag(new_sk, SOCK_DEAD);
+		sk->sk_prot->unhash(new_sk);
+		sock_put(new_sk);
+		*new_smc = NULL;
+		goto out;
+	}
+	if (lsmc->sk.sk_state == SMC_CLOSED) {
+		if (new_clcsock)
+			sock_release(new_clcsock);
+		new_sk->sk_state = SMC_CLOSED;
+		sock_set_flag(new_sk, SOCK_DEAD);
+		sk->sk_prot->unhash(new_sk);
+		sock_put(new_sk);
+		*new_smc = NULL;
+		goto out;
+	}
+
+	(*new_smc)->clcsock = new_clcsock;
+out:
+	return rc;
+}
+
+/* add a just created sock to the accept queue of the listen sock as
+ * candidate for a following socket accept call from user space
+ */
+static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+	struct smc_sock *par = smc_sk(parent);
+
+	sock_hold(sk);
+	spin_lock(&par->accept_q_lock);
+	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
+	spin_unlock(&par->accept_q_lock);
+	sk_acceptq_added(parent);
+}
+
+/* remove a socket from the accept queue of its parental listening socket */
+static void smc_accept_unlink(struct sock *sk)
+{
+	struct smc_sock *par = smc_sk(sk)->listen_smc;
+
+	spin_lock(&par->accept_q_lock);
+	list_del_init(&smc_sk(sk)->accept_q);
+	spin_unlock(&par->accept_q_lock);
+	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
+	sock_put(sk);
+}
+
+/* remove a sock from the accept queue to bind it to a new socket created
+ * for a socket accept call from user space
+ */
+struct sock *smc_accept_dequeue(struct sock *parent,
+				struct socket *new_sock)
+{
+	struct smc_sock *isk, *n;
+	struct sock *new_sk;
+
+	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
+		new_sk = (struct sock *)isk;
+
+		smc_accept_unlink(new_sk);
+		if (new_sk->sk_state == SMC_CLOSED) {
+			/* tbd in follow-on patch: close this sock */
+			continue;
+		}
+		if (new_sock)
+			sock_graft(new_sk, new_sock);
+		return new_sk;
+	}
+	return NULL;
+}
+
+/* clean up for a created but never accepted sock */
+void smc_close_non_accepted(struct sock *sk)
+{
+	struct smc_sock *smc = smc_sk(sk);
+
+	sock_hold(sk);
+	lock_sock(sk);
+	if (!sk->sk_lingertime)
+		/* wait for peer closing */
+		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
+	if (!smc->use_fallback)
+		smc_close_active(smc);
+	if (smc->clcsock) {
+		struct socket *tcp;
+
+		tcp = smc->clcsock;
+		smc->clcsock = NULL;
+		sock_release(tcp);
+	}
+	sock_set_flag(sk, SOCK_DEAD);
+	sk->sk_shutdown |= SHUTDOWN_MASK;
+	if (smc->use_fallback) {
+		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
+	} else {
+		smc_conn_free(&smc->conn);
+		schedule_delayed_work(&smc->sock_put_work,
+				      SMC_CLOSE_SOCK_PUT_DELAY);
+	}
+	release_sock(sk);
+	sock_put(sk);
+}
+
+static int smc_serv_conf_first_link(struct smc_sock *smc)
+{
+	struct smc_link_group *lgr = smc->conn.lgr;
+	struct smc_link *link;
+	int rest;
+	int rc;
+
+	link = &lgr->lnk[SMC_SINGLE_LINK];
+	/* send CONFIRM LINK request to client over the RoCE fabric */
+	rc = smc_llc_send_confirm_link(link,
+				       link->smcibdev->mac[link->ibport - 1],
+				       &link->smcibdev->gid[link->ibport - 1],
+				       SMC_LLC_REQ);
+	if (rc < 0)
+		return SMC_CLC_DECL_TCL;
+
+	/* receive CONFIRM LINK response from client over the RoCE fabric */
+	rest = wait_for_completion_interruptible_timeout(
+		&link->llc_confirm_resp,
+		SMC_LLC_WAIT_FIRST_TIME);
+	if (rest <= 0) {
+		struct smc_clc_msg_decline dclc;
+
+		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+				      SMC_CLC_DECLINE);
+	}
+
+	return rc;
+}
+
+/* setup for RDMA connection of server */
+static void smc_listen_work(struct work_struct *work)
+{
+	struct smc_sock *new_smc = container_of(work, struct smc_sock,
+						smc_listen_work);
+	struct socket *newclcsock = new_smc->clcsock;
+	struct smc_sock *lsmc = new_smc->listen_smc;
+	struct smc_clc_msg_accept_confirm cclc;
+	int local_contact = SMC_REUSE_CONTACT;
+	struct sock *newsmcsk = &new_smc->sk;
+	struct smc_clc_msg_proposal pclc;
+	struct smc_ib_device *smcibdev;
+	struct sockaddr_in peeraddr;
+	struct smc_link *link;
+	int reason_code = 0;
+	int rc = 0, len;
+	__be32 subnet;
+	u8 prefix_len;
+	u8 ibport;
+
+	/* do inband token exchange -
+	 *wait for and receive SMC Proposal CLC message
+	 */
+	reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
+				       SMC_CLC_PROPOSAL);
+	if (reason_code < 0)
+		goto out_err;
+	if (reason_code > 0)
+		goto decline_rdma;
+
+	/* IPSec connections opt out of SMC-R optimizations */
+	if (using_ipsec(new_smc)) {
+		reason_code = SMC_CLC_DECL_IPSEC;
+		goto decline_rdma;
+	}
+
+	/* PNET table look up: search active ib_device and port
+	 * within same PNETID that also contains the ethernet device
+	 * used for the internal TCP socket
+	 */
+	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
+	if (!smcibdev) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+
+	/* determine subnet and mask from internal TCP socket */
+	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+	if ((pclc.outgoing_subnet != subnet) ||
+	    (pclc.prefix_len != prefix_len)) {
+		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+		goto decline_rdma;
+	}
+
+	/* get address of the peer connected to the internal TCP socket */
+	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
+
+	/* allocate connection / link group */
+	mutex_lock(&smc_create_lgr_pending);
+	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
+					smcibdev, ibport, &pclc.lcl, 0);
+	if (local_contact == SMC_REUSE_CONTACT)
+		/* lock no longer needed, free it due to following
+		 * smc_clc_wait_msg() call
+		 */
+		mutex_unlock(&smc_create_lgr_pending);
+	if (local_contact < 0) {
+		rc = local_contact;
+		if (rc == -ENOMEM)
+			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
+		else if (rc == -ENOLINK)
+			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
+		goto decline_rdma;
+	}
+	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+
+	rc = smc_sndbuf_create(new_smc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_MEM;
+		goto decline_rdma;
+	}
+	rc = smc_rmb_create(new_smc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_MEM;
+		goto decline_rdma;
+	}
+
+	rc = smc_clc_send_accept(new_smc, local_contact);
+	if (rc)
+		goto out_err;
+
+	/* receive SMC Confirm CLC message */
+	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
+				       SMC_CLC_CONFIRM);
+	if (reason_code < 0)
+		goto out_err;
+	if (reason_code > 0)
+		goto decline_rdma;
+	smc_conn_save_peer_info(new_smc, &cclc);
+	if (local_contact == SMC_FIRST_CONTACT)
+		smc_link_save_peer_info(link, &cclc);
+
+	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
+	if (rc) {
+		reason_code = SMC_CLC_DECL_INTERR;
+		goto decline_rdma;
+	}
+
+	if (local_contact == SMC_FIRST_CONTACT) {
+		rc = smc_ib_ready_link(link);
+		if (rc) {
+			reason_code = SMC_CLC_DECL_INTERR;
+			goto decline_rdma;
+		}
+		/* QP confirmation over RoCE fabric */
+		reason_code = smc_serv_conf_first_link(new_smc);
+		if (reason_code < 0) {
+			/* peer is not aware of a problem */
+			rc = reason_code;
+			goto out_err;
+		}
+		if (reason_code > 0)
+			goto decline_rdma;
+	}
+
+	smc_tx_init(new_smc);
+	smc_rx_init(new_smc);
+
+out_connected:
+	sk_refcnt_debug_inc(newsmcsk);
+	if (newsmcsk->sk_state == SMC_INIT)
+		newsmcsk->sk_state = SMC_ACTIVE;
+enqueue:
+	if (local_contact == SMC_FIRST_CONTACT)
+		mutex_unlock(&smc_create_lgr_pending);
+	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
+	if (lsmc->sk.sk_state == SMC_LISTEN) {
+		smc_accept_enqueue(&lsmc->sk, newsmcsk);
+	} else { /* no longer listening */
+		smc_close_non_accepted(newsmcsk);
+	}
+	release_sock(&lsmc->sk);
+
+	/* Wake up accept */
+	lsmc->sk.sk_data_ready(&lsmc->sk);
+	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
+	return;
+
+decline_rdma:
+	/* RDMA setup failed, switch back to TCP */
+	smc_conn_free(&new_smc->conn);
+	new_smc->use_fallback = true;
+	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+		rc = smc_clc_send_decline(new_smc, reason_code, 0);
+		if (rc < sizeof(struct smc_clc_msg_decline))
+			goto out_err;
+	}
+	goto out_connected;
+
+out_err:
+	newsmcsk->sk_state = SMC_CLOSED;
+	smc_conn_free(&new_smc->conn);
+	goto enqueue; /* queue new sock with sk_err set */
+}
+
+static void smc_tcp_listen_work(struct work_struct *work)
+{
+	struct smc_sock *lsmc = container_of(work, struct smc_sock,
+					     tcp_listen_work);
+	struct smc_sock *new_smc;
+	int rc = 0;
+
+	lock_sock(&lsmc->sk);
+	while (lsmc->sk.sk_state == SMC_LISTEN) {
+		rc = smc_clcsock_accept(lsmc, &new_smc);
+		if (rc)
+			goto out;
+		if (!new_smc)
+			continue;
+
+		new_smc->listen_smc = lsmc;
+		new_smc->use_fallback = false; /* assume rdma capability first*/
+		sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
+		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
+		smc_copy_sock_settings_to_smc(new_smc);
+		schedule_work(&new_smc->smc_listen_work);
+	}
+
+out:
+	release_sock(&lsmc->sk);
+	lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
+}
+
+static int smc_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+
+	rc = -EINVAL;
+	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
+		goto out;
+
+	rc = 0;
+	if (sk->sk_state == SMC_LISTEN) {
+		sk->sk_max_ack_backlog = backlog;
+		goto out;
+	}
+	/* some socket options are handled in core, so we could not apply
+	 * them to the clc socket -- copy smc socket options to clc socket
+	 */
+	smc_copy_sock_settings_to_clc(smc);
+
+	rc = kernel_listen(smc->clcsock, backlog);
+	if (rc)
+		goto out;
+	sk->sk_max_ack_backlog = backlog;
+	sk->sk_ack_backlog = 0;
+	sk->sk_state = SMC_LISTEN;
+	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+	schedule_work(&smc->tcp_listen_work);
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static int smc_accept(struct socket *sock, struct socket *new_sock,
+		      int flags)
+{
+	struct sock *sk = sock->sk, *nsk;
+	DECLARE_WAITQUEUE(wait, current);
+	struct smc_sock *lsmc;
+	long timeo;
+	int rc = 0;
+
+	lsmc = smc_sk(sk);
+	lock_sock(sk);
+
+	if (lsmc->sk.sk_state != SMC_LISTEN) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* Wait for an incoming connection */
+	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
+	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!timeo) {
+			rc = -EAGAIN;
+			break;
+		}
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		/* wakeup by sk_data_ready in smc_listen_work() */
+		sched_annotate_sleep();
+		lock_sock(sk);
+		if (signal_pending(current)) {
+			rc = sock_intr_errno(timeo);
+			break;
+		}
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	if (!rc)
+		rc = sock_error(nsk);
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static int smc_getname(struct socket *sock, struct sockaddr *addr,
+		       int *len, int peer)
+{
+	struct smc_sock *smc;
+
+	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
+	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
+		return -ENOTCONN;
+
+	smc = smc_sk(sock->sk);
+
+	return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
+}
+
+static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EPIPE;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if ((sk->sk_state != SMC_ACTIVE) &&
+	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+	    (sk->sk_state != SMC_INIT))
+		goto out;
+	if (smc->use_fallback)
+		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
+	else
+		rc = smc_tx_sendmsg(smc, msg, len);
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+		       int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -ENOTCONN;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if ((sk->sk_state == SMC_INIT) ||
+	    (sk->sk_state == SMC_LISTEN) ||
+	    (sk->sk_state == SMC_CLOSED))
+		goto out;
+
+	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+		rc = 0;
+		goto out;
+	}
+
+	if (smc->use_fallback)
+		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
+	else
+		rc = smc_rx_recvmsg(smc, msg, len, flags);
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static unsigned int smc_accept_poll(struct sock *parent)
+{
+	struct smc_sock *isk;
+	struct sock *sk;
+
+	lock_sock(parent);
+	list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
+		sk = (struct sock *)isk;
+
+		if (sk->sk_state == SMC_ACTIVE) {
+			release_sock(parent);
+			return POLLIN | POLLRDNORM;
+		}
+	}
+	release_sock(parent);
+
+	return 0;
+}
+
+static unsigned int smc_poll(struct file *file, struct socket *sock,
+			     poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	unsigned int mask = 0;
+	struct smc_sock *smc;
+	int rc;
+
+	smc = smc_sk(sock->sk);
+	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
+		/* delegate to CLC child sock */
+		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
+		/* if non-blocking connect finished ... */
+		lock_sock(sk);
+		if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
+			sk->sk_err = smc->clcsock->sk->sk_err;
+			if (sk->sk_err) {
+				mask |= POLLERR;
+			} else {
+				rc = smc_connect_rdma(smc);
+				if (rc < 0)
+					mask |= POLLERR;
+				else
+					/* success cases including fallback */
+					mask |= POLLOUT | POLLWRNORM;
+			}
+		}
+		release_sock(sk);
+	} else {
+		sock_poll_wait(file, sk_sleep(sk), wait);
+		if (sk->sk_state == SMC_LISTEN)
+			/* woken up by sk_data_ready in smc_listen_work() */
+			mask |= smc_accept_poll(sk);
+		if (sk->sk_err)
+			mask |= POLLERR;
+		if (atomic_read(&smc->conn.sndbuf_space) ||
+		    (sk->sk_shutdown & SEND_SHUTDOWN)) {
+			mask |= POLLOUT | POLLWRNORM;
+		} else {
+			sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		}
+		if (atomic_read(&smc->conn.bytes_to_rcv))
+			mask |= POLLIN | POLLRDNORM;
+		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+		    (sk->sk_state == SMC_CLOSED))
+			mask |= POLLHUP;
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+		if (sk->sk_state == SMC_APPCLOSEWAIT1)
+			mask |= POLLIN;
+
+	}
+
+	return mask;
+}
+
+static int smc_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EINVAL;
+	int rc1 = 0;
+
+	smc = smc_sk(sk);
+
+	if ((how < SHUT_RD) || (how > SHUT_RDWR))
+		return rc;
+
+	lock_sock(sk);
+
+	rc = -ENOTCONN;
+	if ((sk->sk_state != SMC_LISTEN) &&
+	    (sk->sk_state != SMC_ACTIVE) &&
+	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
+	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
+	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
+	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
+		goto out;
+	if (smc->use_fallback) {
+		rc = kernel_sock_shutdown(smc->clcsock, how);
+		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
+		if (sk->sk_shutdown == SHUTDOWN_MASK)
+			sk->sk_state = SMC_CLOSED;
+		goto out;
+	}
+	switch (how) {
+	case SHUT_RDWR:		/* shutdown in both directions */
+		rc = smc_close_active(smc);
+		break;
+	case SHUT_WR:
+		rc = smc_close_shutdown_write(smc);
+		break;
+	case SHUT_RD:
+		if (sk->sk_state == SMC_LISTEN)
+			rc = smc_close_active(smc);
+		else
+			rc = 0;
+			/* nothing more to do because peer is not involved */
+		break;
+	}
+	rc1 = kernel_sock_shutdown(smc->clcsock, how);
+	/* map sock_shutdown_cmd constants to sk_shutdown value range */
+	sk->sk_shutdown |= how + 1;
+
+out:
+	release_sock(sk);
+	return rc ? rc : rc1;
+}
+
+static int smc_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+
+	smc = smc_sk(sk);
+
+	/* generic setsockopts reaching us here always apply to the
+	 * CLC socket
+	 */
+	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+					     optval, optlen);
+}
+
+static int smc_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct smc_sock *smc;
+
+	smc = smc_sk(sock->sk);
+	/* socket options apply to the CLC socket */
+	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
+					     optval, optlen);
+}
+
+static int smc_ioctl(struct socket *sock, unsigned int cmd,
+		     unsigned long arg)
+{
+	struct smc_sock *smc;
+
+	smc = smc_sk(sock->sk);
+	if (smc->use_fallback)
+		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
+	else
+		return sock_no_ioctl(sock, cmd, arg);
+}
+
+static ssize_t smc_sendpage(struct socket *sock, struct page *page,
+			    int offset, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EPIPE;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if (sk->sk_state != SMC_ACTIVE)
+		goto out;
+	if (smc->use_fallback)
+		rc = kernel_sendpage(smc->clcsock, page, offset,
+				     size, flags);
+	else
+		rc = sock_no_sendpage(sock, page, offset, size, flags);
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
+			       struct pipe_inode_info *pipe, size_t len,
+				    unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -ENOTCONN;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+		goto out;
+	if (smc->use_fallback) {
+		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
+						    pipe, len, flags);
+	} else {
+		rc = -EOPNOTSUPP;
+	}
+out:
+	release_sock(sk);
+	return rc;
+}
+
+/* must look like tcp */
+static const struct proto_ops smc_sock_ops = {
+	.family		= PF_SMC,
+	.owner		= THIS_MODULE,
+	.release	= smc_release,
+	.bind		= smc_bind,
+	.connect	= smc_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= smc_accept,
+	.getname	= smc_getname,
+	.poll		= smc_poll,
+	.ioctl		= smc_ioctl,
+	.listen		= smc_listen,
+	.shutdown	= smc_shutdown,
+	.setsockopt	= smc_setsockopt,
+	.getsockopt	= smc_getsockopt,
+	.sendmsg	= smc_sendmsg,
+	.recvmsg	= smc_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= smc_sendpage,
+	.splice_read	= smc_splice_read,
+};
+
+static int smc_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct smc_sock *smc;
+	struct sock *sk;
+	int rc;
+
+	rc = -ESOCKTNOSUPPORT;
+	if (sock->type != SOCK_STREAM)
+		goto out;
+
+	rc = -EPROTONOSUPPORT;
+	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
+		goto out;
+
+	rc = -ENOBUFS;
+	sock->ops = &smc_sock_ops;
+	sk = smc_sock_alloc(net, sock);
+	if (!sk)
+		goto out;
+
+	/* create internal TCP socket for CLC handshake and fallback */
+	smc = smc_sk(sk);
+	smc->use_fallback = false; /* assume rdma capability first */
+	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
+			      IPPROTO_TCP, &smc->clcsock);
+	if (rc)
+		sk_common_release(sk);
+	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
+	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
+
+out:
+	return rc;
+}
+
+static const struct net_proto_family smc_sock_family_ops = {
+	.family	= PF_SMC,
+	.owner	= THIS_MODULE,
+	.create	= smc_create,
+};
+
+static int __init smc_init(void)
+{
+	int rc;
+
+	rc = smc_pnet_init();
+	if (rc)
+		return rc;
+
+	rc = smc_llc_init();
+	if (rc) {
+		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
+		goto out_pnet;
+	}
+
+	rc = smc_cdc_init();
+	if (rc) {
+		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
+		goto out_pnet;
+	}
+
+	rc = proto_register(&smc_proto, 1);
+	if (rc) {
+		pr_err("%s: proto_register fails with %d\n", __func__, rc);
+		goto out_pnet;
+	}
+
+	rc = sock_register(&smc_sock_family_ops);
+	if (rc) {
+		pr_err("%s: sock_register fails with %d\n", __func__, rc);
+		goto out_proto;
+	}
+	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
+
+	rc = smc_ib_register_client();
+	if (rc) {
+		pr_err("%s: ib_register fails with %d\n", __func__, rc);
+		goto out_sock;
+	}
+
+	return 0;
+
+out_sock:
+	sock_unregister(PF_SMC);
+out_proto:
+	proto_unregister(&smc_proto);
+out_pnet:
+	smc_pnet_exit();
+	return rc;
+}
+
+static void __exit smc_exit(void)
+{
+	struct smc_link_group *lgr, *lg;
+	LIST_HEAD(lgr_freeing_list);
+
+	spin_lock_bh(&smc_lgr_list.lock);
+	if (!list_empty(&smc_lgr_list.list))
+		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
+	spin_unlock_bh(&smc_lgr_list.lock);
+	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
+		list_del_init(&lgr->list);
+		smc_lgr_free(lgr); /* free link group */
+	}
+	smc_ib_unregister_client();
+	sock_unregister(PF_SMC);
+	proto_unregister(&smc_proto);
+	smc_pnet_exit();
+}
+
+module_init(smc_init);
+module_exit(smc_exit);
+
+MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("smc socket address family");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_SMC);
diff --git a/net/smc/smc.h b/net/smc/smc.h
new file mode 100644
index 000000000000..ee5fbea24549
--- /dev/null
+++ b/net/smc/smc.h
@@ -0,0 +1,274 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for the SMC module (socket related)
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+#ifndef __SMC_H
+#define __SMC_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <linux/compiler.h> /* __aligned */
+#include <net/sock.h>
+
+#include "smc_ib.h"
+
+#define SMCPROTO_SMC		0	/* SMC protocol */
+
+#define SMC_MAX_PORTS		2	/* Max # of ports */
+
+extern struct proto smc_proto;
+
+#ifdef ATOMIC64_INIT
+#define KERNEL_HAS_ATOMIC64
+#endif
+
+enum smc_state {		/* possible states of an SMC socket */
+	SMC_ACTIVE	= 1,
+	SMC_INIT	= 2,
+	SMC_CLOSED	= 7,
+	SMC_LISTEN	= 10,
+	/* normal close */
+	SMC_PEERCLOSEWAIT1	= 20,
+	SMC_PEERCLOSEWAIT2	= 21,
+	SMC_APPFINCLOSEWAIT	= 24,
+	SMC_APPCLOSEWAIT1	= 22,
+	SMC_APPCLOSEWAIT2	= 23,
+	SMC_PEERFINCLOSEWAIT	= 25,
+	/* abnormal close */
+	SMC_PEERABORTWAIT	= 26,
+	SMC_PROCESSABORT	= 27,
+};
+
+struct smc_link_group;
+
+struct smc_wr_rx_hdr {	/* common prefix part of LLC and CDC to demultiplex */
+	u8			type;
+} __aligned(1);
+
+struct smc_cdc_conn_state_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8	peer_done_writing : 1;	/* Sending done indicator */
+	u8	peer_conn_closed : 1;	/* Peer connection closed indicator */
+	u8	peer_conn_abort : 1;	/* Abnormal close indicator */
+	u8	reserved : 5;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8	reserved : 5;
+	u8	peer_conn_abort : 1;
+	u8	peer_conn_closed : 1;
+	u8	peer_done_writing : 1;
+#endif
+};
+
+struct smc_cdc_producer_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8	write_blocked : 1;	/* Writing Blocked, no rx buf space */
+	u8	urg_data_pending : 1;	/* Urgent Data Pending */
+	u8	urg_data_present : 1;	/* Urgent Data Present */
+	u8	cons_curs_upd_req : 1;	/* cursor update requested */
+	u8	failover_validation : 1;/* message replay due to failover */
+	u8	reserved : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8	reserved : 3;
+	u8	failover_validation : 1;
+	u8	cons_curs_upd_req : 1;
+	u8	urg_data_present : 1;
+	u8	urg_data_pending : 1;
+	u8	write_blocked : 1;
+#endif
+};
+
+/* in host byte order */
+union smc_host_cursor {	/* SMC cursor - an offset in an RMBE */
+	struct {
+		u16	reserved;
+		u16	wrap;		/* window wrap sequence number */
+		u32	count;		/* cursor (= offset) part */
+	};
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_t		acurs;	/* for atomic processing */
+#else
+	u64			acurs;	/* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in host byte order, except for flag bitfields in network byte order */
+struct smc_host_cdc_msg {		/* Connection Data Control message */
+	struct smc_wr_rx_hdr		common; /* .type = 0xFE */
+	u8				len;	/* length = 44 */
+	u16				seqno;	/* connection seq # */
+	u32				token;	/* alert_token */
+	union smc_host_cursor		prod;		/* producer cursor */
+	union smc_host_cursor		cons;		/* consumer cursor,
+							 * piggy backed "ack"
+							 */
+	struct smc_cdc_producer_flags	prod_flags;	/* conn. tx/rx status */
+	struct smc_cdc_conn_state_flags	conn_state_flags; /* peer conn. status*/
+	u8				reserved[18];
+} __aligned(8);
+
+struct smc_connection {
+	struct rb_node		alert_node;
+	struct smc_link_group	*lgr;		/* link group of connection */
+	u32			alert_token_local; /* unique conn. id */
+	u8			peer_conn_idx;	/* from tcp handshake */
+	int			peer_rmbe_size;	/* size of peer rx buffer */
+	atomic_t		peer_rmbe_space;/* remaining free bytes in peer
+						 * rmbe
+						 */
+	int			rtoken_idx;	/* idx to peer RMB rkey/addr */
+
+	struct smc_buf_desc	*sndbuf_desc;	/* send buffer descriptor */
+	int			sndbuf_size;	/* sndbuf size <== sock wmem */
+	struct smc_buf_desc	*rmb_desc;	/* RMBE descriptor */
+	int			rmbe_size;	/* RMBE size <== sock rmem */
+	int			rmbe_size_short;/* compressed notation */
+	int			rmbe_update_limit;
+						/* lower limit for consumer
+						 * cursor update
+						 */
+
+	struct smc_host_cdc_msg	local_tx_ctrl;	/* host byte order staging
+						 * buffer for CDC msg send
+						 * .prod cf. TCP snd_nxt
+						 * .cons cf. TCP sends ack
+						 */
+	union smc_host_cursor	tx_curs_prep;	/* tx - prepared data
+						 * snd_max..wmem_alloc
+						 */
+	union smc_host_cursor	tx_curs_sent;	/* tx - sent data
+						 * snd_nxt ?
+						 */
+	union smc_host_cursor	tx_curs_fin;	/* tx - confirmed by peer
+						 * snd-wnd-begin ?
+						 */
+	atomic_t		sndbuf_space;	/* remaining space in sndbuf */
+	u16			tx_cdc_seq;	/* sequence # for CDC send */
+	spinlock_t		send_lock;	/* protect wr_sends */
+	struct work_struct	tx_work;	/* retry of smc_cdc_msg_send */
+
+	struct smc_host_cdc_msg	local_rx_ctrl;	/* filled during event_handl.
+						 * .prod cf. TCP rcv_nxt
+						 * .cons cf. TCP snd_una
+						 */
+	union smc_host_cursor	rx_curs_confirmed; /* confirmed to peer
+						    * source of snd_una ?
+						    */
+	atomic_t		bytes_to_rcv;	/* arrived data,
+						 * not yet received
+						 */
+#ifndef KERNEL_HAS_ATOMIC64
+	spinlock_t		acurs_lock;	/* protect cursors */
+#endif
+};
+
+struct smc_sock {				/* smc sock container */
+	struct sock		sk;
+	struct socket		*clcsock;	/* internal tcp socket */
+	struct smc_connection	conn;		/* smc connection */
+	struct sockaddr		*addr;		/* inet connect address */
+	struct smc_sock		*listen_smc;	/* listen parent */
+	struct work_struct	tcp_listen_work;/* handle tcp socket accepts */
+	struct work_struct	smc_listen_work;/* prepare new accept socket */
+	struct list_head	accept_q;	/* sockets to be accepted */
+	spinlock_t		accept_q_lock;	/* protects accept_q */
+	struct delayed_work	sock_put_work;	/* final socket freeing */
+	bool			use_fallback;	/* fallback to tcp */
+	u8			wait_close_tx_prepared : 1;
+						/* shutdown wr or close
+						 * started, waiting for unsent
+						 * data to be sent
+						 */
+};
+
+static inline struct smc_sock *smc_sk(const struct sock *sk)
+{
+	return (struct smc_sock *)sk;
+}
+
+#define SMC_SYSTEMID_LEN		8
+
+extern u8	local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+
+/* convert an u32 value into network byte order, store it into a 3 byte field */
+static inline void hton24(u8 *net, u32 host)
+{
+	__be32 t;
+
+	t = cpu_to_be32(host);
+	memcpy(net, ((u8 *)&t) + 1, 3);
+}
+
+/* convert a received 3 byte field into host byte order*/
+static inline u32 ntoh24(u8 *net)
+{
+	__be32 t = 0;
+
+	memcpy(((u8 *)&t) + 1, net, 3);
+	return be32_to_cpu(t);
+}
+
+#define SMC_BUF_MIN_SIZE 16384		/* minimum size of an RMB */
+
+#define SMC_RMBE_SIZES	16	/* number of distinct sizes for an RMBE */
+/* theoretically, the RFC states that largest size would be 512K,
+ * i.e. compressed 5 and thus 6 sizes (0..5), despite
+ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
+ */
+
+/* convert the RMB size into the compressed notation - minimum 16K.
+ * In contrast to plain ilog2, this rounds towards the next power of 2,
+ * so the socket application gets at least its desired sndbuf / rcvbuf size.
+ */
+static inline u8 smc_compress_bufsize(int size)
+{
+	u8 compressed;
+
+	if (size <= SMC_BUF_MIN_SIZE)
+		return 0;
+
+	size = (size - 1) >> 14;
+	compressed = ilog2(size) + 1;
+	if (compressed >= SMC_RMBE_SIZES)
+		compressed = SMC_RMBE_SIZES - 1;
+	return compressed;
+}
+
+/* convert the RMB size from compressed notation into integer */
+static inline int smc_uncompress_bufsize(u8 compressed)
+{
+	u32 size;
+
+	size = 0x00000001 << (((int)compressed) + 14);
+	return (int)size;
+}
+
+#ifdef CONFIG_XFRM
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+	return (smc->clcsock->sk->sk_policy[0] ||
+		smc->clcsock->sk->sk_policy[1]) ? 1 : 0;
+}
+#else
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+	return 0;
+}
+#endif
+
+struct smc_clc_msg_local;
+
+int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
+			 u8 *prefix_len);
+void smc_conn_free(struct smc_connection *conn);
+int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
+		    struct smc_ib_device *smcibdev, u8 ibport,
+		    struct smc_clc_msg_local *lcl, int srv_first_contact);
+struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
+void smc_close_non_accepted(struct sock *sk);
+
+#endif	/* __SMC_H */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
new file mode 100644
index 000000000000..5a339493872e
--- /dev/null
+++ b/net/smc/smc_cdc.c
@@ -0,0 +1,304 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ * handles flow control
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/spinlock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+#include "smc_close.h"
+
+/********************************** send *************************************/
+
+struct smc_cdc_tx_pend {
+	struct smc_connection	*conn;		/* socket connection */
+	union smc_host_cursor	cursor;	/* tx sndbuf cursor sent */
+	union smc_host_cursor	p_cursor;	/* rx RMBE cursor produced */
+	u16			ctrl_seq;	/* conn. tx sequence # */
+};
+
+/* handler for send/transmission completion of a CDC msg */
+static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
+			       struct smc_link *link,
+			       enum ib_wc_status wc_status)
+{
+	struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
+	struct smc_sock *smc;
+	int diff;
+
+	if (!cdcpend->conn)
+		/* already dismissed */
+		return;
+
+	smc = container_of(cdcpend->conn, struct smc_sock, conn);
+	bh_lock_sock(&smc->sk);
+	if (!wc_status) {
+		diff = smc_curs_diff(cdcpend->conn->sndbuf_size,
+				     &cdcpend->conn->tx_curs_fin,
+				     &cdcpend->cursor);
+		/* sndbuf_space is decreased in smc_sendmsg */
+		smp_mb__before_atomic();
+		atomic_add(diff, &cdcpend->conn->sndbuf_space);
+		/* guarantee 0 <= sndbuf_space <= sndbuf_size */
+		smp_mb__after_atomic();
+		smc_curs_write(&cdcpend->conn->tx_curs_fin,
+			       smc_curs_read(&cdcpend->cursor, cdcpend->conn),
+			       cdcpend->conn);
+	}
+	smc_tx_sndbuf_nonfull(smc);
+	if (smc->sk.sk_state != SMC_ACTIVE)
+		/* wake up smc_close_wait_tx_pends() */
+		smc->sk.sk_state_change(&smc->sk);
+	bh_unlock_sock(&smc->sk);
+}
+
+int smc_cdc_get_free_slot(struct smc_link *link,
+			  struct smc_wr_buf **wr_buf,
+			  struct smc_cdc_tx_pend **pend)
+{
+	return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
+				       (struct smc_wr_tx_pend_priv **)pend);
+}
+
+static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
+					    struct smc_cdc_tx_pend *pend)
+{
+	BUILD_BUG_ON_MSG(
+		sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
+		"must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
+	BUILD_BUG_ON_MSG(
+		offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
+		"must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+	BUILD_BUG_ON_MSG(
+		sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+		"must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)");
+	pend->conn = conn;
+	pend->cursor = conn->tx_curs_sent;
+	pend->p_cursor = conn->local_tx_ctrl.prod;
+	pend->ctrl_seq = conn->tx_cdc_seq;
+}
+
+int smc_cdc_msg_send(struct smc_connection *conn,
+		     struct smc_wr_buf *wr_buf,
+		     struct smc_cdc_tx_pend *pend)
+{
+	struct smc_link *link;
+	int rc;
+
+	link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+	smc_cdc_add_pending_send(conn, pend);
+
+	conn->tx_cdc_seq++;
+	conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
+	smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf,
+			    &conn->local_tx_ctrl, conn);
+	rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
+	if (!rc)
+		smc_curs_write(&conn->rx_curs_confirmed,
+			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+			       conn);
+
+	return rc;
+}
+
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+{
+	struct smc_cdc_tx_pend *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
+				   &pend);
+	if (rc)
+		return rc;
+
+	return smc_cdc_msg_send(conn, wr_buf, pend);
+}
+
+static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
+			      unsigned long data)
+{
+	struct smc_connection *conn = (struct smc_connection *)data;
+	struct smc_cdc_tx_pend *cdc_pend =
+		(struct smc_cdc_tx_pend *)tx_pend;
+
+	return cdc_pend->conn == conn;
+}
+
+static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend)
+{
+	struct smc_cdc_tx_pend *cdc_pend =
+		(struct smc_cdc_tx_pend *)tx_pend;
+
+	cdc_pend->conn = NULL;
+}
+
+void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
+{
+	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+	smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE,
+				smc_cdc_tx_filter, smc_cdc_tx_dismisser,
+				(unsigned long)conn);
+}
+
+bool smc_cdc_tx_has_pending(struct smc_connection *conn)
+{
+	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+	return smc_wr_tx_has_pending(link, SMC_CDC_MSG_TYPE,
+				     smc_cdc_tx_filter, (unsigned long)conn);
+}
+
+/********************************* receive ***********************************/
+
+static inline bool smc_cdc_before(u16 seq1, u16 seq2)
+{
+	return (s16)(seq1 - seq2) < 0;
+}
+
+static void smc_cdc_msg_recv_action(struct smc_sock *smc,
+				    struct smc_link *link,
+				    struct smc_cdc_msg *cdc)
+{
+	union smc_host_cursor cons_old, prod_old;
+	struct smc_connection *conn = &smc->conn;
+	int diff_cons, diff_prod;
+
+	if (!cdc->prod_flags.failover_validation) {
+		if (smc_cdc_before(ntohs(cdc->seqno),
+				   conn->local_rx_ctrl.seqno))
+			/* received seqno is old */
+			return;
+	}
+	smc_curs_write(&prod_old,
+		       smc_curs_read(&conn->local_rx_ctrl.prod, conn),
+		       conn);
+	smc_curs_write(&cons_old,
+		       smc_curs_read(&conn->local_rx_ctrl.cons, conn),
+		       conn);
+	smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
+
+	diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
+				  &conn->local_rx_ctrl.cons);
+	if (diff_cons) {
+		/* peer_rmbe_space is decreased during data transfer with RDMA
+		 * write
+		 */
+		smp_mb__before_atomic();
+		atomic_add(diff_cons, &conn->peer_rmbe_space);
+		/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+		smp_mb__after_atomic();
+	}
+
+	diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old,
+				  &conn->local_rx_ctrl.prod);
+	if (diff_prod) {
+		/* bytes_to_rcv is decreased in smc_recvmsg */
+		smp_mb__before_atomic();
+		atomic_add(diff_prod, &conn->bytes_to_rcv);
+		/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+		smp_mb__after_atomic();
+		smc->sk.sk_data_ready(&smc->sk);
+	}
+
+	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+		smc->sk.sk_err = ECONNRESET;
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+	}
+	if (smc_cdc_rxed_any_close_or_senddone(conn))
+		smc_close_passive_received(smc);
+
+	/* piggy backed tx info */
+	/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
+	if (diff_cons && smc_tx_prepared_sends(conn)) {
+		smc_tx_sndbuf_nonempty(conn);
+		/* trigger socket release if connection closed */
+		smc_close_wake_tx_prepared(smc);
+	}
+
+	/* subsequent patch: trigger socket release if connection closed */
+
+	/* socket connected but not accepted */
+	if (!smc->sk.sk_socket)
+		return;
+
+	/* data available */
+	if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
+	    (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
+		smc_tx_consumer_update(conn);
+}
+
+/* called under tasklet context */
+static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc,
+				    struct smc_link *link, u64 wr_id)
+{
+	struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+						  lnk[SMC_SINGLE_LINK]);
+	struct smc_connection *connection;
+	struct smc_sock *smc;
+
+	/* lookup connection */
+	read_lock_bh(&lgr->conns_lock);
+	connection = smc_lgr_find_conn(ntohl(cdc->token), lgr);
+	if (!connection) {
+		read_unlock_bh(&lgr->conns_lock);
+		return;
+	}
+	smc = container_of(connection, struct smc_sock, conn);
+	sock_hold(&smc->sk);
+	read_unlock_bh(&lgr->conns_lock);
+	bh_lock_sock(&smc->sk);
+	smc_cdc_msg_recv_action(smc, link, cdc);
+	bh_unlock_sock(&smc->sk);
+	sock_put(&smc->sk); /* no free sk in softirq-context */
+}
+
+/***************************** init, exit, misc ******************************/
+
+static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
+{
+	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+	struct smc_cdc_msg *cdc = buf;
+
+	if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
+		return; /* short message */
+	if (cdc->len != sizeof(*cdc))
+		return; /* invalid message */
+	smc_cdc_msg_recv(cdc, link, wc->wr_id);
+}
+
+static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
+	{
+		.handler	= smc_cdc_rx_handler,
+		.type		= SMC_CDC_MSG_TYPE
+	},
+	{
+		.handler	= NULL,
+	}
+};
+
+int __init smc_cdc_init(void)
+{
+	struct smc_wr_rx_handler *handler;
+	int rc = 0;
+
+	for (handler = smc_cdc_rx_handlers; handler->handler; handler++) {
+		INIT_HLIST_NODE(&handler->list);
+		rc = smc_wr_rx_register_handler(handler);
+		if (rc)
+			break;
+	}
+	return rc;
+}
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
new file mode 100644
index 000000000000..8e1d76f26007
--- /dev/null
+++ b/net/smc/smc_cdc.h
@@ -0,0 +1,218 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CDC_H
+#define SMC_CDC_H
+
+#include <linux/kernel.h> /* max_t */
+#include <linux/atomic.h>
+#include <linux/in.h>
+#include <linux/compiler.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+
+#define	SMC_CDC_MSG_TYPE		0xFE
+
+/* in network byte order */
+union smc_cdc_cursor {		/* SMC cursor */
+	struct {
+		__be16	reserved;
+		__be16	wrap;
+		__be32	count;
+	};
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_t	acurs;		/* for atomic processing */
+#else
+	u64		acurs;		/* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in network byte order */
+struct smc_cdc_msg {
+	struct smc_wr_rx_hdr		common; /* .type = 0xFE */
+	u8				len;	/* 44 */
+	__be16				seqno;
+	__be32				token;
+	union smc_cdc_cursor		prod;
+	union smc_cdc_cursor		cons;	/* piggy backed "ack" */
+	struct smc_cdc_producer_flags	prod_flags;
+	struct smc_cdc_conn_state_flags	conn_state_flags;
+	u8				reserved[18];
+} __aligned(8);
+
+static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
+{
+	return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort ||
+	       conn->local_rx_ctrl.conn_state_flags.peer_conn_closed;
+}
+
+static inline bool smc_cdc_rxed_any_close_or_senddone(
+	struct smc_connection *conn)
+{
+	return smc_cdc_rxed_any_close(conn) ||
+	       conn->local_rx_ctrl.conn_state_flags.peer_done_writing;
+}
+
+static inline void smc_curs_add(int size, union smc_host_cursor *curs,
+				int value)
+{
+	curs->count += value;
+	if (curs->count >= size) {
+		curs->wrap++;
+		curs->count -= size;
+	}
+}
+
+/* SMC cursors are 8 bytes long and require atomic reading and writing */
+static inline u64 smc_curs_read(union smc_host_cursor *curs,
+				struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+	unsigned long flags;
+	u64 ret;
+
+	spin_lock_irqsave(&conn->acurs_lock, flags);
+	ret = curs->acurs;
+	spin_unlock_irqrestore(&conn->acurs_lock, flags);
+	return ret;
+#else
+	return atomic64_read(&curs->acurs);
+#endif
+}
+
+static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs,
+				    struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+	unsigned long flags;
+	u64 ret;
+
+	spin_lock_irqsave(&conn->acurs_lock, flags);
+	ret = curs->acurs;
+	spin_unlock_irqrestore(&conn->acurs_lock, flags);
+	return ret;
+#else
+	return atomic64_read(&curs->acurs);
+#endif
+}
+
+static inline void smc_curs_write(union smc_host_cursor *curs, u64 val,
+				  struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+	unsigned long flags;
+
+	spin_lock_irqsave(&conn->acurs_lock, flags);
+	curs->acurs = val;
+	spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else
+	atomic64_set(&curs->acurs, val);
+#endif
+}
+
+static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val,
+				      struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+	unsigned long flags;
+
+	spin_lock_irqsave(&conn->acurs_lock, flags);
+	curs->acurs = val;
+	spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else
+	atomic64_set(&curs->acurs, val);
+#endif
+}
+
+/* calculate cursor difference between old and new, where old <= new */
+static inline int smc_curs_diff(unsigned int size,
+				union smc_host_cursor *old,
+				union smc_host_cursor *new)
+{
+	if (old->wrap != new->wrap)
+		return max_t(int, 0,
+			     ((size - old->count) + new->count));
+
+	return max_t(int, 0, (new->count - old->count));
+}
+
+static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
+					  union smc_host_cursor *local,
+					  struct smc_connection *conn)
+{
+	union smc_host_cursor temp;
+
+	smc_curs_write(&temp, smc_curs_read(local, conn), conn);
+	peer->count = htonl(temp.count);
+	peer->wrap = htons(temp.wrap);
+	/* peer->reserved = htons(0); must be ensured by caller */
+}
+
+static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer,
+				       struct smc_host_cdc_msg *local,
+				       struct smc_connection *conn)
+{
+	peer->common.type = local->common.type;
+	peer->len = local->len;
+	peer->seqno = htons(local->seqno);
+	peer->token = htonl(local->token);
+	smc_host_cursor_to_cdc(&peer->prod, &local->prod, conn);
+	smc_host_cursor_to_cdc(&peer->cons, &local->cons, conn);
+	peer->prod_flags = local->prod_flags;
+	peer->conn_state_flags = local->conn_state_flags;
+}
+
+static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
+					  union smc_cdc_cursor *peer,
+					  struct smc_connection *conn)
+{
+	union smc_host_cursor temp, old;
+	union smc_cdc_cursor net;
+
+	smc_curs_write(&old, smc_curs_read(local, conn), conn);
+	smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn);
+	temp.count = ntohl(net.count);
+	temp.wrap = ntohs(net.wrap);
+	if ((old.wrap > temp.wrap) && temp.wrap)
+		return;
+	if ((old.wrap == temp.wrap) &&
+	    (old.count > temp.count))
+		return;
+	smc_curs_write(local, smc_curs_read(&temp, conn), conn);
+}
+
+static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+				       struct smc_cdc_msg *peer,
+				       struct smc_connection *conn)
+{
+	local->common.type = peer->common.type;
+	local->len = peer->len;
+	local->seqno = ntohs(peer->seqno);
+	local->token = ntohl(peer->token);
+	smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn);
+	smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn);
+	local->prod_flags = peer->prod_flags;
+	local->conn_state_flags = peer->conn_state_flags;
+}
+
+struct smc_cdc_tx_pend;
+
+int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf,
+			  struct smc_cdc_tx_pend **pend);
+void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
+int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
+		     struct smc_cdc_tx_pend *pend);
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
+bool smc_cdc_tx_has_pending(struct smc_connection *conn);
+int smc_cdc_init(void) __init;
+
+#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
new file mode 100644
index 000000000000..cc6b6f8651eb
--- /dev/null
+++ b/net/smc/smc_clc.c
@@ -0,0 +1,280 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  CLC (connection layer control) handshake over initial TCP socket to
+ *  prepare for RDMA traffic
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_ib.h"
+
+/* Wait for data on the tcp-socket, analyze received data
+ * Returns:
+ * 0 if success and it was not a decline that we received.
+ * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send.
+ * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
+ */
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+		     u8 expected_type)
+{
+	struct sock *clc_sk = smc->clcsock->sk;
+	struct smc_clc_msg_hdr *clcm = buf;
+	struct msghdr msg = {NULL, 0};
+	int reason_code = 0;
+	struct kvec vec;
+	int len, datlen;
+	int krflags;
+
+	/* peek the first few bytes to determine length of data to receive
+	 * so we don't consume any subsequent CLC message or payload data
+	 * in the TCP byte stream
+	 */
+	vec.iov_base = buf;
+	vec.iov_len = buflen;
+	krflags = MSG_PEEK | MSG_WAITALL;
+	smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
+	len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1,
+			     sizeof(struct smc_clc_msg_hdr), krflags);
+	if (signal_pending(current)) {
+		reason_code = -EINTR;
+		clc_sk->sk_err = EINTR;
+		smc->sk.sk_err = EINTR;
+		goto out;
+	}
+	if (clc_sk->sk_err) {
+		reason_code = -clc_sk->sk_err;
+		smc->sk.sk_err = clc_sk->sk_err;
+		goto out;
+	}
+	if (!len) { /* peer has performed orderly shutdown */
+		smc->sk.sk_err = ECONNRESET;
+		reason_code = -ECONNRESET;
+		goto out;
+	}
+	if (len < 0) {
+		smc->sk.sk_err = -len;
+		reason_code = len;
+		goto out;
+	}
+	datlen = ntohs(clcm->length);
+	if ((len < sizeof(struct smc_clc_msg_hdr)) ||
+	    (datlen < sizeof(struct smc_clc_msg_decline)) ||
+	    (datlen > sizeof(struct smc_clc_msg_accept_confirm)) ||
+	    memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
+	    ((clcm->type != SMC_CLC_DECLINE) &&
+	     (clcm->type != expected_type))) {
+		smc->sk.sk_err = EPROTO;
+		reason_code = -EPROTO;
+		goto out;
+	}
+
+	/* receive the complete CLC message */
+	vec.iov_base = buf;
+	vec.iov_len = buflen;
+	memset(&msg, 0, sizeof(struct msghdr));
+	krflags = MSG_WAITALL;
+	smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
+	len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
+	if (len < datlen) {
+		smc->sk.sk_err = EPROTO;
+		reason_code = -EPROTO;
+		goto out;
+	}
+	if (clcm->type == SMC_CLC_DECLINE) {
+		reason_code = SMC_CLC_DECL_REPLY;
+		if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis)
+			== SMC_CLC_DECL_SYNCERR)
+			smc->conn.lgr->sync_err = true;
+	}
+
+out:
+	return reason_code;
+}
+
+/* send CLC DECLINE message across internal TCP socket */
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
+			 u8 out_of_sync)
+{
+	struct smc_clc_msg_decline dclc;
+	struct msghdr msg;
+	struct kvec vec;
+	int len;
+
+	memset(&dclc, 0, sizeof(dclc));
+	memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	dclc.hdr.type = SMC_CLC_DECLINE;
+	dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
+	dclc.hdr.version = SMC_CLC_V1;
+	dclc.hdr.flag = out_of_sync ? 1 : 0;
+	memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
+	dclc.peer_diagnosis = htonl(peer_diag_info);
+	memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+	memset(&msg, 0, sizeof(msg));
+	vec.iov_base = &dclc;
+	vec.iov_len = sizeof(struct smc_clc_msg_decline);
+	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
+			     sizeof(struct smc_clc_msg_decline));
+	if (len < sizeof(struct smc_clc_msg_decline))
+		smc->sk.sk_err = EPROTO;
+	if (len < 0)
+		smc->sk.sk_err = -len;
+	return len;
+}
+
+/* send CLC PROPOSAL message across internal TCP socket */
+int smc_clc_send_proposal(struct smc_sock *smc,
+			  struct smc_ib_device *smcibdev,
+			  u8 ibport)
+{
+	struct smc_clc_msg_proposal pclc;
+	int reason_code = 0;
+	struct msghdr msg;
+	struct kvec vec;
+	int len, rc;
+
+	/* send SMC Proposal CLC message */
+	memset(&pclc, 0, sizeof(pclc));
+	memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	pclc.hdr.type = SMC_CLC_PROPOSAL;
+	pclc.hdr.length = htons(sizeof(pclc));
+	pclc.hdr.version = SMC_CLC_V1;		/* SMC version */
+	memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+	memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
+	memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
+
+	/* determine subnet and mask from internal TCP socket */
+	rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
+				  &pclc.prefix_len);
+	if (rc)
+		return SMC_CLC_DECL_CNFERR; /* configuration error */
+	memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	memset(&msg, 0, sizeof(msg));
+	vec.iov_base = &pclc;
+	vec.iov_len = sizeof(pclc);
+	/* due to the few bytes needed for clc-handshake this cannot block */
+	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
+	if (len < sizeof(pclc)) {
+		if (len >= 0) {
+			reason_code = -ENETUNREACH;
+			smc->sk.sk_err = -reason_code;
+		} else {
+			smc->sk.sk_err = smc->clcsock->sk->sk_err;
+			reason_code = -smc->sk.sk_err;
+		}
+	}
+
+	return reason_code;
+}
+
+/* send CLC CONFIRM message across internal TCP socket */
+int smc_clc_send_confirm(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	struct smc_clc_msg_accept_confirm cclc;
+	struct smc_link *link;
+	int reason_code = 0;
+	struct msghdr msg;
+	struct kvec vec;
+	int len;
+
+	link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+	/* send SMC Confirm CLC msg */
+	memset(&cclc, 0, sizeof(cclc));
+	memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	cclc.hdr.type = SMC_CLC_CONFIRM;
+	cclc.hdr.length = htons(sizeof(cclc));
+	cclc.hdr.version = SMC_CLC_V1;		/* SMC version */
+	memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+	memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+	       SMC_GID_SIZE);
+	memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
+	hton24(cclc.qpn, link->roce_qp->qp_num);
+	cclc.rmb_rkey =
+		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+	cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
+	cclc.rmbe_alert_token = htonl(conn->alert_token_local);
+	cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
+	cclc.rmbe_size = conn->rmbe_size_short;
+	cclc.rmb_dma_addr =
+		cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+	hton24(cclc.psn, link->psn_initial);
+
+	memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+	memset(&msg, 0, sizeof(msg));
+	vec.iov_base = &cclc;
+	vec.iov_len = sizeof(cclc);
+	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
+	if (len < sizeof(cclc)) {
+		if (len >= 0) {
+			reason_code = -ENETUNREACH;
+			smc->sk.sk_err = -reason_code;
+		} else {
+			smc->sk.sk_err = smc->clcsock->sk->sk_err;
+			reason_code = -smc->sk.sk_err;
+		}
+	}
+	return reason_code;
+}
+
+/* send CLC ACCEPT message across internal TCP socket */
+int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
+{
+	struct smc_connection *conn = &new_smc->conn;
+	struct smc_clc_msg_accept_confirm aclc;
+	struct smc_link *link;
+	struct msghdr msg;
+	struct kvec vec;
+	int rc = 0;
+	int len;
+
+	link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+	memset(&aclc, 0, sizeof(aclc));
+	memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	aclc.hdr.type = SMC_CLC_ACCEPT;
+	aclc.hdr.length = htons(sizeof(aclc));
+	aclc.hdr.version = SMC_CLC_V1;		/* SMC version */
+	if (srv_first_contact)
+		aclc.hdr.flag = 1;
+	memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+	memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+	       SMC_GID_SIZE);
+	memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
+	hton24(aclc.qpn, link->roce_qp->qp_num);
+	aclc.rmb_rkey =
+		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+	aclc.conn_idx = 1;			/* as long as 1 RMB = 1 RMBE */
+	aclc.rmbe_alert_token = htonl(conn->alert_token_local);
+	aclc.qp_mtu = link->path_mtu;
+	aclc.rmbe_size = conn->rmbe_size_short,
+	aclc.rmb_dma_addr =
+		cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+	hton24(aclc.psn, link->psn_initial);
+	memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+	memset(&msg, 0, sizeof(msg));
+	vec.iov_base = &aclc;
+	vec.iov_len = sizeof(aclc);
+	len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
+	if (len < sizeof(aclc)) {
+		if (len >= 0)
+			new_smc->sk.sk_err = EPROTO;
+		else
+			new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err;
+		rc = sock_error(&new_smc->sk);
+	}
+
+	return rc;
+}
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
new file mode 100644
index 000000000000..13db8ce177c9
--- /dev/null
+++ b/net/smc/smc_clc.h
@@ -0,0 +1,116 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  CLC (connection layer control) handshake over initial TCP socket to
+ *  prepare for RDMA traffic
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CLC_H
+#define _SMC_CLC_H
+
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+
+#define SMC_CLC_PROPOSAL	0x01
+#define SMC_CLC_ACCEPT		0x02
+#define SMC_CLC_CONFIRM		0x03
+#define SMC_CLC_DECLINE		0x04
+
+/* eye catcher "SMCR" EBCDIC for CLC messages */
+static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
+
+#define SMC_CLC_V1		0x1		/* SMC version                */
+#define CLC_WAIT_TIME		(6 * HZ)	/* max. wait time on clcsock  */
+#define SMC_CLC_DECL_MEM	0x01010000  /* insufficient memory resources  */
+#define SMC_CLC_DECL_TIMEOUT	0x02000000  /* timeout                        */
+#define SMC_CLC_DECL_CNFERR	0x03000000  /* configuration error            */
+#define SMC_CLC_DECL_IPSEC	0x03030000  /* IPsec usage                    */
+#define SMC_CLC_DECL_SYNCERR	0x04000000  /* synchronization error          */
+#define SMC_CLC_DECL_REPLY	0x06000000  /* reply to a received decline    */
+#define SMC_CLC_DECL_INTERR	0x99990000  /* internal error                 */
+#define SMC_CLC_DECL_TCL	0x02040000  /* timeout w4 QP confirm          */
+#define SMC_CLC_DECL_SEND	0x07000000  /* sending problem                */
+
+struct smc_clc_msg_hdr {	/* header1 of clc messages */
+	u8 eyecatcher[4];	/* eye catcher */
+	u8 type;		/* proposal / accept / confirm / decline */
+	__be16 length;
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8 version : 4,
+	   flag    : 1,
+	   rsvd	   : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 rsvd    : 3,
+	   flag    : 1,
+	   version : 4;
+#endif
+} __packed;			/* format defined in RFC7609 */
+
+struct smc_clc_msg_trail {	/* trailer of clc messages */
+	u8 eyecatcher[4];
+};
+
+struct smc_clc_msg_local {	/* header2 of clc messages */
+	u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
+	u8 gid[16];		/* gid of ib_device port */
+	u8 mac[6];		/* mac of ib_device port */
+};
+
+struct smc_clc_msg_proposal {	/* clc proposal message */
+	struct smc_clc_msg_hdr hdr;
+	struct smc_clc_msg_local lcl;
+	__be16 iparea_offset;	/* offset to IP address information area */
+	__be32 outgoing_subnet;	/* subnet mask */
+	u8 prefix_len;		/* number of significant bits in mask */
+	u8 reserved[2];
+	u8 ipv6_prefixes_cnt;	/* number of IPv6 prefixes in prefix array */
+	struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __aligned(4);
+
+struct smc_clc_msg_accept_confirm {	/* clc accept / confirm message */
+	struct smc_clc_msg_hdr hdr;
+	struct smc_clc_msg_local lcl;
+	u8 qpn[3];		/* QP number */
+	__be32 rmb_rkey;	/* RMB rkey */
+	u8 conn_idx;		/* Connection index, which RMBE in RMB */
+	__be32 rmbe_alert_token;/* unique connection id */
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8 rmbe_size : 4,	/* RMBE buf size (compressed notation) */
+	   qp_mtu   : 4;	/* QP mtu */
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 qp_mtu   : 4,
+	   rmbe_size : 4;
+#endif
+	u8 reserved;
+	__be64 rmb_dma_addr;	/* RMB virtual address */
+	u8 reserved2;
+	u8 psn[3];		/* initial packet sequence number */
+	struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __packed;			/* format defined in RFC7609 */
+
+struct smc_clc_msg_decline {	/* clc decline message */
+	struct smc_clc_msg_hdr hdr;
+	u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
+	__be32 peer_diagnosis;	/* diagnosis information */
+	u8 reserved2[4];
+	struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __aligned(4);
+
+struct smc_sock;
+struct smc_ib_device;
+
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+		     u8 expected_type);
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
+			 u8 out_of_sync);
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
+			  u8 ibport);
+int smc_clc_send_confirm(struct smc_sock *smc);
+int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
+
+#endif
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
new file mode 100644
index 000000000000..03dfcc6b7661
--- /dev/null
+++ b/net/smc/smc_close.c
@@ -0,0 +1,442 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Socket Closing - normal and abnormal
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/workqueue.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_tx.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+
+#define SMC_CLOSE_WAIT_TX_PENDS_TIME		(5 * HZ)
+
+static void smc_close_cleanup_listen(struct sock *parent)
+{
+	struct sock *sk;
+
+	/* Close non-accepted connections */
+	while ((sk = smc_accept_dequeue(parent, NULL)))
+		smc_close_non_accepted(sk);
+}
+
+static void smc_close_wait_tx_pends(struct smc_sock *smc)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct sock *sk = &smc->sk;
+	signed long timeout;
+
+	timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME;
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (!signal_pending(current) && timeout) {
+		int rc;
+
+		rc = sk_wait_event(sk, &timeout,
+				   !smc_cdc_tx_has_pending(&smc->conn),
+				   &wait);
+		if (rc)
+			break;
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+}
+
+/* wait for sndbuf data being transmitted */
+static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct sock *sk = &smc->sk;
+
+	if (!timeout)
+		return;
+
+	if (!smc_tx_prepared_sends(&smc->conn))
+		return;
+
+	smc->wait_close_tx_prepared = 1;
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (!signal_pending(current) && timeout) {
+		int rc;
+
+		rc = sk_wait_event(sk, &timeout,
+				   !smc_tx_prepared_sends(&smc->conn) ||
+				   (sk->sk_err == ECONNABORTED) ||
+				   (sk->sk_err == ECONNRESET),
+				   &wait);
+		if (rc)
+			break;
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+	smc->wait_close_tx_prepared = 0;
+}
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc)
+{
+	if (smc->wait_close_tx_prepared)
+		/* wake up socket closing */
+		smc->sk.sk_state_change(&smc->sk);
+}
+
+static int smc_close_wr(struct smc_connection *conn)
+{
+	conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1;
+
+	return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_final(struct smc_connection *conn)
+{
+	if (atomic_read(&conn->bytes_to_rcv))
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+	else
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1;
+
+	return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_abort(struct smc_connection *conn)
+{
+	conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+
+	return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+/* terminate smc socket abnormally - active abort
+ * RDMA communication no longer possible
+ */
+void smc_close_active_abort(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&smc->conn.local_tx_ctrl.conn_state_flags;
+
+	bh_lock_sock(&smc->sk);
+	smc->sk.sk_err = ECONNABORTED;
+	if (smc->clcsock && smc->clcsock->sk) {
+		smc->clcsock->sk->sk_err = ECONNABORTED;
+		smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
+	}
+	switch (smc->sk.sk_state) {
+	case SMC_INIT:
+		smc->sk.sk_state = SMC_PEERABORTWAIT;
+		break;
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+		txflags->peer_conn_abort = 1;
+		sock_release(smc->clcsock);
+		if (!smc_cdc_rxed_any_close(&smc->conn))
+			smc->sk.sk_state = SMC_PEERABORTWAIT;
+		else
+			smc->sk.sk_state = SMC_CLOSED;
+		break;
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+		if (!txflags->peer_conn_closed) {
+			smc->sk.sk_state = SMC_PEERABORTWAIT;
+			txflags->peer_conn_abort = 1;
+			sock_release(smc->clcsock);
+		} else {
+			smc->sk.sk_state = SMC_CLOSED;
+		}
+		break;
+	case SMC_PROCESSABORT:
+	case SMC_APPFINCLOSEWAIT:
+		if (!txflags->peer_conn_closed) {
+			txflags->peer_conn_abort = 1;
+			sock_release(smc->clcsock);
+		}
+		smc->sk.sk_state = SMC_CLOSED;
+		break;
+	case SMC_PEERFINCLOSEWAIT:
+	case SMC_PEERABORTWAIT:
+	case SMC_CLOSED:
+		break;
+	}
+
+	sock_set_flag(&smc->sk, SOCK_DEAD);
+	bh_unlock_sock(&smc->sk);
+	smc->sk.sk_state_change(&smc->sk);
+}
+
+int smc_close_active(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&smc->conn.local_tx_ctrl.conn_state_flags;
+	long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	int old_state;
+	int rc = 0;
+
+	if (sock_flag(sk, SOCK_LINGER) &&
+	    !(current->flags & PF_EXITING))
+		timeout = sk->sk_lingertime;
+
+again:
+	old_state = sk->sk_state;
+	switch (old_state) {
+	case SMC_INIT:
+		sk->sk_state = SMC_CLOSED;
+		if (smc->smc_listen_work.func)
+			flush_work(&smc->smc_listen_work);
+		sock_put(sk);
+		break;
+	case SMC_LISTEN:
+		sk->sk_state = SMC_CLOSED;
+		sk->sk_state_change(sk); /* wake up accept */
+		if (smc->clcsock && smc->clcsock->sk) {
+			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
+			/* wake up kernel_accept of smc_tcp_listen_worker */
+			smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
+		}
+		release_sock(sk);
+		smc_close_cleanup_listen(sk);
+		flush_work(&smc->tcp_listen_work);
+		lock_sock(sk);
+		break;
+	case SMC_ACTIVE:
+		smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		if (sk->sk_state == SMC_ACTIVE) {
+			/* send close request */
+			rc = smc_close_final(conn);
+			sk->sk_state = SMC_PEERCLOSEWAIT1;
+		} else {
+			/* peer event has changed the state */
+			goto again;
+		}
+		break;
+	case SMC_APPFINCLOSEWAIT:
+		/* socket already shutdown wr or both (active close) */
+		if (txflags->peer_done_writing &&
+		    !txflags->peer_conn_closed) {
+			/* just shutdown wr done, send close request */
+			rc = smc_close_final(conn);
+		}
+		sk->sk_state = SMC_CLOSED;
+		smc_close_wait_tx_pends(smc);
+		break;
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+		if (!smc_cdc_rxed_any_close(conn))
+			smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		if (sk->sk_err != ECONNABORTED) {
+			/* confirm close from peer */
+			rc = smc_close_final(conn);
+			if (rc)
+				break;
+		}
+		if (smc_cdc_rxed_any_close(conn))
+			/* peer has closed the socket already */
+			sk->sk_state = SMC_CLOSED;
+		else
+			/* peer has just issued a shutdown write */
+			sk->sk_state = SMC_PEERFINCLOSEWAIT;
+		smc_close_wait_tx_pends(smc);
+		break;
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+	case SMC_PEERFINCLOSEWAIT:
+		/* peer sending PeerConnectionClosed will cause transition */
+		break;
+	case SMC_PROCESSABORT:
+		cancel_work_sync(&conn->tx_work);
+		smc_close_abort(conn);
+		sk->sk_state = SMC_CLOSED;
+		smc_close_wait_tx_pends(smc);
+		break;
+	case SMC_PEERABORTWAIT:
+	case SMC_CLOSED:
+		/* nothing to do, add tracing in future patch */
+		break;
+	}
+
+	if (old_state != sk->sk_state)
+		sk->sk_state_change(&smc->sk);
+	return rc;
+}
+
+static void smc_close_passive_abort_received(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&smc->conn.local_tx_ctrl.conn_state_flags;
+	struct sock *sk = &smc->sk;
+
+	switch (sk->sk_state) {
+	case SMC_ACTIVE:
+	case SMC_APPFINCLOSEWAIT:
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+		smc_close_abort(&smc->conn);
+		sk->sk_state = SMC_PROCESSABORT;
+		break;
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+		if (txflags->peer_done_writing &&
+		    !txflags->peer_conn_closed) {
+			/* just shutdown, but not yet closed locally */
+			smc_close_abort(&smc->conn);
+			sk->sk_state = SMC_PROCESSABORT;
+		} else {
+			sk->sk_state = SMC_CLOSED;
+		}
+		break;
+	case SMC_PEERFINCLOSEWAIT:
+	case SMC_PEERABORTWAIT:
+		sk->sk_state = SMC_CLOSED;
+		break;
+	case SMC_INIT:
+	case SMC_PROCESSABORT:
+	/* nothing to do, add tracing in future patch */
+		break;
+	}
+}
+
+/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort,
+ * or peer_done_writing.
+ * Called under tasklet context.
+ */
+void smc_close_passive_received(struct smc_sock *smc)
+{
+	struct smc_cdc_conn_state_flags *rxflags =
+		&smc->conn.local_rx_ctrl.conn_state_flags;
+	struct sock *sk = &smc->sk;
+	int old_state;
+
+	sk->sk_shutdown |= RCV_SHUTDOWN;
+	if (smc->clcsock && smc->clcsock->sk)
+		smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
+	sock_set_flag(&smc->sk, SOCK_DONE);
+
+	old_state = sk->sk_state;
+
+	if (rxflags->peer_conn_abort) {
+		smc_close_passive_abort_received(smc);
+		goto wakeup;
+	}
+
+	switch (sk->sk_state) {
+	case SMC_INIT:
+		if (atomic_read(&smc->conn.bytes_to_rcv) ||
+		    (rxflags->peer_done_writing &&
+		     !rxflags->peer_conn_closed))
+			sk->sk_state = SMC_APPCLOSEWAIT1;
+		else
+			sk->sk_state = SMC_CLOSED;
+		break;
+	case SMC_ACTIVE:
+		sk->sk_state = SMC_APPCLOSEWAIT1;
+		break;
+	case SMC_PEERCLOSEWAIT1:
+		if (rxflags->peer_done_writing)
+			sk->sk_state = SMC_PEERCLOSEWAIT2;
+		/* fall through to check for closing */
+	case SMC_PEERCLOSEWAIT2:
+	case SMC_PEERFINCLOSEWAIT:
+		if (!smc_cdc_rxed_any_close(&smc->conn))
+			break;
+		if (sock_flag(sk, SOCK_DEAD) &&
+		    (sk->sk_shutdown == SHUTDOWN_MASK)) {
+			/* smc_release has already been called locally */
+			sk->sk_state = SMC_CLOSED;
+		} else {
+			/* just shutdown, but not yet closed locally */
+			sk->sk_state = SMC_APPFINCLOSEWAIT;
+		}
+		break;
+	case SMC_APPCLOSEWAIT1:
+	case SMC_APPCLOSEWAIT2:
+	case SMC_APPFINCLOSEWAIT:
+	case SMC_PEERABORTWAIT:
+	case SMC_PROCESSABORT:
+	case SMC_CLOSED:
+		/* nothing to do, add tracing in future patch */
+		break;
+	}
+
+wakeup:
+	if (old_state != sk->sk_state)
+		sk->sk_state_change(sk);
+	sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */
+	sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */
+
+	if ((sk->sk_state == SMC_CLOSED) &&
+	    (sock_flag(sk, SOCK_DEAD) || (old_state == SMC_INIT))) {
+		smc_conn_free(&smc->conn);
+		schedule_delayed_work(&smc->sock_put_work,
+				      SMC_CLOSE_SOCK_PUT_DELAY);
+	}
+}
+
+void smc_close_sock_put_work(struct work_struct *work)
+{
+	struct smc_sock *smc = container_of(to_delayed_work(work),
+					    struct smc_sock,
+					    sock_put_work);
+
+	smc->sk.sk_prot->unhash(&smc->sk);
+	sock_put(&smc->sk);
+}
+
+int smc_close_shutdown_write(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+	struct sock *sk = &smc->sk;
+	int old_state;
+	int rc = 0;
+
+	if (sock_flag(sk, SOCK_LINGER))
+		timeout = sk->sk_lingertime;
+
+again:
+	old_state = sk->sk_state;
+	switch (old_state) {
+	case SMC_ACTIVE:
+		smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		/* send close wr request */
+		rc = smc_close_wr(conn);
+		if (sk->sk_state == SMC_ACTIVE)
+			sk->sk_state = SMC_PEERCLOSEWAIT1;
+		else
+			goto again;
+		break;
+	case SMC_APPCLOSEWAIT1:
+		/* passive close */
+		if (!smc_cdc_rxed_any_close(conn))
+			smc_close_stream_wait(smc, timeout);
+		release_sock(sk);
+		cancel_work_sync(&conn->tx_work);
+		lock_sock(sk);
+		/* confirm close from peer */
+		rc = smc_close_wr(conn);
+		sk->sk_state = SMC_APPCLOSEWAIT2;
+		break;
+	case SMC_APPCLOSEWAIT2:
+	case SMC_PEERFINCLOSEWAIT:
+	case SMC_PEERCLOSEWAIT1:
+	case SMC_PEERCLOSEWAIT2:
+	case SMC_APPFINCLOSEWAIT:
+	case SMC_PROCESSABORT:
+	case SMC_PEERABORTWAIT:
+		/* nothing to do, add tracing in future patch */
+		break;
+	}
+
+	if (old_state != sk->sk_state)
+		sk->sk_state_change(&smc->sk);
+	return rc;
+}
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
new file mode 100644
index 000000000000..bc9a2df3633c
--- /dev/null
+++ b/net/smc/smc_close.h
@@ -0,0 +1,28 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CLOSE_H
+#define SMC_CLOSE_H
+
+#include <linux/workqueue.h>
+
+#include "smc.h"
+
+#define SMC_MAX_STREAM_WAIT_TIMEOUT		(2 * HZ)
+#define SMC_CLOSE_SOCK_PUT_DELAY		HZ
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc);
+void smc_close_active_abort(struct smc_sock *smc);
+int smc_close_active(struct smc_sock *smc);
+void smc_close_passive_received(struct smc_sock *smc);
+void smc_close_sock_put_work(struct work_struct *work);
+int smc_close_shutdown_write(struct smc_sock *smc);
+
+#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
new file mode 100644
index 000000000000..0eac633fb354
--- /dev/null
+++ b/net/smc/smc_core.c
@@ -0,0 +1,682 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Basic Transport Functions exploiting Infiniband API
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/socket.h>
+#include <linux/if_vlan.h>
+#include <linux/random.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_wr.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+
+#define SMC_LGR_NUM_INCR	256
+#define SMC_LGR_FREE_DELAY	(600 * HZ)
+
+static u32 smc_lgr_num;			/* unique link group number */
+
+/* Register connection's alert token in our lookup structure.
+ * To use rbtrees we have to implement our own insert core.
+ * Requires @conns_lock
+ * @smc		connection to register
+ * Returns 0 on success, != otherwise.
+ */
+static void smc_lgr_add_alert_token(struct smc_connection *conn)
+{
+	struct rb_node **link, *parent = NULL;
+	u32 token = conn->alert_token_local;
+
+	link = &conn->lgr->conns_all.rb_node;
+	while (*link) {
+		struct smc_connection *cur = rb_entry(*link,
+					struct smc_connection, alert_node);
+
+		parent = *link;
+		if (cur->alert_token_local > token)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+	/* Put the new node there */
+	rb_link_node(&conn->alert_node, parent, link);
+	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
+}
+
+/* Register connection in link group by assigning an alert token
+ * registered in a search tree.
+ * Requires @conns_lock
+ * Note that '0' is a reserved value and not assigned.
+ */
+static void smc_lgr_register_conn(struct smc_connection *conn)
+{
+	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+	static atomic_t nexttoken = ATOMIC_INIT(0);
+
+	/* find a new alert_token_local value not yet used by some connection
+	 * in this link group
+	 */
+	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
+	while (!conn->alert_token_local) {
+		conn->alert_token_local = atomic_inc_return(&nexttoken);
+		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
+			conn->alert_token_local = 0;
+	}
+	smc_lgr_add_alert_token(conn);
+	conn->lgr->conns_num++;
+}
+
+/* Unregister connection and reset the alert token of the given connection<
+ */
+static void __smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+	struct smc_link_group *lgr = conn->lgr;
+
+	rb_erase(&conn->alert_node, &lgr->conns_all);
+	lgr->conns_num--;
+	conn->alert_token_local = 0;
+	conn->lgr = NULL;
+	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
+}
+
+/* Unregister connection and trigger lgr freeing if applicable
+ */
+static void smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+	struct smc_link_group *lgr = conn->lgr;
+	int reduced = 0;
+
+	write_lock_bh(&lgr->conns_lock);
+	if (conn->alert_token_local) {
+		reduced = 1;
+		__smc_lgr_unregister_conn(conn);
+	}
+	write_unlock_bh(&lgr->conns_lock);
+	if (reduced && !lgr->conns_num)
+		schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
+}
+
+static void smc_lgr_free_work(struct work_struct *work)
+{
+	struct smc_link_group *lgr = container_of(to_delayed_work(work),
+						  struct smc_link_group,
+						  free_work);
+	bool conns;
+
+	spin_lock_bh(&smc_lgr_list.lock);
+	read_lock_bh(&lgr->conns_lock);
+	conns = RB_EMPTY_ROOT(&lgr->conns_all);
+	read_unlock_bh(&lgr->conns_lock);
+	if (!conns) { /* number of lgr connections is no longer zero */
+		spin_unlock_bh(&smc_lgr_list.lock);
+		return;
+	}
+	list_del_init(&lgr->list); /* remove from smc_lgr_list */
+	spin_unlock_bh(&smc_lgr_list.lock);
+	smc_lgr_free(lgr);
+}
+
+/* create a new SMC link group */
+static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
+			  struct smc_ib_device *smcibdev, u8 ibport,
+			  char *peer_systemid, unsigned short vlan_id)
+{
+	struct smc_link_group *lgr;
+	struct smc_link *lnk;
+	u8 rndvec[3];
+	int rc = 0;
+	int i;
+
+	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
+	if (!lgr) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+	lgr->sync_err = false;
+	lgr->daddr = peer_in_addr;
+	memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
+	lgr->vlan_id = vlan_id;
+	rwlock_init(&lgr->sndbufs_lock);
+	rwlock_init(&lgr->rmbs_lock);
+	for (i = 0; i < SMC_RMBE_SIZES; i++) {
+		INIT_LIST_HEAD(&lgr->sndbufs[i]);
+		INIT_LIST_HEAD(&lgr->rmbs[i]);
+	}
+	smc_lgr_num += SMC_LGR_NUM_INCR;
+	memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE);
+	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
+	lgr->conns_all = RB_ROOT;
+
+	lnk = &lgr->lnk[SMC_SINGLE_LINK];
+	/* initialize link */
+	lnk->smcibdev = smcibdev;
+	lnk->ibport = ibport;
+	lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
+	if (!smcibdev->initialized)
+		smc_ib_setup_per_ibdev(smcibdev);
+	get_random_bytes(rndvec, sizeof(rndvec));
+	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
+	rc = smc_wr_alloc_link_mem(lnk);
+	if (rc)
+		goto free_lgr;
+	init_waitqueue_head(&lnk->wr_tx_wait);
+	rc = smc_ib_create_protection_domain(lnk);
+	if (rc)
+		goto free_link_mem;
+	rc = smc_ib_create_queue_pair(lnk);
+	if (rc)
+		goto dealloc_pd;
+	rc = smc_wr_create_link(lnk);
+	if (rc)
+		goto destroy_qp;
+	init_completion(&lnk->llc_confirm);
+	init_completion(&lnk->llc_confirm_resp);
+
+	smc->conn.lgr = lgr;
+	rwlock_init(&lgr->conns_lock);
+	spin_lock_bh(&smc_lgr_list.lock);
+	list_add(&lgr->list, &smc_lgr_list.list);
+	spin_unlock_bh(&smc_lgr_list.lock);
+	return 0;
+
+destroy_qp:
+	smc_ib_destroy_queue_pair(lnk);
+dealloc_pd:
+	smc_ib_dealloc_protection_domain(lnk);
+free_link_mem:
+	smc_wr_free_link_mem(lnk);
+free_lgr:
+	kfree(lgr);
+out:
+	return rc;
+}
+
+static void smc_sndbuf_unuse(struct smc_connection *conn)
+{
+	if (conn->sndbuf_desc) {
+		conn->sndbuf_desc->used = 0;
+		conn->sndbuf_size = 0;
+	}
+}
+
+static void smc_rmb_unuse(struct smc_connection *conn)
+{
+	if (conn->rmb_desc) {
+		conn->rmb_desc->used = 0;
+		conn->rmbe_size = 0;
+	}
+}
+
+/* remove a finished connection from its link group */
+void smc_conn_free(struct smc_connection *conn)
+{
+	struct smc_link_group *lgr = conn->lgr;
+
+	if (!lgr)
+		return;
+	smc_cdc_tx_dismiss_slots(conn);
+	smc_lgr_unregister_conn(conn);
+	smc_rmb_unuse(conn);
+	smc_sndbuf_unuse(conn);
+}
+
+static void smc_link_clear(struct smc_link *lnk)
+{
+	lnk->peer_qpn = 0;
+	smc_ib_modify_qp_reset(lnk);
+	smc_wr_free_link(lnk);
+	smc_ib_destroy_queue_pair(lnk);
+	smc_ib_dealloc_protection_domain(lnk);
+	smc_wr_free_link_mem(lnk);
+}
+
+static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
+{
+	struct smc_buf_desc *sndbuf_desc, *bf_desc;
+	int i;
+
+	for (i = 0; i < SMC_RMBE_SIZES; i++) {
+		list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i],
+					 list) {
+			list_del(&sndbuf_desc->list);
+			smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+					 smc_uncompress_bufsize(i),
+					 sndbuf_desc, DMA_TO_DEVICE);
+			kfree(sndbuf_desc->cpu_addr);
+			kfree(sndbuf_desc);
+		}
+	}
+}
+
+static void smc_lgr_free_rmbs(struct smc_link_group *lgr)
+{
+	struct smc_buf_desc *rmb_desc, *bf_desc;
+	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+	int i;
+
+	for (i = 0; i < SMC_RMBE_SIZES; i++) {
+		list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i],
+					 list) {
+			list_del(&rmb_desc->list);
+			smc_ib_buf_unmap(lnk->smcibdev,
+					 smc_uncompress_bufsize(i),
+					 rmb_desc, DMA_FROM_DEVICE);
+			kfree(rmb_desc->cpu_addr);
+			kfree(rmb_desc);
+		}
+	}
+}
+
+/* remove a link group */
+void smc_lgr_free(struct smc_link_group *lgr)
+{
+	smc_lgr_free_rmbs(lgr);
+	smc_lgr_free_sndbufs(lgr);
+	smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
+	kfree(lgr);
+}
+
+/* terminate linkgroup abnormally */
+void smc_lgr_terminate(struct smc_link_group *lgr)
+{
+	struct smc_connection *conn;
+	struct smc_sock *smc;
+	struct rb_node *node;
+
+	spin_lock_bh(&smc_lgr_list.lock);
+	if (list_empty(&lgr->list)) {
+		/* termination already triggered */
+		spin_unlock_bh(&smc_lgr_list.lock);
+		return;
+	}
+	/* do not use this link group for new connections */
+	list_del_init(&lgr->list);
+	spin_unlock_bh(&smc_lgr_list.lock);
+
+	write_lock_bh(&lgr->conns_lock);
+	node = rb_first(&lgr->conns_all);
+	while (node) {
+		conn = rb_entry(node, struct smc_connection, alert_node);
+		smc = container_of(conn, struct smc_sock, conn);
+		sock_hold(&smc->sk);
+		__smc_lgr_unregister_conn(conn);
+		smc_close_active_abort(smc);
+		sock_put(&smc->sk);
+		node = rb_first(&lgr->conns_all);
+	}
+	write_unlock_bh(&lgr->conns_lock);
+}
+
+/* Determine vlan of internal TCP socket.
+ * @vlan_id: address to store the determined vlan id into
+ */
+static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
+{
+	struct dst_entry *dst = sk_dst_get(clcsock->sk);
+	int rc = 0;
+
+	*vlan_id = 0;
+	if (!dst) {
+		rc = -ENOTCONN;
+		goto out;
+	}
+	if (!dst->dev) {
+		rc = -ENODEV;
+		goto out_rel;
+	}
+
+	if (is_vlan_dev(dst->dev))
+		*vlan_id = vlan_dev_vlan_id(dst->dev);
+
+out_rel:
+	dst_release(dst);
+out:
+	return rc;
+}
+
+/* determine the link gid matching the vlan id of the link group */
+static int smc_link_determine_gid(struct smc_link_group *lgr)
+{
+	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+	struct ib_gid_attr gattr;
+	union ib_gid gid;
+	int i;
+
+	if (!lgr->vlan_id) {
+		lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
+		return 0;
+	}
+
+	for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
+	     i++) {
+		if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
+				 &gattr))
+			continue;
+		if (gattr.ndev &&
+		    (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
+			lnk->gid = gid;
+			return 0;
+		}
+	}
+	return -ENODEV;
+}
+
+/* create a new SMC connection (and a new link group if necessary) */
+int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
+		    struct smc_ib_device *smcibdev, u8 ibport,
+		    struct smc_clc_msg_local *lcl, int srv_first_contact)
+{
+	struct smc_connection *conn = &smc->conn;
+	struct smc_link_group *lgr;
+	unsigned short vlan_id;
+	enum smc_lgr_role role;
+	int local_contact = SMC_FIRST_CONTACT;
+	int rc = 0;
+
+	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+	rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
+	if (rc)
+		return rc;
+
+	if ((role == SMC_CLNT) && srv_first_contact)
+		/* create new link group as well */
+		goto create;
+
+	/* determine if an existing link group can be reused */
+	spin_lock_bh(&smc_lgr_list.lock);
+	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+		write_lock_bh(&lgr->conns_lock);
+		if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
+			    SMC_SYSTEMID_LEN) &&
+		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
+			    SMC_GID_SIZE) &&
+		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
+			    sizeof(lcl->mac)) &&
+		    !lgr->sync_err &&
+		    (lgr->role == role) &&
+		    (lgr->vlan_id == vlan_id) &&
+		    ((role == SMC_CLNT) ||
+		     (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
+			/* link group found */
+			local_contact = SMC_REUSE_CONTACT;
+			conn->lgr = lgr;
+			smc_lgr_register_conn(conn); /* add smc conn to lgr */
+			write_unlock_bh(&lgr->conns_lock);
+			break;
+		}
+		write_unlock_bh(&lgr->conns_lock);
+	}
+	spin_unlock_bh(&smc_lgr_list.lock);
+
+	if (role == SMC_CLNT && !srv_first_contact &&
+	    (local_contact == SMC_FIRST_CONTACT)) {
+		/* Server reuses a link group, but Client wants to start
+		 * a new one
+		 * send out_of_sync decline, reason synchr. error
+		 */
+		return -ENOLINK;
+	}
+
+create:
+	if (local_contact == SMC_FIRST_CONTACT) {
+		rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport,
+				    lcl->id_for_peer, vlan_id);
+		if (rc)
+			goto out;
+		smc_lgr_register_conn(conn); /* add smc conn to lgr */
+		rc = smc_link_determine_gid(conn->lgr);
+	}
+	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
+	conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg);
+#ifndef KERNEL_HAS_ATOMIC64
+	spin_lock_init(&conn->acurs_lock);
+#endif
+
+out:
+	return rc ? rc : local_contact;
+}
+
+/* try to reuse a sndbuf description slot of the sndbufs list for a certain
+ * buf_size; if not available, return NULL
+ */
+static inline
+struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr,
+					 int compressed_bufsize)
+{
+	struct smc_buf_desc *sndbuf_slot;
+
+	read_lock_bh(&lgr->sndbufs_lock);
+	list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize],
+			    list) {
+		if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) {
+			read_unlock_bh(&lgr->sndbufs_lock);
+			return sndbuf_slot;
+		}
+	}
+	read_unlock_bh(&lgr->sndbufs_lock);
+	return NULL;
+}
+
+/* try to reuse an rmb description slot of the rmbs list for a certain
+ * rmbe_size; if not available, return NULL
+ */
+static inline
+struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
+				      int compressed_bufsize)
+{
+	struct smc_buf_desc *rmb_slot;
+
+	read_lock_bh(&lgr->rmbs_lock);
+	list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize],
+			    list) {
+		if (cmpxchg(&rmb_slot->used, 0, 1) == 0) {
+			read_unlock_bh(&lgr->rmbs_lock);
+			return rmb_slot;
+		}
+	}
+	read_unlock_bh(&lgr->rmbs_lock);
+	return NULL;
+}
+
+/* one of the conditions for announcing a receiver's current window size is
+ * that it "results in a minimum increase in the window size of 10% of the
+ * receive buffer space" [RFC7609]
+ */
+static inline int smc_rmb_wnd_update_limit(int rmbe_size)
+{
+	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
+}
+
+/* create the tx buffer for an SMC socket */
+int smc_sndbuf_create(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	struct smc_link_group *lgr = conn->lgr;
+	int tmp_bufsize, tmp_bufsize_short;
+	struct smc_buf_desc *sndbuf_desc;
+	int rc;
+
+	/* use socket send buffer size (w/o overhead) as start value */
+	for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
+	     tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+		tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+		/* check for reusable sndbuf_slot in the link group */
+		sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short);
+		if (sndbuf_desc) {
+			memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize);
+			break; /* found reusable slot */
+		}
+		/* try to alloc a new send buffer */
+		sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL);
+		if (!sndbuf_desc)
+			break; /* give up with -ENOMEM */
+		sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize,
+						GFP_KERNEL | __GFP_NOWARN |
+						__GFP_NOMEMALLOC |
+						__GFP_NORETRY);
+		if (!sndbuf_desc->cpu_addr) {
+			kfree(sndbuf_desc);
+			sndbuf_desc = NULL;
+			/* if send buffer allocation has failed,
+			 * try a smaller one
+			 */
+			continue;
+		}
+		rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+				    tmp_bufsize, sndbuf_desc,
+				    DMA_TO_DEVICE);
+		if (rc) {
+			kfree(sndbuf_desc->cpu_addr);
+			kfree(sndbuf_desc);
+			sndbuf_desc = NULL;
+			continue; /* if mapping failed, try smaller one */
+		}
+		sndbuf_desc->used = 1;
+		write_lock_bh(&lgr->sndbufs_lock);
+		list_add(&sndbuf_desc->list,
+			 &lgr->sndbufs[tmp_bufsize_short]);
+		write_unlock_bh(&lgr->sndbufs_lock);
+		break;
+	}
+	if (sndbuf_desc && sndbuf_desc->cpu_addr) {
+		conn->sndbuf_desc = sndbuf_desc;
+		conn->sndbuf_size = tmp_bufsize;
+		smc->sk.sk_sndbuf = tmp_bufsize * 2;
+		atomic_set(&conn->sndbuf_space, tmp_bufsize);
+		return 0;
+	} else {
+		return -ENOMEM;
+	}
+}
+
+/* create the RMB for an SMC socket (even though the SMC protocol
+ * allows more than one RMB-element per RMB, the Linux implementation
+ * uses just one RMB-element per RMB, i.e. uses an extra RMB for every
+ * connection in a link group
+ */
+int smc_rmb_create(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	struct smc_link_group *lgr = conn->lgr;
+	int tmp_bufsize, tmp_bufsize_short;
+	struct smc_buf_desc *rmb_desc;
+	int rc;
+
+	/* use socket recv buffer size (w/o overhead) as start value */
+	for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2);
+	     tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+		tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+		/* check for reusable rmb_slot in the link group */
+		rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
+		if (rmb_desc) {
+			memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
+			break; /* found reusable slot */
+		}
+		/* try to alloc a new RMB */
+		rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL);
+		if (!rmb_desc)
+			break; /* give up with -ENOMEM */
+		rmb_desc->cpu_addr = kzalloc(tmp_bufsize,
+					     GFP_KERNEL | __GFP_NOWARN |
+					     __GFP_NOMEMALLOC |
+					     __GFP_NORETRY);
+		if (!rmb_desc->cpu_addr) {
+			kfree(rmb_desc);
+			rmb_desc = NULL;
+			/* if RMB allocation has failed,
+			 * try a smaller one
+			 */
+			continue;
+		}
+		rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+				    tmp_bufsize, rmb_desc,
+				    DMA_FROM_DEVICE);
+		if (rc) {
+			kfree(rmb_desc->cpu_addr);
+			kfree(rmb_desc);
+			rmb_desc = NULL;
+			continue; /* if mapping failed, try smaller one */
+		}
+		rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd,
+					      IB_ACCESS_REMOTE_WRITE |
+					      IB_ACCESS_LOCAL_WRITE,
+					     &rmb_desc->mr_rx[SMC_SINGLE_LINK]);
+		if (rc) {
+			smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+					 tmp_bufsize, rmb_desc,
+					 DMA_FROM_DEVICE);
+			kfree(rmb_desc->cpu_addr);
+			kfree(rmb_desc);
+			rmb_desc = NULL;
+			continue;
+		}
+		rmb_desc->used = 1;
+		write_lock_bh(&lgr->rmbs_lock);
+		list_add(&rmb_desc->list,
+			 &lgr->rmbs[tmp_bufsize_short]);
+		write_unlock_bh(&lgr->rmbs_lock);
+		break;
+	}
+	if (rmb_desc && rmb_desc->cpu_addr) {
+		conn->rmb_desc = rmb_desc;
+		conn->rmbe_size = tmp_bufsize;
+		conn->rmbe_size_short = tmp_bufsize_short;
+		smc->sk.sk_rcvbuf = tmp_bufsize * 2;
+		atomic_set(&conn->bytes_to_rcv, 0);
+		conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize);
+		return 0;
+	} else {
+		return -ENOMEM;
+	}
+}
+
+static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
+{
+	int i;
+
+	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
+		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
+			return i;
+	}
+	return -ENOSPC;
+}
+
+/* save rkey and dma_addr received from peer during clc handshake */
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+			    struct smc_clc_msg_accept_confirm *clc)
+{
+	u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr);
+	struct smc_link_group *lgr = conn->lgr;
+	u32 rkey = ntohl(clc->rmb_rkey);
+	int i;
+
+	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
+		    test_bit(i, lgr->rtokens_used_mask)) {
+			conn->rtoken_idx = i;
+			return 0;
+		}
+	}
+	conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr);
+	if (conn->rtoken_idx < 0)
+		return conn->rtoken_idx;
+	lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey;
+	lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr;
+	return 0;
+}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
new file mode 100644
index 000000000000..27eb38056a27
--- /dev/null
+++ b/net/smc/smc_core.h
@@ -0,0 +1,181 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for SMC Connections, Link Groups and Links
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CORE_H
+#define _SMC_CORE_H
+
+#include <linux/atomic.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_ib.h"
+
+#define SMC_RMBS_PER_LGR_MAX	255	/* max. # of RMBs per link group */
+
+struct smc_lgr_list {			/* list of link group definition */
+	struct list_head	list;
+	spinlock_t		lock;	/* protects list of link groups */
+};
+
+extern struct smc_lgr_list	smc_lgr_list; /* list of link groups */
+
+enum smc_lgr_role {		/* possible roles of a link group */
+	SMC_CLNT,	/* client */
+	SMC_SERV	/* server */
+};
+
+#define SMC_WR_BUF_SIZE		48	/* size of work request buffer */
+
+struct smc_wr_buf {
+	u8	raw[SMC_WR_BUF_SIZE];
+};
+
+struct smc_link {
+	struct smc_ib_device	*smcibdev;	/* ib-device */
+	u8			ibport;		/* port - values 1 | 2 */
+	struct ib_pd		*roce_pd;	/* IB protection domain,
+						 * unique for every RoCE QP
+						 */
+	struct ib_qp		*roce_qp;	/* IB queue pair */
+	struct ib_qp_attr	qp_attr;	/* IB queue pair attributes */
+
+	struct smc_wr_buf	*wr_tx_bufs;	/* WR send payload buffers */
+	struct ib_send_wr	*wr_tx_ibs;	/* WR send meta data */
+	struct ib_sge		*wr_tx_sges;	/* WR send gather meta data */
+	struct smc_wr_tx_pend	*wr_tx_pends;	/* WR send waiting for CQE */
+	/* above four vectors have wr_tx_cnt elements and use the same index */
+	dma_addr_t		wr_tx_dma_addr;	/* DMA address of wr_tx_bufs */
+	atomic_long_t		wr_tx_id;	/* seq # of last sent WR */
+	unsigned long		*wr_tx_mask;	/* bit mask of used indexes */
+	u32			wr_tx_cnt;	/* number of WR send buffers */
+	wait_queue_head_t	wr_tx_wait;	/* wait for free WR send buf */
+
+	struct smc_wr_buf	*wr_rx_bufs;	/* WR recv payload buffers */
+	struct ib_recv_wr	*wr_rx_ibs;	/* WR recv meta data */
+	struct ib_sge		*wr_rx_sges;	/* WR recv scatter meta data */
+	/* above three vectors have wr_rx_cnt elements and use the same index */
+	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
+	u64			wr_rx_id;	/* seq # of last recv WR */
+	u32			wr_rx_cnt;	/* number of WR recv buffers */
+
+	union ib_gid		gid;		/* gid matching used vlan id */
+	u32			peer_qpn;	/* QP number of peer */
+	enum ib_mtu		path_mtu;	/* used mtu */
+	enum ib_mtu		peer_mtu;	/* mtu size of peer */
+	u32			psn_initial;	/* QP tx initial packet seqno */
+	u32			peer_psn;	/* QP rx initial packet seqno */
+	u8			peer_mac[ETH_ALEN];	/* = gid[8:10||13:15] */
+	u8			peer_gid[sizeof(union ib_gid)];	/* gid of peer*/
+	u8			link_id;	/* unique # within link group */
+	struct completion	llc_confirm;	/* wait for rx of conf link */
+	struct completion	llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
+};
+
+/* For now we just allow one parallel link per link group. The SMC protocol
+ * allows more (up to 8).
+ */
+#define SMC_LINKS_PER_LGR_MAX	1
+#define SMC_SINGLE_LINK		0
+
+#define SMC_FIRST_CONTACT	1		/* first contact to a peer */
+#define SMC_REUSE_CONTACT	0		/* follow-on contact to a peer*/
+
+/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
+struct smc_buf_desc {
+	struct list_head	list;
+	u64			dma_addr[SMC_LINKS_PER_LGR_MAX];
+						/* mapped address of buffer */
+	void			*cpu_addr;	/* virtual address of buffer */
+	struct ib_mr		*mr_rx[SMC_LINKS_PER_LGR_MAX];
+						/* for rmb only:
+						 * rkey provided to peer
+						 */
+	u32			used;		/* currently used / unused */
+};
+
+struct smc_rtoken {				/* address/key of remote RMB */
+	u64			dma_addr;
+	u32			rkey;
+};
+
+#define SMC_LGR_ID_SIZE		4
+
+struct smc_link_group {
+	struct list_head	list;
+	enum smc_lgr_role	role;		/* client or server */
+	__be32			daddr;		/* destination ip address */
+	struct smc_link		lnk[SMC_LINKS_PER_LGR_MAX];	/* smc link */
+	char			peer_systemid[SMC_SYSTEMID_LEN];
+						/* unique system_id of peer */
+	struct rb_root		conns_all;	/* connection tree */
+	rwlock_t		conns_lock;	/* protects conns_all */
+	unsigned int		conns_num;	/* current # of connections */
+	unsigned short		vlan_id;	/* vlan id of link group */
+
+	struct list_head	sndbufs[SMC_RMBE_SIZES];/* tx buffers */
+	rwlock_t		sndbufs_lock;	/* protects tx buffers */
+	struct list_head	rmbs[SMC_RMBE_SIZES];	/* rx buffers */
+	rwlock_t		rmbs_lock;	/* protects rx buffers */
+	struct smc_rtoken	rtokens[SMC_RMBS_PER_LGR_MAX]
+				       [SMC_LINKS_PER_LGR_MAX];
+						/* remote addr/key pairs */
+	unsigned long		rtokens_used_mask[BITS_TO_LONGS(
+							SMC_RMBS_PER_LGR_MAX)];
+						/* used rtoken elements */
+
+	u8			id[SMC_LGR_ID_SIZE];	/* unique lgr id */
+	struct delayed_work	free_work;	/* delayed freeing of an lgr */
+	bool			sync_err;	/* lgr no longer fits to peer */
+};
+
+/* Find the connection associated with the given alert token in the link group.
+ * To use rbtrees we have to implement our own search core.
+ * Requires @conns_lock
+ * @token	alert token to search for
+ * @lgr		 link group to search in
+ * Returns connection associated with token if found, NULL otherwise.
+ */
+static inline struct smc_connection *smc_lgr_find_conn(
+	u32 token, struct smc_link_group *lgr)
+{
+	struct smc_connection *res = NULL;
+	struct rb_node *node;
+
+	node = lgr->conns_all.rb_node;
+	while (node) {
+		struct smc_connection *cur = rb_entry(node,
+					struct smc_connection, alert_node);
+
+		if (cur->alert_token_local > token) {
+			node = node->rb_left;
+		} else {
+			if (cur->alert_token_local < token) {
+				node = node->rb_right;
+			} else {
+				res = cur;
+				break;
+			}
+		}
+	}
+
+	return res;
+}
+
+struct smc_sock;
+struct smc_clc_msg_accept_confirm;
+
+void smc_lgr_free(struct smc_link_group *lgr);
+void smc_lgr_terminate(struct smc_link_group *lgr);
+int smc_sndbuf_create(struct smc_sock *smc);
+int smc_rmb_create(struct smc_sock *smc);
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+			    struct smc_clc_msg_accept_confirm *clc);
+
+#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
new file mode 100644
index 000000000000..d2d01cf70224
--- /dev/null
+++ b/net/smc/smc_diag.c
@@ -0,0 +1,215 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Monitoring SMC transport protocol sockets
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sock_diag.h>
+#include <linux/inet_diag.h>
+#include <linux/smc_diag.h>
+#include <net/netlink.h>
+#include <net/smc.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
+{
+	sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+		be16_to_cpu(((__be16 *)gid_raw)[0]),
+		be16_to_cpu(((__be16 *)gid_raw)[1]),
+		be16_to_cpu(((__be16 *)gid_raw)[2]),
+		be16_to_cpu(((__be16 *)gid_raw)[3]),
+		be16_to_cpu(((__be16 *)gid_raw)[4]),
+		be16_to_cpu(((__be16 *)gid_raw)[5]),
+		be16_to_cpu(((__be16 *)gid_raw)[6]),
+		be16_to_cpu(((__be16 *)gid_raw)[7]));
+}
+
+static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
+{
+	struct smc_sock *smc = smc_sk(sk);
+
+	r->diag_family = sk->sk_family;
+	if (!smc->clcsock)
+		return;
+	r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
+	r->id.idiag_dport = smc->clcsock->sk->sk_dport;
+	r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
+	sock_diag_save_cookie(sk, r->id.idiag_cookie);
+	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+	r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
+	r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+}
+
+static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
+				   struct smc_diag_msg *r,
+				   struct user_namespace *user_ns)
+{
+	if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown))
+		return 1;
+
+	r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+	r->diag_inode = sock_i_ino(sk);
+	return 0;
+}
+
+static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
+			   struct netlink_callback *cb,
+			   const struct smc_diag_req *req,
+			   struct nlattr *bc)
+{
+	struct smc_sock *smc = smc_sk(sk);
+	struct user_namespace *user_ns;
+	struct smc_diag_msg *r;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	r = nlmsg_data(nlh);
+	smc_diag_msg_common_fill(r, sk);
+	r->diag_state = sk->sk_state;
+	r->diag_fallback = smc->use_fallback;
+	user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
+	if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
+		goto errout;
+
+	if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) {
+		struct smc_connection *conn = &smc->conn;
+		struct smc_diag_conninfo cinfo = {
+			.token = conn->alert_token_local,
+			.sndbuf_size = conn->sndbuf_size,
+			.rmbe_size = conn->rmbe_size,
+			.peer_rmbe_size = conn->peer_rmbe_size,
+
+			.rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
+			.rx_prod.count = conn->local_rx_ctrl.prod.count,
+			.rx_cons.wrap = conn->local_rx_ctrl.cons.wrap,
+			.rx_cons.count = conn->local_rx_ctrl.cons.count,
+
+			.tx_prod.wrap = conn->local_tx_ctrl.prod.wrap,
+			.tx_prod.count = conn->local_tx_ctrl.prod.count,
+			.tx_cons.wrap = conn->local_tx_ctrl.cons.wrap,
+			.tx_cons.count = conn->local_tx_ctrl.cons.count,
+
+			.tx_prod_flags =
+				*(u8 *)&conn->local_tx_ctrl.prod_flags,
+			.tx_conn_state_flags =
+				*(u8 *)&conn->local_tx_ctrl.conn_state_flags,
+			.rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags,
+			.rx_conn_state_flags =
+				*(u8 *)&conn->local_rx_ctrl.conn_state_flags,
+
+			.tx_prep.wrap = conn->tx_curs_prep.wrap,
+			.tx_prep.count = conn->tx_curs_prep.count,
+			.tx_sent.wrap = conn->tx_curs_sent.wrap,
+			.tx_sent.count = conn->tx_curs_sent.count,
+			.tx_fin.wrap = conn->tx_curs_fin.wrap,
+			.tx_fin.count = conn->tx_curs_fin.count,
+		};
+
+		if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0)
+			goto errout;
+	}
+
+	if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) {
+		struct smc_diag_lgrinfo linfo = {
+			.role = smc->conn.lgr->role,
+			.lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
+			.lnk[0].link_id = smc->conn.lgr->lnk[0].link_id,
+		};
+
+		memcpy(linfo.lnk[0].ibname,
+		       smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
+		       sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
+		smc_gid_be16_convert(linfo.lnk[0].gid,
+				     smc->conn.lgr->lnk[0].gid.raw);
+		smc_gid_be16_convert(linfo.lnk[0].peer_gid,
+				     smc->conn.lgr->lnk[0].peer_gid);
+
+		if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
+			goto errout;
+	}
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+errout:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *bc = NULL;
+	struct hlist_head *head;
+	struct sock *sk;
+	int rc = 0;
+
+	read_lock(&smc_proto.h.smc_hash->lock);
+	head = &smc_proto.h.smc_hash->ht;
+	if (hlist_empty(head))
+		goto out;
+
+	sk_for_each(sk, head) {
+		if (!net_eq(sock_net(sk), net))
+			continue;
+		rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
+		if (rc)
+			break;
+	}
+
+out:
+	read_unlock(&smc_proto.h.smc_hash->lock);
+	return rc;
+}
+
+static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+	struct net *net = sock_net(skb->sk);
+
+	if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
+	    h->nlmsg_flags & NLM_F_DUMP) {
+		{
+			struct netlink_dump_control c = {
+				.dump = smc_diag_dump,
+				.min_dump_alloc = SKB_WITH_OVERHEAD(32768),
+			};
+			return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+		}
+	}
+	return 0;
+}
+
+static const struct sock_diag_handler smc_diag_handler = {
+	.family = AF_SMC,
+	.dump = smc_diag_handler_dump,
+};
+
+static int __init smc_diag_init(void)
+{
+	return sock_diag_register(&smc_diag_handler);
+}
+
+static void __exit smc_diag_exit(void)
+{
+	sock_diag_unregister(&smc_diag_handler);
+}
+
+module_init(smc_diag_init);
+module_exit(smc_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
new file mode 100644
index 000000000000..e6743c008ac5
--- /dev/null
+++ b/net/smc/smc_ib.c
@@ -0,0 +1,466 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  IB infrastructure:
+ *  Establish SMC-R as an Infiniband Client to be notified about added and
+ *  removed IB devices of type RDMA.
+ *  Determine device and port characteristics for these IB devices.
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/random.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+#include "smc.h"
+
+#define SMC_QP_MIN_RNR_TIMER		5
+#define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
+#define SMC_QP_RETRY_CNT			7 /* 7: infinite */
+#define SMC_QP_RNR_RETRY			7 /* 7: infinite */
+
+struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
+	.lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
+	.list = LIST_HEAD_INIT(smc_ib_devices.list),
+};
+
+#define SMC_LOCAL_SYSTEMID_RESET	"%%%%%%%"
+
+u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET;	/* unique system
+								 * identifier
+								 */
+
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+			     struct ib_mr **mr)
+{
+	int rc;
+
+	if (*mr)
+		return 0; /* already done */
+
+	/* obtain unique key -
+	 * next invocation of get_dma_mr returns a different key!
+	 */
+	*mr = pd->device->get_dma_mr(pd, access_flags);
+	rc = PTR_ERR_OR_ZERO(*mr);
+	if (IS_ERR(*mr))
+		*mr = NULL;
+	return rc;
+}
+
+static int smc_ib_modify_qp_init(struct smc_link *lnk)
+{
+	struct ib_qp_attr qp_attr;
+
+	memset(&qp_attr, 0, sizeof(qp_attr));
+	qp_attr.qp_state = IB_QPS_INIT;
+	qp_attr.pkey_index = 0;
+	qp_attr.port_num = lnk->ibport;
+	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
+				| IB_ACCESS_REMOTE_WRITE;
+	return ib_modify_qp(lnk->roce_qp, &qp_attr,
+			    IB_QP_STATE | IB_QP_PKEY_INDEX |
+			    IB_QP_ACCESS_FLAGS | IB_QP_PORT);
+}
+
+static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
+{
+	enum ib_qp_attr_mask qp_attr_mask =
+		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
+		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
+	struct ib_qp_attr qp_attr;
+
+	memset(&qp_attr, 0, sizeof(qp_attr));
+	qp_attr.qp_state = IB_QPS_RTR;
+	qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
+	qp_attr.ah_attr.port_num = lnk->ibport;
+	qp_attr.ah_attr.ah_flags = IB_AH_GRH;
+	qp_attr.ah_attr.grh.hop_limit = 1;
+	memcpy(&qp_attr.ah_attr.grh.dgid, lnk->peer_gid,
+	       sizeof(lnk->peer_gid));
+	memcpy(&qp_attr.ah_attr.dmac, lnk->peer_mac,
+	       sizeof(lnk->peer_mac));
+	qp_attr.dest_qp_num = lnk->peer_qpn;
+	qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
+	qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
+					 * requests
+					 */
+	qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
+
+	return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
+}
+
+int smc_ib_modify_qp_rts(struct smc_link *lnk)
+{
+	struct ib_qp_attr qp_attr;
+
+	memset(&qp_attr, 0, sizeof(qp_attr));
+	qp_attr.qp_state = IB_QPS_RTS;
+	qp_attr.timeout = SMC_QP_TIMEOUT;	/* local ack timeout */
+	qp_attr.retry_cnt = SMC_QP_RETRY_CNT;	/* retry count */
+	qp_attr.rnr_retry = SMC_QP_RNR_RETRY;	/* RNR retries, 7=infinite */
+	qp_attr.sq_psn = lnk->psn_initial;	/* starting send packet seq # */
+	qp_attr.max_rd_atomic = 1;	/* # of outstanding RDMA reads and
+					 * atomic ops allowed
+					 */
+	return ib_modify_qp(lnk->roce_qp, &qp_attr,
+			    IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
+			    IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
+			    IB_QP_MAX_QP_RD_ATOMIC);
+}
+
+int smc_ib_modify_qp_reset(struct smc_link *lnk)
+{
+	struct ib_qp_attr qp_attr;
+
+	memset(&qp_attr, 0, sizeof(qp_attr));
+	qp_attr.qp_state = IB_QPS_RESET;
+	return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
+}
+
+int smc_ib_ready_link(struct smc_link *lnk)
+{
+	struct smc_link_group *lgr =
+		container_of(lnk, struct smc_link_group, lnk[0]);
+	int rc = 0;
+
+	rc = smc_ib_modify_qp_init(lnk);
+	if (rc)
+		goto out;
+
+	rc = smc_ib_modify_qp_rtr(lnk);
+	if (rc)
+		goto out;
+	smc_wr_remember_qp_attr(lnk);
+	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
+			      IB_CQ_SOLICITED_MASK);
+	if (rc)
+		goto out;
+	rc = smc_wr_rx_post_init(lnk);
+	if (rc)
+		goto out;
+	smc_wr_remember_qp_attr(lnk);
+
+	if (lgr->role == SMC_SERV) {
+		rc = smc_ib_modify_qp_rts(lnk);
+		if (rc)
+			goto out;
+		smc_wr_remember_qp_attr(lnk);
+	}
+out:
+	return rc;
+}
+
+/* process context wrapper for might_sleep smc_ib_remember_port_attr */
+static void smc_ib_port_event_work(struct work_struct *work)
+{
+	struct smc_ib_device *smcibdev = container_of(
+		work, struct smc_ib_device, port_event_work);
+	u8 port_idx;
+
+	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
+		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
+		clear_bit(port_idx, &smcibdev->port_event_mask);
+	}
+}
+
+/* can be called in IRQ context */
+static void smc_ib_global_event_handler(struct ib_event_handler *handler,
+					struct ib_event *ibevent)
+{
+	struct smc_ib_device *smcibdev;
+	u8 port_idx;
+
+	smcibdev = container_of(handler, struct smc_ib_device, event_handler);
+	if (!smc_pnet_find_ib(smcibdev->ibdev->name))
+		return;
+
+	switch (ibevent->event) {
+	case IB_EVENT_PORT_ERR:
+		port_idx = ibevent->element.port_num - 1;
+		set_bit(port_idx, &smcibdev->port_event_mask);
+		schedule_work(&smcibdev->port_event_work);
+		/* fall through */
+	case IB_EVENT_DEVICE_FATAL:
+		/* tbd in follow-on patch:
+		 * abnormal close of corresponding connections
+		 */
+		break;
+	case IB_EVENT_PORT_ACTIVE:
+		port_idx = ibevent->element.port_num - 1;
+		set_bit(port_idx, &smcibdev->port_event_mask);
+		schedule_work(&smcibdev->port_event_work);
+		break;
+	default:
+		break;
+	}
+}
+
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
+{
+	ib_dealloc_pd(lnk->roce_pd);
+	lnk->roce_pd = NULL;
+}
+
+int smc_ib_create_protection_domain(struct smc_link *lnk)
+{
+	int rc;
+
+	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
+	rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
+	if (IS_ERR(lnk->roce_pd))
+		lnk->roce_pd = NULL;
+	return rc;
+}
+
+static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
+{
+	switch (ibevent->event) {
+	case IB_EVENT_DEVICE_FATAL:
+	case IB_EVENT_GID_CHANGE:
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_QP_ACCESS_ERR:
+		/* tbd in follow-on patch:
+		 * abnormal close of corresponding connections
+		 */
+		break;
+	default:
+		break;
+	}
+}
+
+void smc_ib_destroy_queue_pair(struct smc_link *lnk)
+{
+	ib_destroy_qp(lnk->roce_qp);
+	lnk->roce_qp = NULL;
+}
+
+/* create a queue pair within the protection domain for a link */
+int smc_ib_create_queue_pair(struct smc_link *lnk)
+{
+	struct ib_qp_init_attr qp_attr = {
+		.event_handler = smc_ib_qp_event_handler,
+		.qp_context = lnk,
+		.send_cq = lnk->smcibdev->roce_cq_send,
+		.recv_cq = lnk->smcibdev->roce_cq_recv,
+		.srq = NULL,
+		.cap = {
+			.max_send_wr = SMC_WR_BUF_CNT,
+				/* include unsolicited rdma_writes as well,
+				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
+				 */
+			.max_recv_wr = SMC_WR_BUF_CNT * 3,
+			.max_send_sge = SMC_IB_MAX_SEND_SGE,
+			.max_recv_sge = 1,
+			.max_inline_data = SMC_WR_TX_SIZE,
+		},
+		.sq_sig_type = IB_SIGNAL_REQ_WR,
+		.qp_type = IB_QPT_RC,
+	};
+	int rc;
+
+	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
+	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
+	if (IS_ERR(lnk->roce_qp))
+		lnk->roce_qp = NULL;
+	else
+		smc_wr_remember_qp_attr(lnk);
+	return rc;
+}
+
+/* map a new TX or RX buffer to DMA */
+int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+		   struct smc_buf_desc *buf_slot,
+		   enum dma_data_direction data_direction)
+{
+	int rc = 0;
+
+	if (buf_slot->dma_addr[SMC_SINGLE_LINK])
+		return rc; /* already mapped */
+	buf_slot->dma_addr[SMC_SINGLE_LINK] =
+		ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr,
+				  buf_size, data_direction);
+	if (ib_dma_mapping_error(smcibdev->ibdev,
+				 buf_slot->dma_addr[SMC_SINGLE_LINK]))
+		rc = -EIO;
+	return rc;
+}
+
+void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size,
+		      struct smc_buf_desc *buf_slot,
+		      enum dma_data_direction data_direction)
+{
+	if (!buf_slot->dma_addr[SMC_SINGLE_LINK])
+		return; /* already unmapped */
+	ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size,
+			    data_direction);
+	buf_slot->dma_addr[SMC_SINGLE_LINK] = 0;
+}
+
+static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	struct net_device *ndev;
+	int rc;
+
+	rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
+			  &smcibdev->gid[ibport - 1], NULL);
+	/* the SMC protocol requires specification of the roce MAC address;
+	 * if net_device cannot be determined, it can be derived from gid 0
+	 */
+	ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
+	if (ndev) {
+		memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN);
+	} else if (!rc) {
+		memcpy(&smcibdev->mac[ibport - 1][0],
+		       &smcibdev->gid[ibport - 1].raw[8], 3);
+		memcpy(&smcibdev->mac[ibport - 1][3],
+		       &smcibdev->gid[ibport - 1].raw[13], 3);
+		smcibdev->mac[ibport - 1][0] &= ~0x02;
+	}
+	return rc;
+}
+
+/* Create an identifier unique for this instance of SMC-R.
+ * The MAC-address of the first active registered IB device
+ * plus a random 2-byte number is used to create this identifier.
+ * This name is delivered to the peer during connection initialization.
+ */
+static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
+						u8 ibport)
+{
+	memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
+	       sizeof(smcibdev->mac[ibport - 1]));
+	get_random_bytes(&local_systemid[0], 2);
+}
+
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
+}
+
+int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	int rc;
+
+	memset(&smcibdev->pattr[ibport - 1], 0,
+	       sizeof(smcibdev->pattr[ibport - 1]));
+	rc = ib_query_port(smcibdev->ibdev, ibport,
+			   &smcibdev->pattr[ibport - 1]);
+	if (rc)
+		goto out;
+	rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
+	if (rc)
+		goto out;
+	if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
+		     sizeof(local_systemid)) &&
+	    smc_ib_port_active(smcibdev, ibport))
+		/* create unique system identifier */
+		smc_ib_define_local_systemid(smcibdev, ibport);
+out:
+	return rc;
+}
+
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+	struct ib_cq_init_attr cqattr =	{
+		.cqe = SMC_WR_MAX_CQE, .comp_vector = 0 };
+	long rc;
+
+	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
+					      smc_wr_tx_cq_handler, NULL,
+					      smcibdev, &cqattr);
+	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
+	if (IS_ERR(smcibdev->roce_cq_send)) {
+		smcibdev->roce_cq_send = NULL;
+		return rc;
+	}
+	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
+					      smc_wr_rx_cq_handler, NULL,
+					      smcibdev, &cqattr);
+	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
+	if (IS_ERR(smcibdev->roce_cq_recv)) {
+		smcibdev->roce_cq_recv = NULL;
+		goto err;
+	}
+	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
+			      smc_ib_global_event_handler);
+	ib_register_event_handler(&smcibdev->event_handler);
+	smc_wr_add_dev(smcibdev);
+	smcibdev->initialized = 1;
+	return rc;
+
+err:
+	ib_destroy_cq(smcibdev->roce_cq_send);
+	return rc;
+}
+
+static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+	if (!smcibdev->initialized)
+		return;
+	smc_wr_remove_dev(smcibdev);
+	ib_unregister_event_handler(&smcibdev->event_handler);
+	ib_destroy_cq(smcibdev->roce_cq_recv);
+	ib_destroy_cq(smcibdev->roce_cq_send);
+}
+
+static struct ib_client smc_ib_client;
+
+/* callback function for ib_register_client() */
+static void smc_ib_add_dev(struct ib_device *ibdev)
+{
+	struct smc_ib_device *smcibdev;
+
+	if (ibdev->node_type != RDMA_NODE_IB_CA)
+		return;
+
+	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
+	if (!smcibdev)
+		return;
+
+	smcibdev->ibdev = ibdev;
+	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
+
+	spin_lock(&smc_ib_devices.lock);
+	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
+	spin_unlock(&smc_ib_devices.lock);
+	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
+}
+
+/* callback function for ib_register_client() */
+static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
+{
+	struct smc_ib_device *smcibdev;
+
+	smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
+	ib_set_client_data(ibdev, &smc_ib_client, NULL);
+	spin_lock(&smc_ib_devices.lock);
+	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
+	spin_unlock(&smc_ib_devices.lock);
+	smc_pnet_remove_by_ibdev(smcibdev);
+	smc_ib_cleanup_per_ibdev(smcibdev);
+	kfree(smcibdev);
+}
+
+static struct ib_client smc_ib_client = {
+	.name	= "smc_ib",
+	.add	= smc_ib_add_dev,
+	.remove = smc_ib_remove_dev,
+};
+
+int __init smc_ib_register_client(void)
+{
+	return ib_register_client(&smc_ib_client);
+}
+
+void smc_ib_unregister_client(void)
+{
+	ib_unregister_client(&smc_ib_client);
+}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
new file mode 100644
index 000000000000..a95f74bb5569
--- /dev/null
+++ b/net/smc/smc_ib.h
@@ -0,0 +1,71 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for IB environment
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <Ursula Braun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_IB_H
+#define _SMC_IB_H
+
+#include <linux/if_ether.h>
+#include <rdma/ib_verbs.h>
+
+#define SMC_MAX_PORTS			2	/* Max # of ports */
+#define SMC_GID_SIZE			sizeof(union ib_gid)
+
+#define SMC_IB_MAX_SEND_SGE		2
+
+struct smc_ib_devices {			/* list of smc ib devices definition */
+	struct list_head	list;
+	spinlock_t		lock;	/* protects list of smc ib devices */
+};
+
+extern struct smc_ib_devices	smc_ib_devices; /* list of smc ib devices */
+
+struct smc_ib_device {				/* ib-device infos for smc */
+	struct list_head	list;
+	struct ib_device	*ibdev;
+	struct ib_port_attr	pattr[SMC_MAX_PORTS];	/* ib dev. port attrs */
+	struct ib_event_handler	event_handler;	/* global ib_event handler */
+	struct ib_cq		*roce_cq_send;	/* send completion queue */
+	struct ib_cq		*roce_cq_recv;	/* recv completion queue */
+	struct tasklet_struct	send_tasklet;	/* called by send cq handler */
+	struct tasklet_struct	recv_tasklet;	/* called by recv cq handler */
+	char			mac[SMC_MAX_PORTS][ETH_ALEN];
+						/* mac address per port*/
+	union ib_gid		gid[SMC_MAX_PORTS]; /* gid per port */
+	u8			initialized : 1; /* ib dev CQ, evthdl done */
+	struct work_struct	port_event_work;
+	unsigned long		port_event_mask;
+};
+
+struct smc_buf_desc;
+struct smc_link;
+
+int smc_ib_register_client(void) __init;
+void smc_ib_unregister_client(void);
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
+int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
+int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+		   struct smc_buf_desc *buf_slot,
+		   enum dma_data_direction data_direction);
+void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int bufsize,
+		      struct smc_buf_desc *buf_slot,
+		      enum dma_data_direction data_direction);
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
+int smc_ib_create_protection_domain(struct smc_link *lnk);
+void smc_ib_destroy_queue_pair(struct smc_link *lnk);
+int smc_ib_create_queue_pair(struct smc_link *lnk);
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+			     struct ib_mr **mr);
+int smc_ib_ready_link(struct smc_link *lnk);
+int smc_ib_modify_qp_rts(struct smc_link *lnk);
+int smc_ib_modify_qp_reset(struct smc_link *lnk);
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
+
+
+#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
new file mode 100644
index 000000000000..c2f9165d13ef
--- /dev/null
+++ b/net/smc/smc_llc.c
@@ -0,0 +1,158 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Link Layer Control (LLC)
+ *
+ *  For now, we only support the necessary "confirm link" functionality
+ *  which happens for the first RoCE link after successful CLC handshake.
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ *              Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <net/tcp.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+
+/********************************** send *************************************/
+
+struct smc_llc_tx_pend {
+};
+
+/* handler for send/transmission completion of an LLC msg */
+static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend,
+			       struct smc_link *link,
+			       enum ib_wc_status wc_status)
+{
+	/* future work: handle wc_status error for recovery and failover */
+}
+
+/**
+ * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits
+ * @link: Pointer to SMC link used for sending LLC control message.
+ * @wr_buf: Out variable returning pointer to work request payload buffer.
+ * @pend: Out variable returning pointer to private pending WR tracking.
+ *	  It's the context the transmit complete handler will get.
+ *
+ * Reserves and pre-fills an entry for a pending work request send/tx.
+ * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx.
+ * Can sleep due to smc_get_ctrl_buf (if not in softirq context).
+ *
+ * Return: 0 on success, otherwise an error value.
+ */
+static int smc_llc_add_pending_send(struct smc_link *link,
+				    struct smc_wr_buf **wr_buf,
+				    struct smc_wr_tx_pend_priv **pend)
+{
+	int rc;
+
+	rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend);
+	if (rc < 0)
+		return rc;
+	BUILD_BUG_ON_MSG(
+		sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE,
+		"must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)");
+	BUILD_BUG_ON_MSG(
+		sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE,
+		"must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+	BUILD_BUG_ON_MSG(
+		sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+		"must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)");
+	return 0;
+}
+
+/* high-level API to send LLC confirm link */
+int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
+			      union ib_gid *gid,
+			      enum smc_llc_reqresp reqresp)
+{
+	struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+						  lnk[SMC_SINGLE_LINK]);
+	struct smc_llc_msg_confirm_link *confllc;
+	struct smc_wr_tx_pend_priv *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+	if (rc)
+		return rc;
+	confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
+	memset(confllc, 0, sizeof(*confllc));
+	confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
+	confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
+	if (reqresp == SMC_LLC_RESP)
+		confllc->hd.flags |= SMC_LLC_FLAG_RESP;
+	memcpy(confllc->sender_mac, mac, ETH_ALEN);
+	memcpy(confllc->sender_gid, gid, SMC_GID_SIZE);
+	hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
+	/* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */
+	memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE);
+	confllc->max_links = SMC_LINKS_PER_LGR_MAX;
+	/* send llc message */
+	rc = smc_wr_tx_send(link, pend);
+	return rc;
+}
+
+/********************************* receive ***********************************/
+
+static void smc_llc_rx_confirm_link(struct smc_link *link,
+				    struct smc_llc_msg_confirm_link *llc)
+{
+	struct smc_link_group *lgr;
+
+	lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
+		if (lgr->role == SMC_SERV)
+			complete(&link->llc_confirm_resp);
+	} else {
+		if (lgr->role == SMC_CLNT) {
+			link->link_id = llc->link_num;
+			complete(&link->llc_confirm);
+		}
+	}
+}
+
+static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
+{
+	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+	union smc_llc_msg *llc = buf;
+
+	if (wc->byte_len < sizeof(*llc))
+		return; /* short message */
+	if (llc->raw.hdr.length != sizeof(*llc))
+		return; /* invalid message */
+	if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK)
+		smc_llc_rx_confirm_link(link, &llc->confirm_link);
+}
+
+/***************************** init, exit, misc ******************************/
+
+static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_CONFIRM_LINK
+	},
+	{
+		.handler	= NULL,
+	}
+};
+
+int __init smc_llc_init(void)
+{
+	struct smc_wr_rx_handler *handler;
+	int rc = 0;
+
+	for (handler = smc_llc_rx_handlers; handler->handler; handler++) {
+		INIT_HLIST_NODE(&handler->list);
+		rc = smc_wr_rx_register_handler(handler);
+		if (rc)
+			break;
+	}
+	return rc;
+}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
new file mode 100644
index 000000000000..b472f853953a
--- /dev/null
+++ b/net/smc/smc_llc.h
@@ -0,0 +1,63 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for LLC (link layer control) message handling
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ *              Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_LLC_H
+#define SMC_LLC_H
+
+#include "smc_wr.h"
+
+#define SMC_LLC_FLAG_RESP		0x80
+
+#define SMC_LLC_WAIT_FIRST_TIME		(5 * HZ)
+
+enum smc_llc_reqresp {
+	SMC_LLC_REQ,
+	SMC_LLC_RESP
+};
+
+enum smc_llc_msg_type {
+	SMC_LLC_CONFIRM_LINK		= 0x01,
+};
+
+#define SMC_LLC_DATA_LEN		40
+
+struct smc_llc_hdr {
+	struct smc_wr_rx_hdr common;
+	u8 length;	/* 44 */
+	u8 reserved;
+	u8 flags;
+};
+
+struct smc_llc_msg_confirm_link {	/* type 0x01 */
+	struct smc_llc_hdr hd;
+	u8 sender_mac[ETH_ALEN];
+	u8 sender_gid[SMC_GID_SIZE];
+	u8 sender_qp_num[3];
+	u8 link_num;
+	u8 link_uid[SMC_LGR_ID_SIZE];
+	u8 max_links;
+	u8 reserved[9];
+};
+
+union smc_llc_msg {
+	struct smc_llc_msg_confirm_link confirm_link;
+	struct {
+		struct smc_llc_hdr hdr;
+		u8 data[SMC_LLC_DATA_LEN];
+	} raw;
+};
+
+/* transmit */
+int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid,
+			      enum smc_llc_reqresp reqresp);
+int smc_llc_init(void) __init;
+
+#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
new file mode 100644
index 000000000000..9d3e7fb8348d
--- /dev/null
+++ b/net/smc/smc_pnet.c
@@ -0,0 +1,534 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Generic netlink support functions to configure an SMC-R PNET table
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/if.h>
+#include <uapi/linux/smc.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+
+#define SMC_MAX_PNET_ID_LEN	16	/* Max. length of PNET id */
+
+static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
+	[SMC_PNETID_NAME] = {
+		.type = NLA_NUL_STRING,
+		.len = SMC_MAX_PNET_ID_LEN - 1
+	},
+	[SMC_PNETID_ETHNAME] = {
+		.type = NLA_NUL_STRING,
+		.len = IFNAMSIZ - 1
+	},
+	[SMC_PNETID_IBNAME] = {
+		.type = NLA_NUL_STRING,
+		.len = IB_DEVICE_NAME_MAX - 1
+	},
+	[SMC_PNETID_IBPORT] = { .type = NLA_U8 }
+};
+
+static struct genl_family smc_pnet_nl_family;
+
+/**
+ * struct smc_pnettable - SMC PNET table anchor
+ * @lock: Lock for list action
+ * @pnetlist: List of PNETIDs
+ */
+static struct smc_pnettable {
+	rwlock_t lock;
+	struct list_head pnetlist;
+} smc_pnettable = {
+	.pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist),
+	.lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock)
+};
+
+/**
+ * struct smc_pnetentry - pnet identifier name entry
+ * @list: List node.
+ * @pnet_name: Pnet identifier name
+ * @ndev: pointer to network device.
+ * @smcibdev: Pointer to IB device.
+ */
+struct smc_pnetentry {
+	struct list_head list;
+	char pnet_name[SMC_MAX_PNET_ID_LEN + 1];
+	struct net_device *ndev;
+	struct smc_ib_device *smcibdev;
+	u8 ib_port;
+};
+
+/* Check if two RDMA device entries are identical. Use device name and port
+ * number for comparison.
+ */
+static bool smc_pnet_same_ibname(struct smc_pnetentry *pnetelem, char *ibname,
+				 u8 ibport)
+{
+	return pnetelem->ib_port == ibport &&
+	       !strncmp(pnetelem->smcibdev->ibdev->name, ibname,
+			sizeof(pnetelem->smcibdev->ibdev->name));
+}
+
+/* Find a pnetid in the pnet table.
+ */
+static struct smc_pnetentry *smc_pnet_find_pnetid(char *pnet_name)
+{
+	struct smc_pnetentry *pnetelem, *found_pnetelem = NULL;
+
+	read_lock(&smc_pnettable.lock);
+	list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+		if (!strncmp(pnetelem->pnet_name, pnet_name,
+			     sizeof(pnetelem->pnet_name))) {
+			found_pnetelem = pnetelem;
+			break;
+		}
+	}
+	read_unlock(&smc_pnettable.lock);
+	return found_pnetelem;
+}
+
+/* Remove a pnetid from the pnet table.
+ */
+static int smc_pnet_remove_by_pnetid(char *pnet_name)
+{
+	struct smc_pnetentry *pnetelem, *tmp_pe;
+	int rc = -ENOENT;
+
+	write_lock(&smc_pnettable.lock);
+	list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+				 list) {
+		if (!strncmp(pnetelem->pnet_name, pnet_name,
+			     sizeof(pnetelem->pnet_name))) {
+			list_del(&pnetelem->list);
+			dev_put(pnetelem->ndev);
+			kfree(pnetelem);
+			rc = 0;
+			break;
+		}
+	}
+	write_unlock(&smc_pnettable.lock);
+	return rc;
+}
+
+/* Remove a pnet entry mentioning a given network device from the pnet table.
+ */
+static int smc_pnet_remove_by_ndev(struct net_device *ndev)
+{
+	struct smc_pnetentry *pnetelem, *tmp_pe;
+	int rc = -ENOENT;
+
+	write_lock(&smc_pnettable.lock);
+	list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+				 list) {
+		if (pnetelem->ndev == ndev) {
+			list_del(&pnetelem->list);
+			dev_put(pnetelem->ndev);
+			kfree(pnetelem);
+			rc = 0;
+			break;
+		}
+	}
+	write_unlock(&smc_pnettable.lock);
+	return rc;
+}
+
+/* Remove a pnet entry mentioning a given ib device from the pnet table.
+ */
+int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev)
+{
+	struct smc_pnetentry *pnetelem, *tmp_pe;
+	int rc = -ENOENT;
+
+	write_lock(&smc_pnettable.lock);
+	list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+				 list) {
+		if (pnetelem->smcibdev == ibdev) {
+			list_del(&pnetelem->list);
+			dev_put(pnetelem->ndev);
+			kfree(pnetelem);
+			rc = 0;
+			break;
+		}
+	}
+	write_unlock(&smc_pnettable.lock);
+	return rc;
+}
+
+/* Append a pnetid to the end of the pnet table if not already on this list.
+ */
+static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem)
+{
+	struct smc_pnetentry *pnetelem;
+	int rc = -EEXIST;
+
+	write_lock(&smc_pnettable.lock);
+	list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+		if (!strncmp(pnetelem->pnet_name, new_pnetelem->pnet_name,
+			     sizeof(new_pnetelem->pnet_name)) ||
+		    !strncmp(pnetelem->ndev->name, new_pnetelem->ndev->name,
+			     sizeof(new_pnetelem->ndev->name)) ||
+		    smc_pnet_same_ibname(pnetelem,
+					 new_pnetelem->smcibdev->ibdev->name,
+					 new_pnetelem->ib_port))
+			goto found;
+	}
+	list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist);
+	rc = 0;
+found:
+	write_unlock(&smc_pnettable.lock);
+	return rc;
+}
+
+/* The limit for pnetid is 16 characters.
+ * Valid characters should be (single-byte character set) a-z, A-Z, 0-9.
+ * Lower case letters are converted to upper case.
+ * Interior blanks should not be used.
+ */
+static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
+{
+	char *bf = skip_spaces(pnet_name);
+	size_t len = strlen(bf);
+	char *end = bf + len;
+
+	if (!len)
+		return false;
+	while (--end >= bf && isspace(*end))
+		;
+	if (end - bf >= SMC_MAX_PNET_ID_LEN)
+		return false;
+	while (bf <= end) {
+		if (!isalnum(*bf))
+			return false;
+		*pnetid++ = islower(*bf) ? toupper(*bf) : *bf;
+		bf++;
+	}
+	*pnetid = '\0';
+	return true;
+}
+
+/* Find an infiniband device by a given name. The device might not exist. */
+struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
+{
+	struct smc_ib_device *ibdev;
+
+	spin_lock(&smc_ib_devices.lock);
+	list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+		if (!strncmp(ibdev->ibdev->name, ib_name,
+			     sizeof(ibdev->ibdev->name))) {
+			goto out;
+		}
+	}
+	ibdev = NULL;
+out:
+	spin_unlock(&smc_ib_devices.lock);
+	return ibdev;
+}
+
+/* Parse the supplied netlink attributes and fill a pnetentry structure.
+ * For ethernet and infiniband device names verify that the devices exist.
+ */
+static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem,
+			       struct nlattr *tb[])
+{
+	char *string, *ibname = NULL;
+	int rc = 0;
+
+	memset(pnetelem, 0, sizeof(*pnetelem));
+	INIT_LIST_HEAD(&pnetelem->list);
+	if (tb[SMC_PNETID_NAME]) {
+		string = (char *)nla_data(tb[SMC_PNETID_NAME]);
+		if (!smc_pnetid_valid(string, pnetelem->pnet_name)) {
+			rc = -EINVAL;
+			goto error;
+		}
+	}
+	if (tb[SMC_PNETID_ETHNAME]) {
+		string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
+		pnetelem->ndev = dev_get_by_name(net, string);
+		if (!pnetelem->ndev)
+			return -ENOENT;
+	}
+	if (tb[SMC_PNETID_IBNAME]) {
+		ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
+		ibname = strim(ibname);
+		pnetelem->smcibdev = smc_pnet_find_ib(ibname);
+		if (!pnetelem->smcibdev) {
+			rc = -ENOENT;
+			goto error;
+		}
+	}
+	if (tb[SMC_PNETID_IBPORT]) {
+		pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
+		if (pnetelem->ib_port > SMC_MAX_PORTS) {
+			rc = -EINVAL;
+			goto error;
+		}
+	}
+	return 0;
+
+error:
+	if (pnetelem->ndev)
+		dev_put(pnetelem->ndev);
+	return rc;
+}
+
+/* Convert an smc_pnetentry to a netlink attribute sequence */
+static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem)
+{
+	if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name) ||
+	    nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->ndev->name) ||
+	    nla_put_string(msg, SMC_PNETID_IBNAME,
+			   pnetelem->smcibdev->ibdev->name) ||
+	    nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port))
+		return -1;
+	return 0;
+}
+
+/* Retrieve one PNETID entry */
+static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
+{
+	struct smc_pnetentry *pnetelem;
+	struct sk_buff *msg;
+	void *hdr;
+	int rc;
+
+	pnetelem = smc_pnet_find_pnetid(
+				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
+	if (!pnetelem)
+		return -ENOENT;
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+			  &smc_pnet_nl_family, 0, SMC_PNETID_GET);
+	if (!hdr) {
+		rc = -EMSGSIZE;
+		goto err_out;
+	}
+
+	if (smc_pnet_set_nla(msg, pnetelem)) {
+		rc = -ENOBUFS;
+		goto err_out;
+	}
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+err_out:
+	nlmsg_free(msg);
+	return rc;
+}
+
+static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct smc_pnetentry *pnetelem;
+	int rc;
+
+	pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL);
+	if (!pnetelem)
+		return -ENOMEM;
+	rc = smc_pnet_fill_entry(net, pnetelem, info->attrs);
+	if (!rc)
+		rc = smc_pnet_enter(pnetelem);
+	if (rc) {
+		kfree(pnetelem);
+		return rc;
+	}
+	rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port);
+	if (rc)
+		smc_pnet_remove_by_pnetid(pnetelem->pnet_name);
+	return rc;
+}
+
+static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
+{
+	return smc_pnet_remove_by_pnetid(
+				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
+}
+
+static int smc_pnet_dump_start(struct netlink_callback *cb)
+{
+	cb->args[0] = 0;
+	return 0;
+}
+
+static int smc_pnet_dumpinfo(struct sk_buff *skb,
+			     u32 portid, u32 seq, u32 flags,
+			     struct smc_pnetentry *pnetelem)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family,
+			  flags, SMC_PNETID_GET);
+	if (!hdr)
+		return -ENOMEM;
+	if (smc_pnet_set_nla(skb, pnetelem) < 0) {
+		genlmsg_cancel(skb, hdr);
+		return -EMSGSIZE;
+	}
+	genlmsg_end(skb, hdr);
+	return 0;
+}
+
+static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct smc_pnetentry *pnetelem;
+	int idx = 0;
+
+	read_lock(&smc_pnettable.lock);
+	list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+		if (idx++ < cb->args[0])
+			continue;
+		if (smc_pnet_dumpinfo(skb, NETLINK_CB(cb->skb).portid,
+				      cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				      pnetelem)) {
+			--idx;
+			break;
+		}
+	}
+	cb->args[0] = idx;
+	read_unlock(&smc_pnettable.lock);
+	return skb->len;
+}
+
+/* Remove and delete all pnetids from pnet table.
+ */
+static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info)
+{
+	struct smc_pnetentry *pnetelem, *tmp_pe;
+
+	write_lock(&smc_pnettable.lock);
+	list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+				 list) {
+		list_del(&pnetelem->list);
+		dev_put(pnetelem->ndev);
+		kfree(pnetelem);
+	}
+	write_unlock(&smc_pnettable.lock);
+	return 0;
+}
+
+/* SMC_PNETID generic netlink operation definition */
+static const struct genl_ops smc_pnet_ops[] = {
+	{
+		.cmd = SMC_PNETID_GET,
+		.flags = GENL_ADMIN_PERM,
+		.policy = smc_pnet_policy,
+		.doit = smc_pnet_get,
+		.dumpit = smc_pnet_dump,
+		.start = smc_pnet_dump_start
+	},
+	{
+		.cmd = SMC_PNETID_ADD,
+		.flags = GENL_ADMIN_PERM,
+		.policy = smc_pnet_policy,
+		.doit = smc_pnet_add
+	},
+	{
+		.cmd = SMC_PNETID_DEL,
+		.flags = GENL_ADMIN_PERM,
+		.policy = smc_pnet_policy,
+		.doit = smc_pnet_del
+	},
+	{
+		.cmd = SMC_PNETID_FLUSH,
+		.flags = GENL_ADMIN_PERM,
+		.policy = smc_pnet_policy,
+		.doit = smc_pnet_flush
+	}
+};
+
+/* SMC_PNETID family definition */
+static struct genl_family smc_pnet_nl_family = {
+	.hdrsize = 0,
+	.name = SMCR_GENL_FAMILY_NAME,
+	.version = SMCR_GENL_FAMILY_VERSION,
+	.maxattr = SMC_PNETID_MAX,
+	.netnsok = true,
+	.module = THIS_MODULE,
+	.ops = smc_pnet_ops,
+	.n_ops =  ARRAY_SIZE(smc_pnet_ops)
+};
+
+static int smc_pnet_netdev_event(struct notifier_block *this,
+				 unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case NETDEV_REBOOT:
+	case NETDEV_UNREGISTER:
+		smc_pnet_remove_by_ndev(event_dev);
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block smc_netdev_notifier = {
+	.notifier_call = smc_pnet_netdev_event
+};
+
+int __init smc_pnet_init(void)
+{
+	int rc;
+
+	rc = genl_register_family(&smc_pnet_nl_family);
+	if (rc)
+		return rc;
+	rc = register_netdevice_notifier(&smc_netdev_notifier);
+	if (rc)
+		genl_unregister_family(&smc_pnet_nl_family);
+	return rc;
+}
+
+void smc_pnet_exit(void)
+{
+	smc_pnet_flush(NULL, NULL);
+	unregister_netdevice_notifier(&smc_netdev_notifier);
+	genl_unregister_family(&smc_pnet_nl_family);
+}
+
+/* PNET table analysis for a given sock:
+ * determine ib_device and port belonging to used internal TCP socket
+ * ethernet interface.
+ */
+void smc_pnet_find_roce_resource(struct sock *sk,
+				 struct smc_ib_device **smcibdev, u8 *ibport)
+{
+	struct dst_entry *dst = sk_dst_get(sk);
+	struct smc_pnetentry *pnetelem;
+
+	*smcibdev = NULL;
+	*ibport = 0;
+
+	if (!dst)
+		return;
+	if (!dst->dev)
+		goto out_rel;
+	read_lock(&smc_pnettable.lock);
+	list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+		if (dst->dev == pnetelem->ndev) {
+			*smcibdev = pnetelem->smcibdev;
+			*ibport = pnetelem->ib_port;
+			break;
+		}
+	}
+	read_unlock(&smc_pnettable.lock);
+out_rel:
+	dst_release(dst);
+}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
new file mode 100644
index 000000000000..32ab3df928ca
--- /dev/null
+++ b/net/smc/smc_pnet.h
@@ -0,0 +1,23 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  PNET table queries
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_PNET_H
+#define _SMC_PNET_H
+
+struct smc_ib_device;
+
+int smc_pnet_init(void) __init;
+void smc_pnet_exit(void);
+int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev);
+struct smc_ib_device *smc_pnet_find_ib(char *ib_name);
+void smc_pnet_find_roce_resource(struct sock *sk,
+				 struct smc_ib_device **smcibdev, u8 *ibport);
+
+#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
new file mode 100644
index 000000000000..5d1878732f46
--- /dev/null
+++ b/net/smc/smc_rx.c
@@ -0,0 +1,217 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ * copy new RMBE data into user space
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_cdc.h"
+#include "smc_tx.h" /* smc_tx_consumer_update() */
+#include "smc_rx.h"
+
+/* callback implementation for sk.sk_data_ready()
+ * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
+ * indirectly called by smc_cdc_msg_recv_action().
+ */
+static void smc_rx_data_ready(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	/* derived from sock_def_readable() */
+	/* called already in smc_listen_work() */
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (skwq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
+						POLLRDNORM | POLLRDBAND);
+	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+	    (sk->sk_state == SMC_CLOSED))
+		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+	else
+		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	rcu_read_unlock();
+}
+
+/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
+ *   @smc    smc socket
+ *   @timeo  pointer to max seconds to wait, pointer to value 0 for no timeout
+ * Returns:
+ * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
+ * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
+ */
+static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	int rc;
+
+	if (atomic_read(&conn->bytes_to_rcv))
+		return 1;
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	add_wait_queue(sk_sleep(sk), &wait);
+	rc = sk_wait_event(sk, timeo,
+			   sk->sk_err ||
+			   sk->sk_shutdown & RCV_SHUTDOWN ||
+			   sock_flag(sk, SOCK_DONE) ||
+			   atomic_read(&conn->bytes_to_rcv) ||
+			   smc_cdc_rxed_any_close_or_senddone(conn),
+			   &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	return rc;
+}
+
+/* rcvbuf consumer: main API called by socket layer.
+ * called under sk lock.
+ */
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
+		   int flags)
+{
+	size_t copylen, read_done = 0, read_remaining = len;
+	size_t chunk_len, chunk_off, chunk_len_sum;
+	struct smc_connection *conn = &smc->conn;
+	union smc_host_cursor cons;
+	int readable, chunk;
+	char *rcvbuf_base;
+	struct sock *sk;
+	long timeo;
+	int target;		/* Read at least these many bytes */
+	int rc;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return -EINVAL; /* future work for sk.sk_family == AF_SMC */
+	if (flags & MSG_OOB)
+		return -EINVAL; /* future work */
+
+	sk = &smc->sk;
+	if (sk->sk_state == SMC_LISTEN)
+		return -ENOTCONN;
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+	msg->msg_namelen = 0;
+	/* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
+	rcvbuf_base = conn->rmb_desc->cpu_addr;
+
+	do { /* while (read_remaining) */
+		if (read_done >= target)
+			break;
+
+		if (atomic_read(&conn->bytes_to_rcv))
+			goto copy;
+
+		if (read_done) {
+			if (sk->sk_err ||
+			    sk->sk_state == SMC_CLOSED ||
+			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+			    !timeo ||
+			    signal_pending(current) ||
+			    smc_cdc_rxed_any_close_or_senddone(conn) ||
+			    conn->local_tx_ctrl.conn_state_flags.
+			    peer_conn_abort)
+				break;
+		} else {
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				read_done = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN ||
+			    smc_cdc_rxed_any_close_or_senddone(conn) ||
+			    conn->local_tx_ctrl.conn_state_flags.
+			    peer_conn_abort)
+				break;
+			if (sk->sk_state == SMC_CLOSED) {
+				if (!sock_flag(sk, SOCK_DONE)) {
+					/* This occurs when user tries to read
+					 * from never connected socket.
+					 */
+					read_done = -ENOTCONN;
+					break;
+				}
+				break;
+			}
+			if (signal_pending(current)) {
+				read_done = sock_intr_errno(timeo);
+				break;
+			}
+		}
+
+		if (!atomic_read(&conn->bytes_to_rcv)) {
+			smc_rx_wait_data(smc, &timeo);
+			continue;
+		}
+
+copy:
+		/* initialize variables for 1st iteration of subsequent loop */
+		/* could be just 1 byte, even after smc_rx_wait_data above */
+		readable = atomic_read(&conn->bytes_to_rcv);
+		/* not more than what user space asked for */
+		copylen = min_t(size_t, read_remaining, readable);
+		smc_curs_write(&cons,
+			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+			       conn);
+		/* determine chunks where to read from rcvbuf */
+		/* either unwrapped case, or 1st chunk of wrapped case */
+		chunk_len = min_t(size_t,
+				  copylen, conn->rmbe_size - cons.count);
+		chunk_len_sum = chunk_len;
+		chunk_off = cons.count;
+		for (chunk = 0; chunk < 2; chunk++) {
+			if (!(flags & MSG_TRUNC)) {
+				rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
+						   chunk_len);
+				if (rc) {
+					if (!read_done)
+						read_done = -EFAULT;
+					goto out;
+				}
+			}
+			read_remaining -= chunk_len;
+			read_done += chunk_len;
+
+			if (chunk_len_sum == copylen)
+				break; /* either on 1st or 2nd iteration */
+			/* prepare next (== 2nd) iteration */
+			chunk_len = copylen - chunk_len; /* remainder */
+			chunk_len_sum += chunk_len;
+			chunk_off = 0; /* modulo offset in recv ring buffer */
+		}
+
+		/* update cursors */
+		if (!(flags & MSG_PEEK)) {
+			smc_curs_add(conn->rmbe_size, &cons, copylen);
+			/* increased in recv tasklet smc_cdc_msg_rcv() */
+			smp_mb__before_atomic();
+			atomic_sub(copylen, &conn->bytes_to_rcv);
+			/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+			smp_mb__after_atomic();
+			smc_curs_write(&conn->local_tx_ctrl.cons,
+				       smc_curs_read(&cons, conn),
+				       conn);
+			/* send consumer cursor update if required */
+			/* similar to advertising new TCP rcv_wnd if required */
+			smc_tx_consumer_update(conn);
+		}
+	} while (read_remaining);
+out:
+	return read_done;
+}
+
+/* Initialize receive properties on connection establishment. NB: not __init! */
+void smc_rx_init(struct smc_sock *smc)
+{
+	smc->sk.sk_data_ready = smc_rx_data_ready;
+}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
new file mode 100644
index 000000000000..b5b80e1f8b0f
--- /dev/null
+++ b/net/smc/smc_rx.h
@@ -0,0 +1,23 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_RX_H
+#define SMC_RX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+
+void smc_rx_init(struct smc_sock *smc);
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
+		   int flags);
+
+#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
new file mode 100644
index 000000000000..6e73b28915ea
--- /dev/null
+++ b/net/smc/smc_tx.c
@@ -0,0 +1,483 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer.
+ * Producer:
+ * Copy user space data into send buffer, if send buffer space available.
+ * Consumer:
+ * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_tx.h"
+
+/***************************** sndbuf producer *******************************/
+
+/* callback implementation for sk.sk_write_space()
+ * to wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * called under sk_socket lock.
+ */
+static void smc_tx_write_space(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+	struct smc_sock *smc = smc_sk(sk);
+	struct socket_wq *wq;
+
+	/* similar to sk_stream_write_space */
+	if (atomic_read(&smc->conn.sndbuf_space) && sock) {
+		clear_bit(SOCK_NOSPACE, &sock->flags);
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		if (skwq_has_sleeper(wq))
+			wake_up_interruptible_poll(&wq->wait,
+						   POLLOUT | POLLWRNORM |
+						   POLLWRBAND);
+		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+			sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
+		rcu_read_unlock();
+	}
+}
+
+/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
+ */
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
+{
+	if (smc->sk.sk_socket &&
+	    test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
+		smc->sk.sk_write_space(&smc->sk);
+}
+
+/* blocks sndbuf producer until at least one byte of free space available */
+static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	bool noblock;
+	long timeo;
+	int rc = 0;
+
+	/* similar to sk_stream_wait_memory */
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+	noblock = timeo ? false : true;
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (1) {
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+		if (sk->sk_err ||
+		    (sk->sk_shutdown & SEND_SHUTDOWN) ||
+		    conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
+			rc = -EPIPE;
+			break;
+		}
+		if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+			rc = -ECONNRESET;
+			break;
+		}
+		if (!timeo) {
+			if (noblock)
+				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			rc = -EAGAIN;
+			break;
+		}
+		if (signal_pending(current)) {
+			rc = sock_intr_errno(timeo);
+			break;
+		}
+		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+		if (atomic_read(&conn->sndbuf_space))
+			break; /* at least 1 byte of free space available */
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_pending++;
+		sk_wait_event(sk, &timeo,
+			      sk->sk_err ||
+			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
+			      smc_cdc_rxed_any_close_or_senddone(conn) ||
+			      atomic_read(&conn->sndbuf_space),
+			      &wait);
+		sk->sk_write_pending--;
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return rc;
+}
+
+/* sndbuf producer: main API called by socket layer.
+ * called under sock lock.
+ */
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
+{
+	size_t copylen, send_done = 0, send_remaining = len;
+	size_t chunk_len, chunk_off, chunk_len_sum;
+	struct smc_connection *conn = &smc->conn;
+	union smc_host_cursor prep;
+	struct sock *sk = &smc->sk;
+	char *sndbuf_base;
+	int tx_cnt_prep;
+	int writespace;
+	int rc, chunk;
+
+	/* This should be in poll */
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
+		rc = -EPIPE;
+		goto out_err;
+	}
+
+	while (msg_data_left(msg)) {
+		if (sk->sk_state == SMC_INIT)
+			return -ENOTCONN;
+		if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
+		    (smc->sk.sk_err == ECONNABORTED) ||
+		    conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
+			return -EPIPE;
+		if (smc_cdc_rxed_any_close(conn))
+			return send_done ?: -ECONNRESET;
+
+		if (!atomic_read(&conn->sndbuf_space)) {
+			rc = smc_tx_wait_memory(smc, msg->msg_flags);
+			if (rc) {
+				if (send_done)
+					return send_done;
+				goto out_err;
+			}
+			continue;
+		}
+
+		/* initialize variables for 1st iteration of subsequent loop */
+		/* could be just 1 byte, even after smc_tx_wait_memory above */
+		writespace = atomic_read(&conn->sndbuf_space);
+		/* not more than what user space asked for */
+		copylen = min_t(size_t, send_remaining, writespace);
+		/* determine start of sndbuf */
+		sndbuf_base = conn->sndbuf_desc->cpu_addr;
+		smc_curs_write(&prep,
+			       smc_curs_read(&conn->tx_curs_prep, conn),
+			       conn);
+		tx_cnt_prep = prep.count;
+		/* determine chunks where to write into sndbuf */
+		/* either unwrapped case, or 1st chunk of wrapped case */
+		chunk_len = min_t(size_t,
+				  copylen, conn->sndbuf_size - tx_cnt_prep);
+		chunk_len_sum = chunk_len;
+		chunk_off = tx_cnt_prep;
+		for (chunk = 0; chunk < 2; chunk++) {
+			rc = memcpy_from_msg(sndbuf_base + chunk_off,
+					     msg, chunk_len);
+			if (rc) {
+				if (send_done)
+					return send_done;
+				goto out_err;
+			}
+			send_done += chunk_len;
+			send_remaining -= chunk_len;
+
+			if (chunk_len_sum == copylen)
+				break; /* either on 1st or 2nd iteration */
+			/* prepare next (== 2nd) iteration */
+			chunk_len = copylen - chunk_len; /* remainder */
+			chunk_len_sum += chunk_len;
+			chunk_off = 0; /* modulo offset in send ring buffer */
+		}
+		/* update cursors */
+		smc_curs_add(conn->sndbuf_size, &prep, copylen);
+		smc_curs_write(&conn->tx_curs_prep,
+			       smc_curs_read(&prep, conn),
+			       conn);
+		/* increased in send tasklet smc_cdc_tx_handler() */
+		smp_mb__before_atomic();
+		atomic_sub(copylen, &conn->sndbuf_space);
+		/* guarantee 0 <= sndbuf_space <= sndbuf_size */
+		smp_mb__after_atomic();
+		/* since we just produced more new data into sndbuf,
+		 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
+		 */
+		smc_tx_sndbuf_nonempty(conn);
+	} /* while (msg_data_left(msg)) */
+
+	return send_done;
+
+out_err:
+	rc = sk_stream_error(sk, msg->msg_flags, rc);
+	/* make sure we wake any epoll edge trigger waiter */
+	if (unlikely(rc == -EAGAIN))
+		sk->sk_write_space(sk);
+	return rc;
+}
+
+/***************************** sndbuf consumer *******************************/
+
+/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
+static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
+			     int num_sges, struct ib_sge sges[])
+{
+	struct smc_link_group *lgr = conn->lgr;
+	struct ib_send_wr *failed_wr = NULL;
+	struct ib_rdma_wr rdma_wr;
+	struct smc_link *link;
+	int rc;
+
+	memset(&rdma_wr, 0, sizeof(rdma_wr));
+	link = &lgr->lnk[SMC_SINGLE_LINK];
+	rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link);
+	rdma_wr.wr.sg_list = sges;
+	rdma_wr.wr.num_sge = num_sges;
+	rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
+	rdma_wr.remote_addr =
+		lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
+		/* RMBE within RMB */
+		((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) +
+		/* offset within RMBE */
+		peer_rmbe_offset;
+	rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
+	rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr);
+	if (rc)
+		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+	return rc;
+}
+
+/* sndbuf consumer */
+static inline void smc_tx_advance_cursors(struct smc_connection *conn,
+					  union smc_host_cursor *prod,
+					  union smc_host_cursor *sent,
+					  size_t len)
+{
+	smc_curs_add(conn->peer_rmbe_size, prod, len);
+	/* increased in recv tasklet smc_cdc_msg_rcv() */
+	smp_mb__before_atomic();
+	/* data in flight reduces usable snd_wnd */
+	atomic_sub(len, &conn->peer_rmbe_space);
+	/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+	smp_mb__after_atomic();
+	smc_curs_add(conn->sndbuf_size, sent, len);
+}
+
+/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
+ * usable snd_wnd as max transmit
+ */
+static int smc_tx_rdma_writes(struct smc_connection *conn)
+{
+	size_t src_off, src_len, dst_off, dst_len; /* current chunk values */
+	size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk;
+	union smc_host_cursor sent, prep, prod, cons;
+	struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
+	struct smc_link_group *lgr = conn->lgr;
+	int to_send, rmbespace;
+	struct smc_link *link;
+	int num_sges;
+	int rc;
+
+	/* source: sndbuf */
+	smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
+	smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+	/* cf. wmem_alloc - (snd_max - snd_una) */
+	to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+	if (to_send <= 0)
+		return 0;
+
+	/* destination: RMBE */
+	/* cf. snd_wnd */
+	rmbespace = atomic_read(&conn->peer_rmbe_space);
+	if (rmbespace <= 0)
+		return 0;
+	smc_curs_write(&prod,
+		       smc_curs_read(&conn->local_tx_ctrl.prod, conn),
+		       conn);
+	smc_curs_write(&cons,
+		       smc_curs_read(&conn->local_rx_ctrl.cons, conn),
+		       conn);
+
+	/* if usable snd_wnd closes ask peer to advertise once it opens again */
+	conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
+	/* cf. usable snd_wnd */
+	len = min(to_send, rmbespace);
+
+	/* initialize variables for first iteration of subsequent nested loop */
+	link = &lgr->lnk[SMC_SINGLE_LINK];
+	dst_off = prod.count;
+	if (prod.wrap == cons.wrap) {
+		/* the filled destination area is unwrapped,
+		 * hence the available free destination space is wrapped
+		 * and we need 2 destination chunks of sum len; start with 1st
+		 * which is limited by what's available in sndbuf
+		 */
+		dst_len = min_t(size_t,
+				conn->peer_rmbe_size - prod.count, len);
+	} else {
+		/* the filled destination area is wrapped,
+		 * hence the available free destination space is unwrapped
+		 * and we need a single destination chunk of entire len
+		 */
+		dst_len = len;
+	}
+	dst_len_sum = dst_len;
+	src_off = sent.count;
+	/* dst_len determines the maximum src_len */
+	if (sent.count + dst_len <= conn->sndbuf_size) {
+		/* unwrapped src case: single chunk of entire dst_len */
+		src_len = dst_len;
+	} else {
+		/* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
+		src_len = conn->sndbuf_size - sent.count;
+	}
+	src_len_sum = src_len;
+	for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+		num_sges = 0;
+		for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+			sges[srcchunk].addr =
+				conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] +
+				src_off;
+			sges[srcchunk].length = src_len;
+			sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
+			num_sges++;
+			src_off += src_len;
+			if (src_off >= conn->sndbuf_size)
+				src_off -= conn->sndbuf_size;
+						/* modulo in send ring */
+			if (src_len_sum == dst_len)
+				break; /* either on 1st or 2nd iteration */
+			/* prepare next (== 2nd) iteration */
+			src_len = dst_len - src_len; /* remainder */
+			src_len_sum += src_len;
+		}
+		rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
+		if (rc)
+			return rc;
+		if (dst_len_sum == len)
+			break; /* either on 1st or 2nd iteration */
+		/* prepare next (== 2nd) iteration */
+		dst_off = 0; /* modulo offset in RMBE ring buffer */
+		dst_len = len - dst_len; /* remainder */
+		dst_len_sum += dst_len;
+		src_len = min_t(int,
+				dst_len, conn->sndbuf_size - sent.count);
+		src_len_sum = src_len;
+	}
+
+	smc_tx_advance_cursors(conn, &prod, &sent, len);
+	/* update connection's cursors with advanced local cursors */
+	smc_curs_write(&conn->local_tx_ctrl.prod,
+		       smc_curs_read(&prod, conn),
+		       conn);
+							/* dst: peer RMBE */
+	smc_curs_write(&conn->tx_curs_sent,
+		       smc_curs_read(&sent, conn),
+		       conn);
+							/* src: local sndbuf */
+
+	return 0;
+}
+
+/* Wakeup sndbuf consumers from any context (IRQ or process)
+ * since there is more data to transmit; usable snd_wnd as max transmit
+ */
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+	struct smc_cdc_tx_pend *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	spin_lock_bh(&conn->send_lock);
+	rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
+				   &pend);
+	if (rc < 0) {
+		if (rc == -EBUSY) {
+			struct smc_sock *smc =
+				container_of(conn, struct smc_sock, conn);
+
+			if (smc->sk.sk_err == ECONNABORTED) {
+				rc = sock_error(&smc->sk);
+				goto out_unlock;
+			}
+			rc = 0;
+			schedule_work(&conn->tx_work);
+		}
+		goto out_unlock;
+	}
+
+	rc = smc_tx_rdma_writes(conn);
+	if (rc) {
+		smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+				   (struct smc_wr_tx_pend_priv *)pend);
+		goto out_unlock;
+	}
+
+	rc = smc_cdc_msg_send(conn, wr_buf, pend);
+
+out_unlock:
+	spin_unlock_bh(&conn->send_lock);
+	return rc;
+}
+
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit
+ */
+static void smc_tx_work(struct work_struct *work)
+{
+	struct smc_connection *conn = container_of(work,
+						   struct smc_connection,
+						   tx_work);
+	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+	lock_sock(&smc->sk);
+	smc_tx_sndbuf_nonempty(conn);
+	release_sock(&smc->sk);
+}
+
+void smc_tx_consumer_update(struct smc_connection *conn)
+{
+	union smc_host_cursor cfed, cons;
+	struct smc_cdc_tx_pend *pend;
+	struct smc_wr_buf *wr_buf;
+	int to_confirm, rc;
+
+	smc_curs_write(&cons,
+		       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+		       conn);
+	smc_curs_write(&cfed,
+		       smc_curs_read(&conn->rx_curs_confirmed, conn),
+		       conn);
+	to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
+
+	if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+	    ((to_confirm > conn->rmbe_update_limit) &&
+	     ((to_confirm > (conn->rmbe_size / 2)) ||
+	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
+		rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+					   &wr_buf, &pend);
+		if (!rc)
+			rc = smc_cdc_msg_send(conn, wr_buf, pend);
+		if (rc < 0) {
+			schedule_work(&conn->tx_work);
+			return;
+		}
+		smc_curs_write(&conn->rx_curs_confirmed,
+			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+			       conn);
+		conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
+	}
+	if (conn->local_rx_ctrl.prod_flags.write_blocked &&
+	    !atomic_read(&conn->bytes_to_rcv))
+		conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
+
+/***************************** send initialize *******************************/
+
+/* Initialize send properties on connection establishment. NB: not __init! */
+void smc_tx_init(struct smc_sock *smc)
+{
+	smc->sk.sk_write_space = smc_tx_write_space;
+	INIT_WORK(&smc->conn.tx_work, smc_tx_work);
+	spin_lock_init(&smc->conn.send_lock);
+}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
new file mode 100644
index 000000000000..1d6a0dcdcfe6
--- /dev/null
+++ b/net/smc/smc_tx.h
@@ -0,0 +1,35 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_TX_H
+#define SMC_TX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+#include "smc_cdc.h"
+
+static inline int smc_tx_prepared_sends(struct smc_connection *conn)
+{
+	union smc_host_cursor sent, prep;
+
+	smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
+	smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+	return smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+}
+
+void smc_tx_init(struct smc_sock *smc);
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
+void smc_tx_consumer_update(struct smc_connection *conn);
+
+#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
new file mode 100644
index 000000000000..eadf157418dc
--- /dev/null
+++ b/net/smc/smc_wr.c
@@ -0,0 +1,614 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Work requests (WR) of type ib_post_send or ib_post_recv respectively
+ * are submitted to either RC SQ or RC RQ respectively
+ * (reliably connected send/receive queue)
+ * and become work queue entries (WQEs).
+ * While an SQ WR/WQE is pending, we track it until transmission completion.
+ * Through a send or receive completion queue (CQ) respectively,
+ * we get completion queue entries (CQEs) [aka work completions (WCs)].
+ * Since the CQ callback is called from IRQ context, we split work by using
+ * bottom halves implemented by tasklets.
+ *
+ * SMC uses this to exchange LLC (link layer control)
+ * and CDC (connection data control) messages.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/hashtable.h>
+#include <linux/wait.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+
+#define SMC_WR_MAX_POLL_CQE 10	/* max. # of compl. queue elements in 1 poll */
+
+#define SMC_WR_RX_HASH_BITS 4
+static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
+static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
+
+struct smc_wr_tx_pend {	/* control data for a pending send request */
+	u64			wr_id;		/* work request id sent */
+	smc_wr_tx_handler	handler;
+	enum ib_wc_status	wc_status;	/* CQE status */
+	struct smc_link		*link;
+	u32			idx;
+	struct smc_wr_tx_pend_priv priv;
+};
+
+/******************************** send queue *********************************/
+
+/*------------------------------- completion --------------------------------*/
+
+static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
+{
+	u32 i;
+
+	for (i = 0; i < link->wr_tx_cnt; i++) {
+		if (link->wr_tx_pends[i].wr_id == wr_id)
+			return i;
+	}
+	return link->wr_tx_cnt;
+}
+
+static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
+{
+	struct smc_wr_tx_pend pnd_snd;
+	struct smc_link *link;
+	u32 pnd_snd_idx;
+	int i;
+
+	link = wc->qp->qp_context;
+	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
+	if (pnd_snd_idx == link->wr_tx_cnt)
+		return;
+	link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
+	memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
+	/* clear the full struct smc_wr_tx_pend including .priv */
+	memset(&link->wr_tx_pends[pnd_snd_idx], 0,
+	       sizeof(link->wr_tx_pends[pnd_snd_idx]));
+	memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
+	       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
+	if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
+		return;
+	if (wc->status) {
+		struct smc_link_group *lgr;
+
+		for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+			/* clear full struct smc_wr_tx_pend including .priv */
+			memset(&link->wr_tx_pends[i], 0,
+			       sizeof(link->wr_tx_pends[i]));
+			memset(&link->wr_tx_bufs[i], 0,
+			       sizeof(link->wr_tx_bufs[i]));
+			clear_bit(i, link->wr_tx_mask);
+		}
+		/* terminate connections of this link group abnormally */
+		lgr = container_of(link, struct smc_link_group,
+				   lnk[SMC_SINGLE_LINK]);
+		smc_lgr_terminate(lgr);
+	}
+	if (pnd_snd.handler)
+		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
+	wake_up(&link->wr_tx_wait);
+}
+
+static void smc_wr_tx_tasklet_fn(unsigned long data)
+{
+	struct smc_ib_device *dev = (struct smc_ib_device *)data;
+	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+	int i = 0, rc;
+	int polled = 0;
+
+again:
+	polled++;
+	do {
+		rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
+		if (polled == 1) {
+			ib_req_notify_cq(dev->roce_cq_send,
+					 IB_CQ_NEXT_COMP |
+					 IB_CQ_REPORT_MISSED_EVENTS);
+		}
+		if (!rc)
+			break;
+		for (i = 0; i < rc; i++)
+			smc_wr_tx_process_cqe(&wc[i]);
+	} while (rc > 0);
+	if (polled == 1)
+		goto again;
+}
+
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+	tasklet_schedule(&dev->send_tasklet);
+}
+
+/*---------------------------- request submission ---------------------------*/
+
+static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
+{
+	*idx = link->wr_tx_cnt;
+	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
+		if (!test_and_set_bit(*idx, link->wr_tx_mask))
+			return 0;
+	}
+	*idx = link->wr_tx_cnt;
+	return -EBUSY;
+}
+
+/**
+ * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
+ *			and sets info for pending transmit tracking
+ * @link:		Pointer to smc_link used to later send the message.
+ * @handler:		Send completion handler function pointer.
+ * @wr_buf:		Out value returns pointer to message buffer.
+ * @wr_pend_priv:	Out value returns pointer serving as handler context.
+ *
+ * Return: 0 on success, or -errno on error.
+ */
+int smc_wr_tx_get_free_slot(struct smc_link *link,
+			    smc_wr_tx_handler handler,
+			    struct smc_wr_buf **wr_buf,
+			    struct smc_wr_tx_pend_priv **wr_pend_priv)
+{
+	struct smc_wr_tx_pend *wr_pend;
+	struct ib_send_wr *wr_ib;
+	u64 wr_id;
+	u32 idx;
+	int rc;
+
+	*wr_buf = NULL;
+	*wr_pend_priv = NULL;
+	if (in_softirq()) {
+		rc = smc_wr_tx_get_free_slot_index(link, &idx);
+		if (rc)
+			return rc;
+	} else {
+		rc = wait_event_interruptible_timeout(
+			link->wr_tx_wait,
+			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
+			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
+		if (!rc) {
+			/* timeout - terminate connections */
+			struct smc_link_group *lgr;
+
+			lgr = container_of(link, struct smc_link_group,
+					   lnk[SMC_SINGLE_LINK]);
+			smc_lgr_terminate(lgr);
+			return -EPIPE;
+		}
+		if (rc == -ERESTARTSYS)
+			return -EINTR;
+		if (idx == link->wr_tx_cnt)
+			return -EPIPE;
+	}
+	wr_id = smc_wr_tx_get_next_wr_id(link);
+	wr_pend = &link->wr_tx_pends[idx];
+	wr_pend->wr_id = wr_id;
+	wr_pend->handler = handler;
+	wr_pend->link = link;
+	wr_pend->idx = idx;
+	wr_ib = &link->wr_tx_ibs[idx];
+	wr_ib->wr_id = wr_id;
+	*wr_buf = &link->wr_tx_bufs[idx];
+	*wr_pend_priv = &wr_pend->priv;
+	return 0;
+}
+
+int smc_wr_tx_put_slot(struct smc_link *link,
+		       struct smc_wr_tx_pend_priv *wr_pend_priv)
+{
+	struct smc_wr_tx_pend *pend;
+
+	pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
+	if (pend->idx < link->wr_tx_cnt) {
+		/* clear the full struct smc_wr_tx_pend including .priv */
+		memset(&link->wr_tx_pends[pend->idx], 0,
+		       sizeof(link->wr_tx_pends[pend->idx]));
+		memset(&link->wr_tx_bufs[pend->idx], 0,
+		       sizeof(link->wr_tx_bufs[pend->idx]));
+		test_and_clear_bit(pend->idx, link->wr_tx_mask);
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Send prepared WR slot via ib_post_send.
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
+ */
+int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
+{
+	struct ib_send_wr *failed_wr = NULL;
+	struct smc_wr_tx_pend *pend;
+	int rc;
+
+	ib_req_notify_cq(link->smcibdev->roce_cq_send,
+			 IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS);
+	pend = container_of(priv, struct smc_wr_tx_pend, priv);
+	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
+			  &failed_wr);
+	if (rc)
+		smc_wr_tx_put_slot(link, priv);
+	return rc;
+}
+
+void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
+			     smc_wr_tx_filter filter,
+			     smc_wr_tx_dismisser dismisser,
+			     unsigned long data)
+{
+	struct smc_wr_tx_pend_priv *tx_pend;
+	struct smc_wr_rx_hdr *wr_rx;
+	int i;
+
+	for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+		wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
+		if (wr_rx->type != wr_rx_hdr_type)
+			continue;
+		tx_pend = &link->wr_tx_pends[i].priv;
+		if (filter(tx_pend, data))
+			dismisser(tx_pend);
+	}
+}
+
+bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
+			   smc_wr_tx_filter filter, unsigned long data)
+{
+	struct smc_wr_tx_pend_priv *tx_pend;
+	struct smc_wr_rx_hdr *wr_rx;
+	int i;
+
+	for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+		wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
+		if (wr_rx->type != wr_rx_hdr_type)
+			continue;
+		tx_pend = &link->wr_tx_pends[i].priv;
+		if (filter(tx_pend, data))
+			return true;
+	}
+	return false;
+}
+
+/****************************** receive queue ********************************/
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
+{
+	struct smc_wr_rx_handler *h_iter;
+	int rc = 0;
+
+	spin_lock(&smc_wr_rx_hash_lock);
+	hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
+		if (h_iter->type == handler->type) {
+			rc = -EEXIST;
+			goto out_unlock;
+		}
+	}
+	hash_add(smc_wr_rx_hash, &handler->list, handler->type);
+out_unlock:
+	spin_unlock(&smc_wr_rx_hash_lock);
+	return rc;
+}
+
+/* Demultiplex a received work request based on the message type to its handler.
+ * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
+ * and not being modified any more afterwards so we don't need to lock it.
+ */
+static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
+{
+	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+	struct smc_wr_rx_handler *handler;
+	struct smc_wr_rx_hdr *wr_rx;
+	u64 temp_wr_id;
+	u32 index;
+
+	if (wc->byte_len < sizeof(*wr_rx))
+		return; /* short message */
+	temp_wr_id = wc->wr_id;
+	index = do_div(temp_wr_id, link->wr_rx_cnt);
+	wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
+	hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
+		if (handler->type == wr_rx->type)
+			handler->handler(wc, wr_rx);
+	}
+}
+
+static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
+{
+	struct smc_link *link;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		link = wc[i].qp->qp_context;
+		if (wc[i].status == IB_WC_SUCCESS) {
+			smc_wr_rx_demultiplex(&wc[i]);
+			smc_wr_rx_post(link); /* refill WR RX */
+		} else {
+			struct smc_link_group *lgr;
+
+			/* handle status errors */
+			switch (wc[i].status) {
+			case IB_WC_RETRY_EXC_ERR:
+			case IB_WC_RNR_RETRY_EXC_ERR:
+			case IB_WC_WR_FLUSH_ERR:
+				/* terminate connections of this link group
+				 * abnormally
+				 */
+				lgr = container_of(link, struct smc_link_group,
+						   lnk[SMC_SINGLE_LINK]);
+				smc_lgr_terminate(lgr);
+				break;
+			default:
+				smc_wr_rx_post(link); /* refill WR RX */
+				break;
+			}
+		}
+	}
+}
+
+static void smc_wr_rx_tasklet_fn(unsigned long data)
+{
+	struct smc_ib_device *dev = (struct smc_ib_device *)data;
+	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+	int polled = 0;
+	int rc;
+
+again:
+	polled++;
+	do {
+		memset(&wc, 0, sizeof(wc));
+		rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
+		if (polled == 1) {
+			ib_req_notify_cq(dev->roce_cq_recv,
+					 IB_CQ_SOLICITED_MASK
+					 | IB_CQ_REPORT_MISSED_EVENTS);
+		}
+		if (!rc)
+			break;
+		smc_wr_rx_process_cqes(&wc[0], rc);
+	} while (rc > 0);
+	if (polled == 1)
+		goto again;
+}
+
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+	tasklet_schedule(&dev->recv_tasklet);
+}
+
+int smc_wr_rx_post_init(struct smc_link *link)
+{
+	u32 i;
+	int rc = 0;
+
+	for (i = 0; i < link->wr_rx_cnt; i++)
+		rc = smc_wr_rx_post(link);
+	return rc;
+}
+
+/***************************** init, exit, misc ******************************/
+
+void smc_wr_remember_qp_attr(struct smc_link *lnk)
+{
+	struct ib_qp_attr *attr = &lnk->qp_attr;
+	struct ib_qp_init_attr init_attr;
+
+	memset(attr, 0, sizeof(*attr));
+	memset(&init_attr, 0, sizeof(init_attr));
+	ib_query_qp(lnk->roce_qp, attr,
+		    IB_QP_STATE |
+		    IB_QP_CUR_STATE |
+		    IB_QP_PKEY_INDEX |
+		    IB_QP_PORT |
+		    IB_QP_QKEY |
+		    IB_QP_AV |
+		    IB_QP_PATH_MTU |
+		    IB_QP_TIMEOUT |
+		    IB_QP_RETRY_CNT |
+		    IB_QP_RNR_RETRY |
+		    IB_QP_RQ_PSN |
+		    IB_QP_ALT_PATH |
+		    IB_QP_MIN_RNR_TIMER |
+		    IB_QP_SQ_PSN |
+		    IB_QP_PATH_MIG_STATE |
+		    IB_QP_CAP |
+		    IB_QP_DEST_QPN,
+		    &init_attr);
+
+	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
+			       lnk->qp_attr.cap.max_send_wr);
+	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
+			       lnk->qp_attr.cap.max_recv_wr);
+}
+
+static void smc_wr_init_sge(struct smc_link *lnk)
+{
+	u32 i;
+
+	for (i = 0; i < lnk->wr_tx_cnt; i++) {
+		lnk->wr_tx_sges[i].addr =
+			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
+		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
+		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+		lnk->wr_tx_ibs[i].next = NULL;
+		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
+		lnk->wr_tx_ibs[i].num_sge = 1;
+		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
+		lnk->wr_tx_ibs[i].send_flags =
+			IB_SEND_SIGNALED | IB_SEND_SOLICITED | IB_SEND_INLINE;
+	}
+	for (i = 0; i < lnk->wr_rx_cnt; i++) {
+		lnk->wr_rx_sges[i].addr =
+			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
+		lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
+		lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+		lnk->wr_rx_ibs[i].next = NULL;
+		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
+		lnk->wr_rx_ibs[i].num_sge = 1;
+	}
+}
+
+void smc_wr_free_link(struct smc_link *lnk)
+{
+	struct ib_device *ibdev;
+
+	memset(lnk->wr_tx_mask, 0,
+	       BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+
+	if (!lnk->smcibdev)
+		return;
+	ibdev = lnk->smcibdev->ibdev;
+
+	if (lnk->wr_rx_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+				    DMA_FROM_DEVICE);
+		lnk->wr_rx_dma_addr = 0;
+	}
+	if (lnk->wr_tx_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
+				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+				    DMA_TO_DEVICE);
+		lnk->wr_tx_dma_addr = 0;
+	}
+}
+
+void smc_wr_free_link_mem(struct smc_link *lnk)
+{
+	kfree(lnk->wr_tx_pends);
+	lnk->wr_tx_pends = NULL;
+	kfree(lnk->wr_tx_mask);
+	lnk->wr_tx_mask = NULL;
+	kfree(lnk->wr_tx_sges);
+	lnk->wr_tx_sges = NULL;
+	kfree(lnk->wr_rx_sges);
+	lnk->wr_rx_sges = NULL;
+	kfree(lnk->wr_rx_ibs);
+	lnk->wr_rx_ibs = NULL;
+	kfree(lnk->wr_tx_ibs);
+	lnk->wr_tx_ibs = NULL;
+	kfree(lnk->wr_tx_bufs);
+	lnk->wr_tx_bufs = NULL;
+	kfree(lnk->wr_rx_bufs);
+	lnk->wr_rx_bufs = NULL;
+}
+
+int smc_wr_alloc_link_mem(struct smc_link *link)
+{
+	/* allocate link related memory */
+	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
+	if (!link->wr_tx_bufs)
+		goto no_mem;
+	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
+				   GFP_KERNEL);
+	if (!link->wr_rx_bufs)
+		goto no_mem_wr_tx_bufs;
+	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
+				  GFP_KERNEL);
+	if (!link->wr_tx_ibs)
+		goto no_mem_wr_rx_bufs;
+	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
+				  sizeof(link->wr_rx_ibs[0]),
+				  GFP_KERNEL);
+	if (!link->wr_rx_ibs)
+		goto no_mem_wr_tx_ibs;
+	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
+				   GFP_KERNEL);
+	if (!link->wr_tx_sges)
+		goto no_mem_wr_rx_ibs;
+	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
+				   sizeof(link->wr_rx_sges[0]),
+				   GFP_KERNEL);
+	if (!link->wr_rx_sges)
+		goto no_mem_wr_tx_sges;
+	link->wr_tx_mask = kzalloc(
+		BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask),
+		GFP_KERNEL);
+	if (!link->wr_tx_mask)
+		goto no_mem_wr_rx_sges;
+	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
+				    sizeof(link->wr_tx_pends[0]),
+				    GFP_KERNEL);
+	if (!link->wr_tx_pends)
+		goto no_mem_wr_tx_mask;
+	return 0;
+
+no_mem_wr_tx_mask:
+	kfree(link->wr_tx_mask);
+no_mem_wr_rx_sges:
+	kfree(link->wr_rx_sges);
+no_mem_wr_tx_sges:
+	kfree(link->wr_tx_sges);
+no_mem_wr_rx_ibs:
+	kfree(link->wr_rx_ibs);
+no_mem_wr_tx_ibs:
+	kfree(link->wr_tx_ibs);
+no_mem_wr_rx_bufs:
+	kfree(link->wr_rx_bufs);
+no_mem_wr_tx_bufs:
+	kfree(link->wr_tx_bufs);
+no_mem:
+	return -ENOMEM;
+}
+
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
+{
+	tasklet_kill(&smcibdev->recv_tasklet);
+	tasklet_kill(&smcibdev->send_tasklet);
+}
+
+void smc_wr_add_dev(struct smc_ib_device *smcibdev)
+{
+	tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
+		     (unsigned long)smcibdev);
+	tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
+		     (unsigned long)smcibdev);
+}
+
+int smc_wr_create_link(struct smc_link *lnk)
+{
+	struct ib_device *ibdev = lnk->smcibdev->ibdev;
+	int rc = 0;
+
+	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
+	lnk->wr_rx_id = 0;
+	lnk->wr_rx_dma_addr = ib_dma_map_single(
+		ibdev, lnk->wr_rx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+		DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
+		lnk->wr_rx_dma_addr = 0;
+		rc = -EIO;
+		goto out;
+	}
+	lnk->wr_tx_dma_addr = ib_dma_map_single(
+		ibdev, lnk->wr_tx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+		DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
+		rc = -EIO;
+		goto dma_unmap;
+	}
+	smc_wr_init_sge(lnk);
+	memset(lnk->wr_tx_mask, 0,
+	       BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+	return rc;
+
+dma_unmap:
+	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+			    DMA_FROM_DEVICE);
+	lnk->wr_rx_dma_addr = 0;
+out:
+	return rc;
+}
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
new file mode 100644
index 000000000000..0b9beeda6053
--- /dev/null
+++ b/net/smc/smc_wr.h
@@ -0,0 +1,106 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s):  Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_WR_H
+#define SMC_WR_H
+
+#include <linux/atomic.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+#define SMC_WR_MAX_CQE 32768	/* max. # of completion queue elements */
+#define SMC_WR_BUF_CNT 16	/* # of ctrl buffers per link */
+
+#define SMC_WR_TX_WAIT_FREE_SLOT_TIME	(10 * HZ)
+#define SMC_WR_TX_WAIT_PENDING_TIME	(5 * HZ)
+
+#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
+
+#define SMC_WR_TX_PEND_PRIV_SIZE 32
+
+struct smc_wr_tx_pend_priv {
+	u8			priv[SMC_WR_TX_PEND_PRIV_SIZE];
+};
+
+typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
+				  struct smc_link *,
+				  enum ib_wc_status);
+
+typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *,
+				 unsigned long);
+
+typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *);
+
+struct smc_wr_rx_handler {
+	struct hlist_node	list;	/* hash table collision resolution */
+	void			(*handler)(struct ib_wc *, void *);
+	u8			type;
+};
+
+/* Only used by RDMA write WRs.
+ * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
+ */
+static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
+{
+	return atomic_long_inc_return(&link->wr_tx_id);
+}
+
+static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
+{
+	atomic_long_set(wr_tx_id, val);
+}
+
+/* post a new receive work request to fill a completed old work request entry */
+static inline int smc_wr_rx_post(struct smc_link *link)
+{
+	struct ib_recv_wr *bad_recv_wr = NULL;
+	int rc;
+	u64 wr_id, temp_wr_id;
+	u32 index;
+
+	wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
+	temp_wr_id = wr_id;
+	index = do_div(temp_wr_id, link->wr_rx_cnt);
+	link->wr_rx_ibs[index].wr_id = wr_id;
+	rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr);
+	return rc;
+}
+
+int smc_wr_create_link(struct smc_link *lnk);
+int smc_wr_alloc_link_mem(struct smc_link *lnk);
+void smc_wr_free_link(struct smc_link *lnk);
+void smc_wr_free_link_mem(struct smc_link *lnk);
+void smc_wr_remember_qp_attr(struct smc_link *lnk);
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
+void smc_wr_add_dev(struct smc_ib_device *smcibdev);
+
+int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
+			    struct smc_wr_buf **wr_buf,
+			    struct smc_wr_tx_pend_priv **wr_pend_priv);
+int smc_wr_tx_put_slot(struct smc_link *link,
+		       struct smc_wr_tx_pend_priv *wr_pend_priv);
+int smc_wr_tx_send(struct smc_link *link,
+		   struct smc_wr_tx_pend_priv *wr_pend_priv);
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
+			   smc_wr_tx_filter filter, unsigned long data);
+void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
+			     smc_wr_tx_filter filter,
+			     smc_wr_tx_dismisser dismisser,
+			     unsigned long data);
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
+int smc_wr_rx_post_init(struct smc_link *link);
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+
+#endif /* SMC_WR_H */
diff --git a/net/socket.c b/net/socket.c
index 0758e13754e2..2c1e8677ff2d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -287,7 +287,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static void init_inodecache(void)
 {
 	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
 					      sizeof(struct socket_alloc),
@@ -296,9 +296,7 @@ static int init_inodecache(void)
 					       SLAB_RECLAIM_ACCOUNT |
 					       SLAB_MEM_SPREAD | SLAB_ACCOUNT),
 					      init_once);
-	if (sock_inode_cachep == NULL)
-		return -ENOMEM;
-	return 0;
+	BUG_ON(sock_inode_cachep == NULL);
 }
 
 static const struct super_operations sockfs_ops = {
@@ -1948,6 +1946,8 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 		ctl_buf = msg_sys->msg_control;
 		ctl_len = msg_sys->msg_controllen;
 	} else if (ctl_len) {
+		BUILD_BUG_ON(sizeof(struct cmsghdr) !=
+			     CMSG_ALIGN(sizeof(struct cmsghdr)));
 		if (ctl_len > sizeof(ctl)) {
 			ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
 			if (ctl_buf == NULL)
@@ -2228,8 +2228,10 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 		return err;
 
 	err = sock_error(sock->sk);
-	if (err)
+	if (err) {
+		datagrams = err;
 		goto out_put;
+	}
 
 	entry = mmsg;
 	compat_entry = (struct compat_mmsghdr __user *)mmsg;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index cdeb1d814833..4f16953e4954 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -763,7 +763,7 @@ err_put_ctx:
 err:
 	kfree(buf);
 out:
-	dprintk("RPC:       %s returning %Zd\n", __func__, err);
+	dprintk("RPC:       %s returning %zd\n", __func__, err);
 	return err;
 }
 
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index f5d6c60aae26..d8639da06d9c 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1369,7 +1369,7 @@ static int c_show(struct seq_file *m, void *p)
 	ifdebug(CACHE)
 		seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
 			   convert_to_wallclock(cp->expiry_time),
-			   atomic_read(&cp->ref.refcount), cp->flags);
+			   kref_read(&cp->ref), cp->flags);
 	cache_get(cp);
 	if (cache_check(cd, cp, NULL))
 		/* cache_check does a cache_put on failure */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 8492acb4011f..b94efd93d3e4 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1282,7 +1282,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 	return 0;
 
 err_short_len:
-	svc_printk(rqstp, "short len %Zd, dropping request\n",
+	svc_printk(rqstp, "short len %zd, dropping request\n",
 			argv->iov_len);
 	goto close;
 
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 9c9db55a0c1e..7bfe1fb42add 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
 		svc_xprt_get(xprt);
 
 		dprintk("svc: transport %p dequeued, inuse=%d\n",
-			xprt, atomic_read(&xprt->xpt_ref.refcount));
+			xprt, kref_read(&xprt->xpt_ref));
 	}
 	spin_unlock_bh(&pool->sp_lock);
 out:
@@ -822,7 +822,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
 		/* XPT_DATA|XPT_DEFERRED case: */
 		dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
 			rqstp, rqstp->rq_pool->sp_id, xprt,
-			atomic_read(&xprt->xpt_ref.refcount));
+			kref_read(&xprt->xpt_ref));
 		rqstp->rq_deferred = svc_deferred_dequeue(xprt);
 		if (rqstp->rq_deferred)
 			len = svc_deferred_recv(rqstp);
@@ -980,7 +980,7 @@ static void svc_age_temp_xprts(unsigned long closure)
 		 * through, close it. */
 		if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
 			continue;
-		if (atomic_read(&xprt->xpt_ref.refcount) > 1 ||
+		if (kref_read(&xprt->xpt_ref) > 1 ||
 		    test_bit(XPT_BUSY, &xprt->xpt_flags))
 			continue;
 		list_del_init(le);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index e112da8005b5..bb8db3cb8032 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -126,13 +126,18 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister);
 static struct hlist_head	auth_domain_table[DN_HASHMAX];
 static DEFINE_SPINLOCK(auth_domain_lock);
 
+static void auth_domain_release(struct kref *kref)
+{
+	struct auth_domain *dom = container_of(kref, struct auth_domain, ref);
+
+	hlist_del(&dom->hash);
+	dom->flavour->domain_release(dom);
+	spin_unlock(&auth_domain_lock);
+}
+
 void auth_domain_put(struct auth_domain *dom)
 {
-	if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) {
-		hlist_del(&dom->hash);
-		dom->flavour->domain_release(dom);
-		spin_unlock(&auth_domain_lock);
-	}
+	kref_put_lock(&dom->ref, auth_domain_release, &auth_domain_lock);
 }
 EXPORT_SYMBOL_GPL(auth_domain_put);
 
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 1956b8b96b2d..8931e33b6541 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -278,7 +278,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
 			       rqstp->rq_respages[0], tailoff);
 
 out:
-	dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
+	dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n",
 		svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
 		xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
 
@@ -346,7 +346,7 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
 	if (len == buflen)
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
-	dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
+	dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n",
 		svsk, iov[0].iov_base, iov[0].iov_len, len);
 	return len;
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index b1bc7a796782..ff1df40f0d26 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -4,6 +4,7 @@
  * Support for backward direction RPCs on RPC/RDMA (server-side).
  */
 
+#include <linux/module.h>
 #include <linux/sunrpc/svc_rdma.h>
 #include "xprt_rdma.h"
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 9d9e4e16080f..c13a5c35ce14 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1208,9 +1208,9 @@ static void __svc_rdma_free(struct work_struct *work)
 		ib_drain_qp(rdma->sc_qp);
 
 	/* We should only be called from kref_put */
-	if (atomic_read(&xprt->xpt_ref.refcount) != 0)
+	if (kref_read(&xprt->xpt_ref) != 0)
 		pr_err("svcrdma: sc_xprt still in use? (%d)\n",
-		       atomic_read(&xprt->xpt_ref.refcount));
+		       kref_read(&xprt->xpt_ref));
 
 	/*
 	 * Destroy queued, but not processed read completions. Note
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index af392d9b9cec..956c7bce80d1 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1188,7 +1188,7 @@ static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_r
 	char *p;
 
 	len = sizeof(transport->tcp_xid) - transport->tcp_offset;
-	dprintk("RPC:       reading XID (%Zu bytes)\n", len);
+	dprintk("RPC:       reading XID (%zu bytes)\n", len);
 	p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
 	used = xdr_skb_read_bits(desc, p, len);
 	transport->tcp_offset += used;
@@ -1219,7 +1219,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
 	 */
 	offset = transport->tcp_offset - sizeof(transport->tcp_xid);
 	len = sizeof(transport->tcp_calldir) - offset;
-	dprintk("RPC:       reading CALL/REPLY flag (%Zu bytes)\n", len);
+	dprintk("RPC:       reading CALL/REPLY flag (%zu bytes)\n", len);
 	p = ((char *) &transport->tcp_calldir) + offset;
 	used = xdr_skb_read_bits(desc, p, len);
 	transport->tcp_offset += used;
@@ -1310,7 +1310,7 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
 		return;
 	}
 
-	dprintk("RPC:       XID %08x read %Zd bytes\n",
+	dprintk("RPC:       XID %08x read %zd bytes\n",
 			ntohl(transport->tcp_xid), r);
 	dprintk("RPC:       xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
 			"tcp_reclen = %u\n", xprt, transport->tcp_copied,
@@ -1456,7 +1456,7 @@ static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_s
 	desc->count -= len;
 	desc->offset += len;
 	transport->tcp_offset += len;
-	dprintk("RPC:       discarded %Zu bytes\n", len);
+	dprintk("RPC:       discarded %zu bytes\n", len);
 	xs_tcp_check_fraghdr(transport);
 }
 
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index aa1babbea385..7d99029df342 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -1,7 +1,7 @@
 /*
  * net/tipc/bcast.c: TIPC broadcast code
  *
- * Copyright (c) 2004-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2004-2006, 2014-2016, Ericsson AB
  * Copyright (c) 2004, Intel Corporation.
  * Copyright (c) 2005, 2010-2011, Wind River Systems
  * All rights reserved.
@@ -39,9 +39,8 @@
 #include "socket.h"
 #include "msg.h"
 #include "bcast.h"
-#include "name_distr.h"
 #include "link.h"
-#include "node.h"
+#include "name_table.h"
 
 #define	BCLINK_WIN_DEFAULT	50	/* bcast link window size (default) */
 #define	BCLINK_WIN_MIN	        32	/* bcast minimum link window size */
@@ -54,12 +53,20 @@ const char tipc_bclink_name[] = "broadcast-link";
  * @inputq: data input queue; will only carry SOCK_WAKEUP messages
  * @dest: array keeping number of reachable destinations per bearer
  * @primary_bearer: a bearer having links to all broadcast destinations, if any
+ * @bcast_support: indicates if primary bearer, if any, supports broadcast
+ * @rcast_support: indicates if all peer nodes support replicast
+ * @rc_ratio: dest count as percentage of cluster size where send method changes
+ * @bc_threshold: calculated drom rc_ratio; if dests > threshold use broadcast
  */
 struct tipc_bc_base {
 	struct tipc_link *link;
 	struct sk_buff_head inputq;
 	int dests[MAX_BEARERS];
 	int primary_bearer;
+	bool bcast_support;
+	bool rcast_support;
+	int rc_ratio;
+	int bc_threshold;
 };
 
 static struct tipc_bc_base *tipc_bc_base(struct net *net)
@@ -69,7 +76,20 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net)
 
 int tipc_bcast_get_mtu(struct net *net)
 {
-	return tipc_link_mtu(tipc_bc_sndlink(net));
+	return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE;
+}
+
+void tipc_bcast_disable_rcast(struct net *net)
+{
+	tipc_bc_base(net)->rcast_support = false;
+}
+
+static void tipc_bcbase_calc_bc_threshold(struct net *net)
+{
+	struct tipc_bc_base *bb = tipc_bc_base(net);
+	int cluster_size = tipc_link_bc_peers(tipc_bc_sndlink(net));
+
+	bb->bc_threshold = 1 + (cluster_size * bb->rc_ratio / 100);
 }
 
 /* tipc_bcbase_select_primary(): find a bearer with links to all destinations,
@@ -79,9 +99,10 @@ static void tipc_bcbase_select_primary(struct net *net)
 {
 	struct tipc_bc_base *bb = tipc_bc_base(net);
 	int all_dests =  tipc_link_bc_peers(bb->link);
-	int i, mtu;
+	int i, mtu, prim;
 
 	bb->primary_bearer = INVALID_BEARER_ID;
+	bb->bcast_support = true;
 
 	if (!all_dests)
 		return;
@@ -93,7 +114,7 @@ static void tipc_bcbase_select_primary(struct net *net)
 		mtu = tipc_bearer_mtu(net, i);
 		if (mtu < tipc_link_mtu(bb->link))
 			tipc_link_set_mtu(bb->link, mtu);
-
+		bb->bcast_support &= tipc_bearer_bcast_support(net, i);
 		if (bb->dests[i] < all_dests)
 			continue;
 
@@ -103,6 +124,9 @@ static void tipc_bcbase_select_primary(struct net *net)
 		if ((i ^ tipc_own_addr(net)) & 1)
 			break;
 	}
+	prim = bb->primary_bearer;
+	if (prim != INVALID_BEARER_ID)
+		bb->bcast_support = tipc_bearer_bcast_support(net, prim);
 }
 
 void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id)
@@ -170,45 +194,131 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
 	__skb_queue_purge(&_xmitq);
 }
 
-/* tipc_bcast_xmit - deliver buffer chain to all nodes in cluster
- *                    and to identified node local sockets
+static void tipc_bcast_select_xmit_method(struct net *net, int dests,
+					  struct tipc_mc_method *method)
+{
+	struct tipc_bc_base *bb = tipc_bc_base(net);
+	unsigned long exp = method->expires;
+
+	/* Broadcast supported by used bearer/bearers? */
+	if (!bb->bcast_support) {
+		method->rcast = true;
+		return;
+	}
+	/* Any destinations which don't support replicast ? */
+	if (!bb->rcast_support) {
+		method->rcast = false;
+		return;
+	}
+	/* Can current method be changed ? */
+	method->expires = jiffies + TIPC_METHOD_EXPIRE;
+	if (method->mandatory || time_before(jiffies, exp))
+		return;
+
+	/* Determine method to use now */
+	method->rcast = dests <= bb->bc_threshold;
+}
+
+/* tipc_bcast_xmit - broadcast the buffer chain to all external nodes
  * @net: the applicable net namespace
- * @list: chain of buffers containing message
- * Consumes the buffer chain, except when returning -ELINKCONG
- * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
+ * @pkts: chain of buffers containing message
+ * @cong_link_cnt: set to 1 if broadcast link is congested, otherwise 0
+ * Consumes the buffer chain.
+ * Returns 0 if success, otherwise errno: -EHOSTUNREACH,-EMSGSIZE
  */
-int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
+static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts,
+			   u16 *cong_link_cnt)
 {
 	struct tipc_link *l = tipc_bc_sndlink(net);
-	struct sk_buff_head xmitq, inputq, rcvq;
+	struct sk_buff_head xmitq;
 	int rc = 0;
 
-	__skb_queue_head_init(&rcvq);
 	__skb_queue_head_init(&xmitq);
-	skb_queue_head_init(&inputq);
-
-	/* Prepare message clone for local node */
-	if (unlikely(!tipc_msg_reassemble(list, &rcvq)))
-		return -EHOSTUNREACH;
-
 	tipc_bcast_lock(net);
 	if (tipc_link_bc_peers(l))
-		rc = tipc_link_xmit(l, list, &xmitq);
+		rc = tipc_link_xmit(l, pkts, &xmitq);
 	tipc_bcast_unlock(net);
-
-	/* Don't send to local node if adding to link failed */
-	if (unlikely(rc)) {
-		__skb_queue_purge(&rcvq);
-		return rc;
+	tipc_bcbase_xmit(net, &xmitq);
+	__skb_queue_purge(pkts);
+	if (rc == -ELINKCONG) {
+		*cong_link_cnt = 1;
+		rc = 0;
 	}
+	return rc;
+}
 
-	/* Broadcast to all nodes, inluding local node */
-	tipc_bcbase_xmit(net, &xmitq);
-	tipc_sk_mcast_rcv(net, &rcvq, &inputq);
-	__skb_queue_purge(list);
+/* tipc_rcast_xmit - replicate and send a message to given destination nodes
+ * @net: the applicable net namespace
+ * @pkts: chain of buffers containing message
+ * @dests: list of destination nodes
+ * @cong_link_cnt: returns number of congested links
+ * @cong_links: returns identities of congested links
+ * Returns 0 if success, otherwise errno
+ */
+static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts,
+			   struct tipc_nlist *dests, u16 *cong_link_cnt)
+{
+	struct sk_buff_head _pkts;
+	struct u32_item *n, *tmp;
+	u32 dst, selector;
+
+	selector = msg_link_selector(buf_msg(skb_peek(pkts)));
+	__skb_queue_head_init(&_pkts);
+
+	list_for_each_entry_safe(n, tmp, &dests->list, list) {
+		dst = n->value;
+		if (!tipc_msg_pskb_copy(dst, pkts, &_pkts))
+			return -ENOMEM;
+
+		/* Any other return value than -ELINKCONG is ignored */
+		if (tipc_node_xmit(net, &_pkts, dst, selector) == -ELINKCONG)
+			(*cong_link_cnt)++;
+	}
 	return 0;
 }
 
+/* tipc_mcast_xmit - deliver message to indicated destination nodes
+ *                   and to identified node local sockets
+ * @net: the applicable net namespace
+ * @pkts: chain of buffers containing message
+ * @method: send method to be used
+ * @dests: destination nodes for message.
+ * @cong_link_cnt: returns number of encountered congested destination links
+ * Consumes buffer chain.
+ * Returns 0 if success, otherwise errno
+ */
+int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
+		    struct tipc_mc_method *method, struct tipc_nlist *dests,
+		    u16 *cong_link_cnt)
+{
+	struct sk_buff_head inputq, localq;
+	int rc = 0;
+
+	skb_queue_head_init(&inputq);
+	skb_queue_head_init(&localq);
+
+	/* Clone packets before they are consumed by next call */
+	if (dests->local && !tipc_msg_reassemble(pkts, &localq)) {
+		rc = -ENOMEM;
+		goto exit;
+	}
+	/* Send according to determined transmit method */
+	if (dests->remote) {
+		tipc_bcast_select_xmit_method(net, dests->remote, method);
+		if (method->rcast)
+			rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt);
+		else
+			rc = tipc_bcast_xmit(net, pkts, cong_link_cnt);
+	}
+
+	if (dests->local)
+		tipc_sk_mcast_rcv(net, &localq, &inputq);
+exit:
+	/* This queue should normally be empty by now */
+	__skb_queue_purge(pkts);
+	return rc;
+}
+
 /* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link
  *
  * RCU is locked, no other locks set
@@ -313,6 +423,7 @@ void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l,
 	tipc_bcast_lock(net);
 	tipc_link_add_bc_peer(snd_l, uc_l, xmitq);
 	tipc_bcbase_select_primary(net);
+	tipc_bcbase_calc_bc_threshold(net);
 	tipc_bcast_unlock(net);
 }
 
@@ -331,6 +442,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l)
 	tipc_bcast_lock(net);
 	tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq);
 	tipc_bcbase_select_primary(net);
+	tipc_bcbase_calc_bc_threshold(net);
 	tipc_bcast_unlock(net);
 
 	tipc_bcbase_xmit(net, &xmitq);
@@ -413,6 +525,8 @@ int tipc_bcast_init(struct net *net)
 		goto enomem;
 	bb->link = l;
 	tn->bcl = l;
+	bb->rc_ratio = 25;
+	bb->rcast_support = true;
 	return 0;
 enomem:
 	kfree(bb);
@@ -428,3 +542,33 @@ void tipc_bcast_stop(struct net *net)
 	kfree(tn->bcbase);
 	kfree(tn->bcl);
 }
+
+void tipc_nlist_init(struct tipc_nlist *nl, u32 self)
+{
+	memset(nl, 0, sizeof(*nl));
+	INIT_LIST_HEAD(&nl->list);
+	nl->self = self;
+}
+
+void tipc_nlist_add(struct tipc_nlist *nl, u32 node)
+{
+	if (node == nl->self)
+		nl->local = true;
+	else if (u32_push(&nl->list, node))
+		nl->remote++;
+}
+
+void tipc_nlist_del(struct tipc_nlist *nl, u32 node)
+{
+	if (node == nl->self)
+		nl->local = false;
+	else if (u32_del(&nl->list, node))
+		nl->remote--;
+}
+
+void tipc_nlist_purge(struct tipc_nlist *nl)
+{
+	u32_list_purge(&nl->list);
+	nl->remote = 0;
+	nl->local = 0;
+}
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 855d53c64ab3..751530ab0c49 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -42,9 +42,35 @@
 struct tipc_node;
 struct tipc_msg;
 struct tipc_nl_msg;
-struct tipc_node_map;
+struct tipc_nlist;
+struct tipc_nitem;
 extern const char tipc_bclink_name[];
 
+#define TIPC_METHOD_EXPIRE msecs_to_jiffies(5000)
+
+struct tipc_nlist {
+	struct list_head list;
+	u32 self;
+	u16 remote;
+	bool local;
+};
+
+void tipc_nlist_init(struct tipc_nlist *nl, u32 self);
+void tipc_nlist_purge(struct tipc_nlist *nl);
+void tipc_nlist_add(struct tipc_nlist *nl, u32 node);
+void tipc_nlist_del(struct tipc_nlist *nl, u32 node);
+
+/* Cookie to be used between socket and broadcast layer
+ * @rcast: replicast (instead of broadcast) was used at previous xmit
+ * @mandatory: broadcast/replicast indication was set by user
+ * @expires: re-evaluate non-mandatory transmit method if we are past this
+ */
+struct tipc_mc_method {
+	bool rcast;
+	bool mandatory;
+	unsigned long expires;
+};
+
 int tipc_bcast_init(struct net *net);
 void tipc_bcast_stop(struct net *net);
 void tipc_bcast_add_peer(struct net *net, struct tipc_link *l,
@@ -53,7 +79,10 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl);
 void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id);
 void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id);
 int  tipc_bcast_get_mtu(struct net *net);
-int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list);
+void tipc_bcast_disable_rcast(struct net *net);
+int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
+		    struct tipc_mc_method *method, struct tipc_nlist *dests,
+		    u16 *cong_link_cnt);
 int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb);
 void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l,
 			struct tipc_msg *hdr);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 52d74760fb68..33a5bdfbef76 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -431,7 +431,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
 	memset(&b->bcast_addr, 0, sizeof(b->bcast_addr));
 	memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len);
 	b->bcast_addr.media_id = b->media->type_id;
-	b->bcast_addr.broadcast = 1;
+	b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
 	b->mtu = dev->mtu;
 	b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr);
 	rcu_assign_pointer(dev->tipc_ptr, b);
@@ -482,6 +482,19 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
 	return 0;
 }
 
+bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id)
+{
+	bool supp = false;
+	struct tipc_bearer *b;
+
+	rcu_read_lock();
+	b = bearer_get(net, bearer_id);
+	if (b)
+		supp = (b->bcast_addr.broadcast == TIPC_BROADCAST_SUPPORT);
+	rcu_read_unlock();
+	return supp;
+}
+
 int tipc_bearer_mtu(struct net *net, u32 bearer_id)
 {
 	int mtu = 0;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 278ff7f616f9..635c9086e19a 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -60,9 +60,14 @@
 #define TIPC_MEDIA_TYPE_IB	2
 #define TIPC_MEDIA_TYPE_UDP	3
 
-/* minimum bearer MTU */
+/* Minimum bearer MTU */
 #define TIPC_MIN_BEARER_MTU	(MAX_H_SIZE + INT_H_SIZE)
 
+/* Identifiers for distinguishing between broadcast/multicast and replicast
+ */
+#define TIPC_BROADCAST_SUPPORT  1
+#define TIPC_REPLICAST_SUPPORT  2
+
 /**
  * struct tipc_media_addr - destination address used by TIPC bearers
  * @value: address info (format defined by media)
@@ -210,6 +215,7 @@ int tipc_bearer_setup(void);
 void tipc_bearer_cleanup(void);
 void tipc_bearer_stop(struct net *net);
 int tipc_bearer_mtu(struct net *net, u32 bearer_id);
+bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id);
 void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
 			  struct sk_buff *skb,
 			  struct tipc_media_addr *dest);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 4e8647aef01c..ddd2dd6f77aa 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -515,6 +515,10 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
 	if (link_is_bc_sndlink(l))
 		l->state = LINK_ESTABLISHED;
 
+	/* Disable replicast if even a single peer doesn't support it */
+	if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST))
+		tipc_bcast_disable_rcast(net);
+
 	return true;
 }
 
@@ -776,60 +780,47 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
 
 /**
  * link_schedule_user - schedule a message sender for wakeup after congestion
- * @link: congested link
- * @list: message that was attempted sent
+ * @l: congested link
+ * @hdr: header of message that is being sent
  * Create pseudo msg to send back to user when congestion abates
- * Does not consume buffer list
  */
-static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
+static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
 {
-	struct tipc_msg *msg = buf_msg(skb_peek(list));
-	int imp = msg_importance(msg);
-	u32 oport = msg_origport(msg);
-	u32 addr = tipc_own_addr(link->net);
+	u32 dnode = tipc_own_addr(l->net);
+	u32 dport = msg_origport(hdr);
 	struct sk_buff *skb;
 
-	/* This really cannot happen...  */
-	if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
-		pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
-		return -ENOBUFS;
-	}
-	/* Non-blocking sender: */
-	if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
-		return -ELINKCONG;
-
 	/* Create and schedule wakeup pseudo message */
 	skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
-			      addr, addr, oport, 0, 0);
+			      dnode, l->addr, dport, 0, 0);
 	if (!skb)
 		return -ENOBUFS;
-	TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list);
-	TIPC_SKB_CB(skb)->chain_imp = imp;
-	skb_queue_tail(&link->wakeupq, skb);
-	link->stats.link_congs++;
+	msg_set_dest_droppable(buf_msg(skb), true);
+	TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr);
+	skb_queue_tail(&l->wakeupq, skb);
+	l->stats.link_congs++;
 	return -ELINKCONG;
 }
 
 /**
  * link_prepare_wakeup - prepare users for wakeup after congestion
- * @link: congested link
- * Move a number of waiting users, as permitted by available space in
- * the send queue, from link wait queue to node wait queue for wakeup
+ * @l: congested link
+ * Wake up a number of waiting users, as permitted by available space
+ * in the send queue
  */
 void link_prepare_wakeup(struct tipc_link *l)
 {
-	int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
-	int imp, lim;
 	struct sk_buff *skb, *tmp;
+	int imp, i = 0;
 
 	skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
 		imp = TIPC_SKB_CB(skb)->chain_imp;
-		lim = l->backlog[imp].limit;
-		pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
-		if ((pnd[imp] + l->backlog[imp].len) >= lim)
+		if (l->backlog[imp].len < l->backlog[imp].limit) {
+			skb_unlink(skb, &l->wakeupq);
+			skb_queue_tail(l->inputq, skb);
+		} else if (i++ > 10) {
 			break;
-		skb_unlink(skb, &l->wakeupq);
-		skb_queue_tail(l->inputq, skb);
+		}
 	}
 }
 
@@ -869,8 +860,7 @@ void tipc_link_reset(struct tipc_link *l)
  * @list: chain of buffers containing message
  * @xmitq: returned list of packets to be sent by caller
  *
- * Consumes the buffer chain, except when returning -ELINKCONG,
- * since the caller then may want to make more send attempts.
+ * Consumes the buffer chain.
  * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
  * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
  */
@@ -879,7 +869,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 {
 	struct tipc_msg *hdr = buf_msg(skb_peek(list));
 	unsigned int maxwin = l->window;
-	unsigned int i, imp = msg_importance(hdr);
+	int imp = msg_importance(hdr);
 	unsigned int mtu = l->mtu;
 	u16 ack = l->rcv_nxt - 1;
 	u16 seqno = l->snd_nxt;
@@ -888,19 +878,22 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 	struct sk_buff_head *backlogq = &l->backlogq;
 	struct sk_buff *skb, *_skb, *bskb;
 	int pkt_cnt = skb_queue_len(list);
+	int rc = 0;
 
-	/* Match msg importance against this and all higher backlog limits: */
-	if (!skb_queue_empty(backlogq)) {
-		for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
-			if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
-				return link_schedule_user(l, list);
-		}
-	}
 	if (unlikely(msg_size(hdr) > mtu)) {
 		skb_queue_purge(list);
 		return -EMSGSIZE;
 	}
 
+	/* Allow oversubscription of one data msg per source at congestion */
+	if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
+		if (imp == TIPC_SYSTEM_IMPORTANCE) {
+			pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
+			return -ENOBUFS;
+		}
+		rc = link_schedule_user(l, hdr);
+	}
+
 	if (pkt_cnt > 1) {
 		l->stats.sent_fragmented++;
 		l->stats.sent_fragments += pkt_cnt;
@@ -946,7 +939,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 		skb_queue_splice_tail_init(list, backlogq);
 	}
 	l->snd_nxt = seqno;
-	return 0;
+	return rc;
 }
 
 void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
@@ -1043,11 +1036,17 @@ int tipc_link_retrans(struct tipc_link *l, u16 from, u16 to,
 static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
 			    struct sk_buff_head *inputq)
 {
-	switch (msg_user(buf_msg(skb))) {
+	struct tipc_msg *hdr = buf_msg(skb);
+
+	switch (msg_user(hdr)) {
 	case TIPC_LOW_IMPORTANCE:
 	case TIPC_MEDIUM_IMPORTANCE:
 	case TIPC_HIGH_IMPORTANCE:
 	case TIPC_CRITICAL_IMPORTANCE:
+		if (unlikely(msg_type(hdr) == TIPC_MCAST_MSG)) {
+			skb_queue_tail(l->bc_rcvlink->inputq, skb);
+			return true;
+		}
 	case CONN_MANAGER:
 		skb_queue_tail(inputq, skb);
 		return true;
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index ab02d0742476..312ef7de57d7 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -607,6 +607,23 @@ error:
 	return false;
 }
 
+bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
+			struct sk_buff_head *cpy)
+{
+	struct sk_buff *skb, *_skb;
+
+	skb_queue_walk(msg, skb) {
+		_skb = pskb_copy(skb, GFP_ATOMIC);
+		if (!_skb) {
+			__skb_queue_purge(cpy);
+			return false;
+		}
+		msg_set_destnode(buf_msg(_skb), dst);
+		__skb_queue_tail(cpy, _skb);
+	}
+	return true;
+}
+
 /* tipc_skb_queue_sorted(); sort pkt into list according to sequence number
  * @list: list to be appended to
  * @seqno: sequence number of buffer to add
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 2c3dc38abf9c..c843fd2bc48d 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -98,8 +98,6 @@ struct tipc_skb_cb {
 	u32 bytes_read;
 	struct sk_buff *tail;
 	bool validated;
-	bool wakeup_pending;
-	u16 chain_sz;
 	u16 chain_imp;
 	u16 ackers;
 };
@@ -633,14 +631,11 @@ static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id)
 
 static inline u32 msg_link_selector(struct tipc_msg *m)
 {
+	if (msg_user(m) == MSG_FRAGMENTER)
+		m = (void *)msg_data(m);
 	return msg_bits(m, 4, 0, 1);
 }
 
-static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
-{
-	msg_set_bits(m, 4, 0, 1, n);
-}
-
 /*
  * Word 5
  */
@@ -837,6 +832,8 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
 		   int offset, int dsz, int mtu, struct sk_buff_head *list);
 bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
 bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
+bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
+			struct sk_buff_head *cpy);
 void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
 			     struct sk_buff *skb);
 
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index e190460fe0d3..9be6592e4a6f 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -608,7 +608,7 @@ not_found:
  * Returns non-zero if any off-node ports overlap
  */
 int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
-			      u32 limit, struct tipc_plist *dports)
+			      u32 limit, struct list_head *dports)
 {
 	struct name_seq *seq;
 	struct sub_seq *sseq;
@@ -633,7 +633,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
 		info = sseq->info;
 		list_for_each_entry(publ, &info->node_list, node_list) {
 			if (publ->scope <= limit)
-				tipc_plist_push(dports, publ->ref);
+				u32_push(dports, publ->ref);
 		}
 
 		if (info->cluster_list_size != info->node_list_size)
@@ -645,6 +645,39 @@ exit:
 	return res;
 }
 
+/* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes
+ * - Creates list of nodes that overlap the given multicast address
+ * - Determines if any node local ports overlap
+ */
+void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
+				   u32 upper, u32 domain,
+				   struct tipc_nlist *nodes)
+{
+	struct sub_seq *sseq, *stop;
+	struct publication *publ;
+	struct name_info *info;
+	struct name_seq *seq;
+
+	rcu_read_lock();
+	seq = nametbl_find_seq(net, type);
+	if (!seq)
+		goto exit;
+
+	spin_lock_bh(&seq->lock);
+	sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
+	stop = seq->sseqs + seq->first_free;
+	for (; sseq->lower <= upper && sseq != stop; sseq++) {
+		info = sseq->info;
+		list_for_each_entry(publ, &info->zone_list, zone_list) {
+			if (tipc_in_scope(domain, publ->node))
+				tipc_nlist_add(nodes, publ->node);
+		}
+	}
+	spin_unlock_bh(&seq->lock);
+exit:
+	rcu_read_unlock();
+}
+
 /*
  * tipc_nametbl_publish - add name publication to network name tables
  */
@@ -1022,40 +1055,79 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-void tipc_plist_push(struct tipc_plist *pl, u32 port)
+bool u32_find(struct list_head *l, u32 value)
 {
-	struct tipc_plist *nl;
+	struct u32_item *item;
 
-	if (likely(!pl->port)) {
-		pl->port = port;
-		return;
+	list_for_each_entry(item, l, list) {
+		if (item->value == value)
+			return true;
 	}
-	if (pl->port == port)
-		return;
-	list_for_each_entry(nl, &pl->list, list) {
-		if (nl->port == port)
-			return;
+	return false;
+}
+
+bool u32_push(struct list_head *l, u32 value)
+{
+	struct u32_item *item;
+
+	list_for_each_entry(item, l, list) {
+		if (item->value == value)
+			return false;
+	}
+	item = kmalloc(sizeof(*item), GFP_ATOMIC);
+	if (unlikely(!item))
+		return false;
+
+	item->value = value;
+	list_add(&item->list, l);
+	return true;
+}
+
+u32 u32_pop(struct list_head *l)
+{
+	struct u32_item *item;
+	u32 value = 0;
+
+	if (list_empty(l))
+		return 0;
+	item = list_first_entry(l, typeof(*item), list);
+	value = item->value;
+	list_del(&item->list);
+	kfree(item);
+	return value;
+}
+
+bool u32_del(struct list_head *l, u32 value)
+{
+	struct u32_item *item, *tmp;
+
+	list_for_each_entry_safe(item, tmp, l, list) {
+		if (item->value != value)
+			continue;
+		list_del(&item->list);
+		kfree(item);
+		return true;
 	}
-	nl = kmalloc(sizeof(*nl), GFP_ATOMIC);
-	if (nl) {
-		nl->port = port;
-		list_add(&nl->list, &pl->list);
+	return false;
+}
+
+void u32_list_purge(struct list_head *l)
+{
+	struct u32_item *item, *tmp;
+
+	list_for_each_entry_safe(item, tmp, l, list) {
+		list_del(&item->list);
+		kfree(item);
 	}
 }
 
-u32 tipc_plist_pop(struct tipc_plist *pl)
+int u32_list_len(struct list_head *l)
 {
-	struct tipc_plist *nl;
-	u32 port = 0;
+	struct u32_item *item;
+	int i = 0;
 
-	if (likely(list_empty(&pl->list))) {
-		port = pl->port;
-		pl->port = 0;
-		return port;
+	list_for_each_entry(item, l, list) {
+		i++;
 	}
-	nl = list_first_entry(&pl->list, typeof(*nl), list);
-	port = nl->port;
-	list_del(&nl->list);
-	kfree(nl);
-	return port;
+	return i;
 }
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 1524a73830f7..6ebdeb1d84a5 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -39,6 +39,7 @@
 
 struct tipc_subscription;
 struct tipc_plist;
+struct tipc_nlist;
 
 /*
  * TIPC name types reserved for internal TIPC use (both current and planned)
@@ -99,7 +100,10 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
 
 u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
 int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
-			      u32 limit, struct tipc_plist *dports);
+			      u32 limit, struct list_head *dports);
+void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
+				   u32 upper, u32 domain,
+				   struct tipc_nlist *nodes);
 struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
 					 u32 upper, u32 scope, u32 port_ref,
 					 u32 key);
@@ -116,18 +120,16 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
 int tipc_nametbl_init(struct net *net);
 void tipc_nametbl_stop(struct net *net);
 
-struct tipc_plist {
+struct u32_item {
 	struct list_head list;
-	u32 port;
+	u32 value;
 };
 
-static inline void tipc_plist_init(struct tipc_plist *pl)
-{
-	INIT_LIST_HEAD(&pl->list);
-	pl->port = 0;
-}
-
-void tipc_plist_push(struct tipc_plist *pl, u32 port);
-u32 tipc_plist_pop(struct tipc_plist *pl);
+bool u32_push(struct list_head *l, u32 value);
+u32 u32_pop(struct list_head *l);
+bool u32_find(struct list_head *l, u32 value);
+bool u32_del(struct list_head *l, u32 value);
+void u32_list_purge(struct list_head *l);
+int u32_list_len(struct list_head *l);
 
 #endif
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 28bf4feeb81c..ab8a2d5d1e32 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -110,6 +110,10 @@ int tipc_net_start(struct net *net, u32 addr)
 	char addr_string[16];
 
 	tn->own_addr = addr;
+
+	/* Ensure that the new address is visible before we reinit. */
+	smp_mb();
+
 	tipc_named_reinit(net);
 	tipc_sk_reinit(net);
 
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 27753325e06e..4512e83652b1 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1172,7 +1172,7 @@ msg_full:
  * @list: chain of buffers containing message
  * @dnode: address of destination node
  * @selector: a number used for deterministic link selection
- * Consumes the buffer chain, except when returning -ELINKCONG
+ * Consumes the buffer chain.
  * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF
  */
 int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
@@ -1211,10 +1211,10 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
 	spin_unlock_bh(&le->lock);
 	tipc_node_read_unlock(n);
 
-	if (likely(rc == 0))
-		tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
-	else if (rc == -ENOBUFS)
+	if (unlikely(rc == -ENOBUFS))
 		tipc_node_link_down(n, bearer_id, false);
+	else
+		tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
 
 	tipc_node_put(n);
 
@@ -1226,20 +1226,15 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
  * messages, which will not be rejected
  * The only exception is datagram messages rerouted after secondary
  * lookup, which are rare and safe to dispose of anyway.
- * TODO: Return real return value, and let callers use
- * tipc_wait_for_sendpkt() where applicable
  */
 int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
 		       u32 selector)
 {
 	struct sk_buff_head head;
-	int rc;
 
 	skb_queue_head_init(&head);
 	__skb_queue_tail(&head, skb);
-	rc = tipc_node_xmit(net, &head, dnode, selector);
-	if (rc == -ELINKCONG)
-		kfree_skb(skb);
+	tipc_node_xmit(net, &head, dnode, selector);
 	return 0;
 }
 
@@ -1267,6 +1262,19 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+static void tipc_node_mcast_rcv(struct tipc_node *n)
+{
+	struct tipc_bclink_entry *be = &n->bc_entry;
+
+	/* 'arrvq' is under inputq2's lock protection */
+	spin_lock_bh(&be->inputq2.lock);
+	spin_lock_bh(&be->inputq1.lock);
+	skb_queue_splice_tail_init(&be->inputq1, &be->arrvq);
+	spin_unlock_bh(&be->inputq1.lock);
+	spin_unlock_bh(&be->inputq2.lock);
+	tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2);
+}
+
 static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr,
 				  int bearer_id, struct sk_buff_head *xmitq)
 {
@@ -1340,15 +1348,8 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
 	if (!skb_queue_empty(&xmitq))
 		tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
 
-	/* Deliver. 'arrvq' is under inputq2's lock protection */
-	if (!skb_queue_empty(&be->inputq1)) {
-		spin_lock_bh(&be->inputq2.lock);
-		spin_lock_bh(&be->inputq1.lock);
-		skb_queue_splice_tail_init(&be->inputq1, &be->arrvq);
-		spin_unlock_bh(&be->inputq1.lock);
-		spin_unlock_bh(&be->inputq2.lock);
-		tipc_sk_mcast_rcv(net, &be->arrvq, &be->inputq2);
-	}
+	if (!skb_queue_empty(&be->inputq1))
+		tipc_node_mcast_rcv(n);
 
 	if (rc & TIPC_LINK_DOWN_EVT) {
 		/* Reception reassembly failure => reset all links to peer */
@@ -1504,19 +1505,21 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
 {
 	struct sk_buff_head xmitq;
 	struct tipc_node *n;
-	struct tipc_msg *hdr = buf_msg(skb);
-	int usr = msg_user(hdr);
+	struct tipc_msg *hdr;
 	int bearer_id = b->identity;
 	struct tipc_link_entry *le;
-	u16 bc_ack = msg_bcast_ack(hdr);
 	u32 self = tipc_own_addr(net);
-	int rc = 0;
+	int usr, rc = 0;
+	u16 bc_ack;
 
 	__skb_queue_head_init(&xmitq);
 
-	/* Ensure message is well-formed */
+	/* Ensure message is well-formed before touching the header */
 	if (unlikely(!tipc_msg_validate(skb)))
 		goto discard;
+	hdr = buf_msg(skb);
+	usr = msg_user(hdr);
+	bc_ack = msg_bcast_ack(hdr);
 
 	/* Handle arrival of discovery or broadcast packet */
 	if (unlikely(msg_non_seq(hdr))) {
@@ -1575,6 +1578,9 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
 	if (unlikely(!skb_queue_empty(&n->bc_entry.namedq)))
 		tipc_named_rcv(net, &n->bc_entry.namedq);
 
+	if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1)))
+		tipc_node_mcast_rcv(n);
+
 	if (!skb_queue_empty(&le->inputq))
 		tipc_sk_rcv(net, &le->inputq);
 
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 39ef54c1f2ad..898c22916984 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -47,11 +47,13 @@
 enum {
 	TIPC_BCAST_SYNCH      = (1 << 1),
 	TIPC_BCAST_STATE_NACK = (1 << 2),
-	TIPC_BLOCK_FLOWCTL    = (1 << 3)
+	TIPC_BLOCK_FLOWCTL    = (1 << 3),
+	TIPC_BCAST_RCAST      = (1 << 4)
 };
 
 #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
 				TIPC_BCAST_STATE_NACK | \
+				TIPC_BCAST_RCAST | \
 				TIPC_BLOCK_FLOWCTL)
 #define INVALID_BEARER_ID -1
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 800caaa699a1..6b09a778cc71 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -67,16 +67,19 @@ enum {
  * @max_pkt: maximum packet size "hint" used when building messages sent by port
  * @portid: unique port identity in TIPC socket hash table
  * @phdr: preformatted message header used when sending messages
+ * #cong_links: list of congested links
  * @publications: list of publications for port
+ * @blocking_link: address of the congested link we are currently sleeping on
  * @pub_count: total # of publications port has made during its lifetime
  * @probing_state:
  * @conn_timeout: the time we can wait for an unresponded setup request
  * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
- * @link_cong: non-zero if owner must sleep because of link congestion
+ * @cong_link_cnt: number of congested links
  * @sent_unacked: # messages sent by socket, and not yet acked by peer
  * @rcv_unacked: # messages read by user, but not yet acked back to peer
  * @peer: 'connected' peer for dgram/rdm
  * @node: hash table node
+ * @mc_method: cookie for use between socket and broadcast layer
  * @rcu: rcu struct for tipc_sock
  */
 struct tipc_sock {
@@ -87,13 +90,13 @@ struct tipc_sock {
 	u32 max_pkt;
 	u32 portid;
 	struct tipc_msg phdr;
-	struct list_head sock_list;
+	struct list_head cong_links;
 	struct list_head publications;
 	u32 pub_count;
 	uint conn_timeout;
 	atomic_t dupl_rcvcnt;
 	bool probe_unacked;
-	bool link_cong;
+	u16 cong_link_cnt;
 	u16 snt_unacked;
 	u16 snd_win;
 	u16 peer_caps;
@@ -101,6 +104,7 @@ struct tipc_sock {
 	u16 rcv_win;
 	struct sockaddr_tipc peer;
 	struct rhash_head node;
+	struct tipc_mc_method mc_method;
 	struct rcu_head rcu;
 };
 
@@ -110,7 +114,6 @@ static void tipc_write_space(struct sock *sk);
 static void tipc_sock_destruct(struct sock *sk);
 static int tipc_release(struct socket *sock);
 static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags);
-static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p);
 static void tipc_sk_timeout(unsigned long data);
 static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
 			   struct tipc_name_seq const *seq);
@@ -119,8 +122,7 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
 static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid);
 static int tipc_sk_insert(struct tipc_sock *tsk);
 static void tipc_sk_remove(struct tipc_sock *tsk);
-static int __tipc_send_stream(struct socket *sock, struct msghdr *m,
-			      size_t dsz);
+static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz);
 static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz);
 
 static const struct proto_ops packet_ops;
@@ -334,6 +336,49 @@ static int tipc_set_sk_state(struct sock *sk, int state)
 	return res;
 }
 
+static int tipc_sk_sock_err(struct socket *sock, long *timeout)
+{
+	struct sock *sk = sock->sk;
+	int err = sock_error(sk);
+	int typ = sock->type;
+
+	if (err)
+		return err;
+	if (typ == SOCK_STREAM || typ == SOCK_SEQPACKET) {
+		if (sk->sk_state == TIPC_DISCONNECTING)
+			return -EPIPE;
+		else if (!tipc_sk_connected(sk))
+			return -ENOTCONN;
+	}
+	if (!*timeout)
+		return -EAGAIN;
+	if (signal_pending(current))
+		return sock_intr_errno(*timeout);
+
+	return 0;
+}
+
+#define tipc_wait_for_cond(sock_, timeout_, condition_)			\
+({								        \
+	int rc_ = 0;							\
+	int done_ = 0;							\
+									\
+	while (!(condition_) && !done_) {				\
+		struct sock *sk_ = sock->sk;				\
+		DEFINE_WAIT_FUNC(wait_, woken_wake_function);		\
+									\
+		rc_ = tipc_sk_sock_err(sock_, timeout_);		\
+		if (rc_)						\
+			break;						\
+		prepare_to_wait(sk_sleep(sk_), &wait_,			\
+				TASK_INTERRUPTIBLE);			\
+		done_ = sk_wait_event(sk_, timeout_,			\
+				      (condition_), &wait_);		\
+		remove_wait_queue(sk_sleep(sk_), &wait_);		\
+	}								\
+	rc_;								\
+})
+
 /**
  * tipc_sk_create - create a TIPC socket
  * @net: network namespace (must be default network)
@@ -382,10 +427,9 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 	tsk = tipc_sk(sk);
 	tsk->max_pkt = MAX_PKT_DEFAULT;
 	INIT_LIST_HEAD(&tsk->publications);
+	INIT_LIST_HEAD(&tsk->cong_links);
 	msg = &tsk->phdr;
 	tn = net_generic(sock_net(sk), tipc_net_id);
-	tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
-		      NAMED_H_SIZE, 0);
 
 	/* Finish initializing socket data structures */
 	sock->ops = ops;
@@ -395,6 +439,13 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 		pr_warn("Socket create failed; port number exhausted\n");
 		return -EINVAL;
 	}
+
+	/* Ensure tsk is visible before we read own_addr. */
+	smp_mb();
+
+	tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
+		      NAMED_H_SIZE, 0);
+
 	msg_set_origport(msg, tsk->portid);
 	setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk);
 	sk->sk_shutdown = 0;
@@ -432,9 +483,14 @@ static void __tipc_shutdown(struct socket *sock, int error)
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
 	struct net *net = sock_net(sk);
+	long timeout = CONN_TIMEOUT_DEFAULT;
 	u32 dnode = tsk_peer_node(tsk);
 	struct sk_buff *skb;
 
+	/* Avoid that hi-prio shutdown msgs bypass msgs in link wakeup queue */
+	tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt &&
+					    !tsk_conn_cong(tsk)));
+
 	/* Reject all unreceived messages, except on an active connection
 	 * (which disconnects locally & sends a 'FIN+' to peer).
 	 */
@@ -505,7 +561,8 @@ static int tipc_release(struct socket *sock)
 
 	/* Reject any messages that accumulated in backlog queue */
 	release_sock(sk);
-
+	u32_list_purge(&tsk->cong_links);
+	tsk->cong_link_cnt = 0;
 	call_rcu(&tsk->rcu, tipc_sk_callback);
 	sock->sk = NULL;
 
@@ -648,7 +705,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 
 	switch (sk->sk_state) {
 	case TIPC_ESTABLISHED:
-		if (!tsk->link_cong && !tsk_conn_cong(tsk))
+		if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
 			mask |= POLLOUT;
 		/* fall thru' */
 	case TIPC_LISTEN:
@@ -657,7 +714,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 			mask |= (POLLIN | POLLRDNORM);
 		break;
 	case TIPC_OPEN:
-		if (!tsk->link_cong)
+		if (!tsk->cong_link_cnt)
 			mask |= POLLOUT;
 		if (tipc_sk_type_connectionless(sk) &&
 		    (!skb_queue_empty(&sk->sk_receive_queue)))
@@ -676,63 +733,60 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
  * @sock: socket structure
  * @seq: destination address
  * @msg: message to send
- * @dsz: total length of message data
- * @timeo: timeout to wait for wakeup
+ * @dlen: length of data to send
+ * @timeout: timeout to wait for wakeup
  *
  * Called from function tipc_sendmsg(), which has done all sanity checks
  * Returns the number of bytes sent on success, or errno
  */
 static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
-			  struct msghdr *msg, size_t dsz, long timeo)
+			  struct msghdr *msg, size_t dlen, long timeout)
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
+	struct tipc_msg *hdr = &tsk->phdr;
 	struct net *net = sock_net(sk);
-	struct tipc_msg *mhdr = &tsk->phdr;
-	struct sk_buff_head pktchain;
-	struct iov_iter save = msg->msg_iter;
-	uint mtu;
+	int mtu = tipc_bcast_get_mtu(net);
+	struct tipc_mc_method *method = &tsk->mc_method;
+	u32 domain = addr_domain(net, TIPC_CLUSTER_SCOPE);
+	struct sk_buff_head pkts;
+	struct tipc_nlist dsts;
 	int rc;
 
-	if (!timeo && tsk->link_cong)
-		return -ELINKCONG;
+	/* Block or return if any destination link is congested */
+	rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt);
+	if (unlikely(rc))
+		return rc;
 
-	msg_set_type(mhdr, TIPC_MCAST_MSG);
-	msg_set_lookup_scope(mhdr, TIPC_CLUSTER_SCOPE);
-	msg_set_destport(mhdr, 0);
-	msg_set_destnode(mhdr, 0);
-	msg_set_nametype(mhdr, seq->type);
-	msg_set_namelower(mhdr, seq->lower);
-	msg_set_nameupper(mhdr, seq->upper);
-	msg_set_hdr_sz(mhdr, MCAST_H_SIZE);
+	/* Lookup destination nodes */
+	tipc_nlist_init(&dsts, tipc_own_addr(net));
+	tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower,
+				      seq->upper, domain, &dsts);
+	if (!dsts.local && !dsts.remote)
+		return -EHOSTUNREACH;
 
-	skb_queue_head_init(&pktchain);
+	/* Build message header */
+	msg_set_type(hdr, TIPC_MCAST_MSG);
+	msg_set_hdr_sz(hdr, MCAST_H_SIZE);
+	msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE);
+	msg_set_destport(hdr, 0);
+	msg_set_destnode(hdr, 0);
+	msg_set_nametype(hdr, seq->type);
+	msg_set_namelower(hdr, seq->lower);
+	msg_set_nameupper(hdr, seq->upper);
 
-new_mtu:
-	mtu = tipc_bcast_get_mtu(net);
-	rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain);
-	if (unlikely(rc < 0))
-		return rc;
+	/* Build message as chain of buffers */
+	skb_queue_head_init(&pkts);
+	rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts);
 
-	do {
-		rc = tipc_bcast_xmit(net, &pktchain);
-		if (likely(!rc))
-			return dsz;
-
-		if (rc == -ELINKCONG) {
-			tsk->link_cong = 1;
-			rc = tipc_wait_for_sndmsg(sock, &timeo);
-			if (!rc)
-				continue;
-		}
-		__skb_queue_purge(&pktchain);
-		if (rc == -EMSGSIZE) {
-			msg->msg_iter = save;
-			goto new_mtu;
-		}
-		break;
-	} while (1);
-	return rc;
+	/* Send message if build was successful */
+	if (unlikely(rc == dlen))
+		rc = tipc_mcast_xmit(net, &pkts, method, &dsts,
+				     &tsk->cong_link_cnt);
+
+	tipc_nlist_purge(&dsts);
+
+	return rc ? rc : dlen;
 }
 
 /**
@@ -746,7 +800,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 		       struct sk_buff_head *inputq)
 {
 	struct tipc_msg *msg;
-	struct tipc_plist dports;
+	struct list_head dports;
 	u32 portid;
 	u32 scope = TIPC_CLUSTER_SCOPE;
 	struct sk_buff_head tmpq;
@@ -754,7 +808,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 	struct sk_buff *skb, *_skb;
 
 	__skb_queue_head_init(&tmpq);
-	tipc_plist_init(&dports);
+	INIT_LIST_HEAD(&dports);
 
 	skb = tipc_skb_peek(arrvq, &inputq->lock);
 	for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) {
@@ -768,8 +822,8 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 		tipc_nametbl_mc_translate(net,
 					  msg_nametype(msg), msg_namelower(msg),
 					  msg_nameupper(msg), scope, &dports);
-		portid = tipc_plist_pop(&dports);
-		for (; portid; portid = tipc_plist_pop(&dports)) {
+		portid = u32_pop(&dports);
+		for (; portid; portid = u32_pop(&dports)) {
 			_skb = __pskb_copy(skb, hsz, GFP_ATOMIC);
 			if (_skb) {
 				msg_set_destport(buf_msg(_skb), portid);
@@ -830,31 +884,6 @@ exit:
 	kfree_skb(skb);
 }
 
-static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p)
-{
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-	struct sock *sk = sock->sk;
-	struct tipc_sock *tsk = tipc_sk(sk);
-	int done;
-
-	do {
-		int err = sock_error(sk);
-		if (err)
-			return err;
-		if (sk->sk_shutdown & SEND_SHUTDOWN)
-			return -EPIPE;
-		if (!*timeo_p)
-			return -EAGAIN;
-		if (signal_pending(current))
-			return sock_intr_errno(*timeo_p);
-
-		add_wait_queue(sk_sleep(sk), &wait);
-		done = sk_wait_event(sk, timeo_p, !tsk->link_cong, &wait);
-		remove_wait_queue(sk_sleep(sk), &wait);
-	} while (!done);
-	return 0;
-}
-
 /**
  * tipc_sendmsg - send message in connectionless manner
  * @sock: socket structure
@@ -881,35 +910,38 @@ static int tipc_sendmsg(struct socket *sock,
 	return ret;
 }
 
-static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
+static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
 {
-	DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
 	struct sock *sk = sock->sk;
-	struct tipc_sock *tsk = tipc_sk(sk);
 	struct net *net = sock_net(sk);
-	struct tipc_msg *mhdr = &tsk->phdr;
-	u32 dnode, dport;
-	struct sk_buff_head pktchain;
-	bool is_connectionless = tipc_sk_type_connectionless(sk);
-	struct sk_buff *skb;
+	struct tipc_sock *tsk = tipc_sk(sk);
+	DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
+	long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
+	struct list_head *clinks = &tsk->cong_links;
+	bool syn = !tipc_sk_type_connectionless(sk);
+	struct tipc_msg *hdr = &tsk->phdr;
 	struct tipc_name_seq *seq;
-	struct iov_iter save;
-	u32 mtu;
-	long timeo;
-	int rc;
+	struct sk_buff_head pkts;
+	u32 type, inst, domain;
+	u32 dnode, dport;
+	int mtu, rc;
 
-	if (dsz > TIPC_MAX_USER_MSG_SIZE)
+	if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
 		return -EMSGSIZE;
+
 	if (unlikely(!dest)) {
-		if (is_connectionless && tsk->peer.family == AF_TIPC)
-			dest = &tsk->peer;
-		else
+		dest = &tsk->peer;
+		if (!syn || dest->family != AF_TIPC)
 			return -EDESTADDRREQ;
-	} else if (unlikely(m->msg_namelen < sizeof(*dest)) ||
-		   dest->family != AF_TIPC) {
-		return -EINVAL;
 	}
-	if (!is_connectionless) {
+
+	if (unlikely(m->msg_namelen < sizeof(*dest)))
+		return -EINVAL;
+
+	if (unlikely(dest->family != AF_TIPC))
+		return -EINVAL;
+
+	if (unlikely(syn)) {
 		if (sk->sk_state == TIPC_LISTEN)
 			return -EPIPE;
 		if (sk->sk_state != TIPC_OPEN)
@@ -921,102 +953,62 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
 			tsk->conn_instance = dest->addr.name.name.instance;
 		}
 	}
-	seq = &dest->addr.nameseq;
-	timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
 
-	if (dest->addrtype == TIPC_ADDR_MCAST) {
-		return tipc_sendmcast(sock, seq, m, dsz, timeo);
-	} else if (dest->addrtype == TIPC_ADDR_NAME) {
-		u32 type = dest->addr.name.name.type;
-		u32 inst = dest->addr.name.name.instance;
-		u32 domain = dest->addr.name.domain;
+	seq = &dest->addr.nameseq;
+	if (dest->addrtype == TIPC_ADDR_MCAST)
+		return tipc_sendmcast(sock, seq, m, dlen, timeout);
 
+	if (dest->addrtype == TIPC_ADDR_NAME) {
+		type = dest->addr.name.name.type;
+		inst = dest->addr.name.name.instance;
+		domain = dest->addr.name.domain;
 		dnode = domain;
-		msg_set_type(mhdr, TIPC_NAMED_MSG);
-		msg_set_hdr_sz(mhdr, NAMED_H_SIZE);
-		msg_set_nametype(mhdr, type);
-		msg_set_nameinst(mhdr, inst);
-		msg_set_lookup_scope(mhdr, tipc_addr_scope(domain));
+		msg_set_type(hdr, TIPC_NAMED_MSG);
+		msg_set_hdr_sz(hdr, NAMED_H_SIZE);
+		msg_set_nametype(hdr, type);
+		msg_set_nameinst(hdr, inst);
+		msg_set_lookup_scope(hdr, tipc_addr_scope(domain));
 		dport = tipc_nametbl_translate(net, type, inst, &dnode);
-		msg_set_destnode(mhdr, dnode);
-		msg_set_destport(mhdr, dport);
+		msg_set_destnode(hdr, dnode);
+		msg_set_destport(hdr, dport);
 		if (unlikely(!dport && !dnode))
 			return -EHOSTUNREACH;
+
 	} else if (dest->addrtype == TIPC_ADDR_ID) {
 		dnode = dest->addr.id.node;
-		msg_set_type(mhdr, TIPC_DIRECT_MSG);
-		msg_set_lookup_scope(mhdr, 0);
-		msg_set_destnode(mhdr, dnode);
-		msg_set_destport(mhdr, dest->addr.id.ref);
-		msg_set_hdr_sz(mhdr, BASIC_H_SIZE);
+		msg_set_type(hdr, TIPC_DIRECT_MSG);
+		msg_set_lookup_scope(hdr, 0);
+		msg_set_destnode(hdr, dnode);
+		msg_set_destport(hdr, dest->addr.id.ref);
+		msg_set_hdr_sz(hdr, BASIC_H_SIZE);
 	}
 
-	skb_queue_head_init(&pktchain);
-	save = m->msg_iter;
-new_mtu:
-	mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
-	rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain);
-	if (rc < 0)
+	/* Block or return if destination link is congested */
+	rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode));
+	if (unlikely(rc))
 		return rc;
 
-	do {
-		skb = skb_peek(&pktchain);
-		TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
-		rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid);
-		if (likely(!rc)) {
-			if (!is_connectionless)
-				tipc_set_sk_state(sk, TIPC_CONNECTING);
-			return dsz;
-		}
-		if (rc == -ELINKCONG) {
-			tsk->link_cong = 1;
-			rc = tipc_wait_for_sndmsg(sock, &timeo);
-			if (!rc)
-				continue;
-		}
-		__skb_queue_purge(&pktchain);
-		if (rc == -EMSGSIZE) {
-			m->msg_iter = save;
-			goto new_mtu;
-		}
-		break;
-	} while (1);
-
-	return rc;
-}
+	skb_queue_head_init(&pkts);
+	mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
+	rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
+	if (unlikely(rc != dlen))
+		return rc;
 
-static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p)
-{
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-	struct sock *sk = sock->sk;
-	struct tipc_sock *tsk = tipc_sk(sk);
-	int done;
+	rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
+	if (unlikely(rc == -ELINKCONG)) {
+		u32_push(clinks, dnode);
+		tsk->cong_link_cnt++;
+		rc = 0;
+	}
 
-	do {
-		int err = sock_error(sk);
-		if (err)
-			return err;
-		if (sk->sk_state == TIPC_DISCONNECTING)
-			return -EPIPE;
-		else if (!tipc_sk_connected(sk))
-			return -ENOTCONN;
-		if (!*timeo_p)
-			return -EAGAIN;
-		if (signal_pending(current))
-			return sock_intr_errno(*timeo_p);
+	if (unlikely(syn && !rc))
+		tipc_set_sk_state(sk, TIPC_CONNECTING);
 
-		add_wait_queue(sk_sleep(sk), &wait);
-		done = sk_wait_event(sk, timeo_p,
-				     (!tsk->link_cong &&
-				      !tsk_conn_cong(tsk)) ||
-				      !tipc_sk_connected(sk), &wait);
-		remove_wait_queue(sk_sleep(sk), &wait);
-	} while (!done);
-	return 0;
+	return rc ? rc : dlen;
 }
 
 /**
- * tipc_send_stream - send stream-oriented data
+ * tipc_sendstream - send stream-oriented data
  * @sock: socket structure
  * @m: data to send
  * @dsz: total length of data to be transmitted
@@ -1026,94 +1018,69 @@ static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p)
  * Returns the number of bytes sent on success (or partial success),
  * or errno if no data sent
  */
-static int tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
+static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz)
 {
 	struct sock *sk = sock->sk;
 	int ret;
 
 	lock_sock(sk);
-	ret = __tipc_send_stream(sock, m, dsz);
+	ret = __tipc_sendstream(sock, m, dsz);
 	release_sock(sk);
 
 	return ret;
 }
 
-static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
+static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
 {
 	struct sock *sk = sock->sk;
-	struct net *net = sock_net(sk);
-	struct tipc_sock *tsk = tipc_sk(sk);
-	struct tipc_msg *mhdr = &tsk->phdr;
-	struct sk_buff_head pktchain;
 	DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
-	u32 portid = tsk->portid;
-	int rc = -EINVAL;
-	long timeo;
-	u32 dnode;
-	uint mtu, send, sent = 0;
-	struct iov_iter save;
-	int hlen = MIN_H_SIZE;
-
-	/* Handle implied connection establishment */
-	if (unlikely(dest)) {
-		rc = __tipc_sendmsg(sock, m, dsz);
-		hlen = msg_hdr_sz(mhdr);
-		if (dsz && (dsz == rc))
-			tsk->snt_unacked = tsk_inc(tsk, dsz + hlen);
-		return rc;
-	}
-	if (dsz > (uint)INT_MAX)
-		return -EMSGSIZE;
-
-	if (unlikely(!tipc_sk_connected(sk))) {
-		if (sk->sk_state == TIPC_DISCONNECTING)
-			return -EPIPE;
-		else
-			return -ENOTCONN;
-	}
+	long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
+	struct tipc_sock *tsk = tipc_sk(sk);
+	struct tipc_msg *hdr = &tsk->phdr;
+	struct net *net = sock_net(sk);
+	struct sk_buff_head pkts;
+	u32 dnode = tsk_peer_node(tsk);
+	int send, sent = 0;
+	int rc = 0;
 
-	timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
-	if (!timeo && tsk->link_cong)
-		return -ELINKCONG;
+	skb_queue_head_init(&pkts);
 
-	dnode = tsk_peer_node(tsk);
-	skb_queue_head_init(&pktchain);
+	if (unlikely(dlen > INT_MAX))
+		return -EMSGSIZE;
 
-next:
-	save = m->msg_iter;
-	mtu = tsk->max_pkt;
-	send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE);
-	rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain);
-	if (unlikely(rc < 0))
+	/* Handle implicit connection setup */
+	if (unlikely(dest)) {
+		rc = __tipc_sendmsg(sock, m, dlen);
+		if (dlen && (dlen == rc))
+			tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr));
 		return rc;
+	}
 
 	do {
-		if (likely(!tsk_conn_cong(tsk))) {
-			rc = tipc_node_xmit(net, &pktchain, dnode, portid);
-			if (likely(!rc)) {
-				tsk->snt_unacked += tsk_inc(tsk, send + hlen);
-				sent += send;
-				if (sent == dsz)
-					return dsz;
-				goto next;
-			}
-			if (rc == -EMSGSIZE) {
-				__skb_queue_purge(&pktchain);
-				tsk->max_pkt = tipc_node_get_mtu(net, dnode,
-								 portid);
-				m->msg_iter = save;
-				goto next;
-			}
-			if (rc != -ELINKCONG)
-				break;
+		rc = tipc_wait_for_cond(sock, &timeout,
+					(!tsk->cong_link_cnt &&
+					 !tsk_conn_cong(tsk) &&
+					 tipc_sk_connected(sk)));
+		if (unlikely(rc))
+			break;
 
-			tsk->link_cong = 1;
+		send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
+		rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts);
+		if (unlikely(rc != send))
+			break;
+
+		rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
+		if (unlikely(rc == -ELINKCONG)) {
+			tsk->cong_link_cnt = 1;
+			rc = 0;
 		}
-		rc = tipc_wait_for_sndpkt(sock, &timeo);
-	} while (!rc);
+		if (likely(!rc)) {
+			tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE);
+			sent += send;
+		}
+	} while (sent < dlen && !rc);
 
-	__skb_queue_purge(&pktchain);
-	return sent ? sent : rc;
+	return rc ? rc : sent;
 }
 
 /**
@@ -1131,7 +1098,7 @@ static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz)
 	if (dsz > TIPC_MAX_USER_MSG_SIZE)
 		return -EMSGSIZE;
 
-	return tipc_send_stream(sock, m, dsz);
+	return tipc_sendstream(sock, m, dsz);
 }
 
 /* tipc_sk_finish_conn - complete the setup of a connection
@@ -1698,6 +1665,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
 	unsigned int limit = rcvbuf_limit(sk, skb);
 	int err = TIPC_OK;
 	int usr = msg_user(hdr);
+	u32 onode;
 
 	if (unlikely(msg_user(hdr) == CONN_MANAGER)) {
 		tipc_sk_proto_rcv(tsk, skb, xmitq);
@@ -1705,8 +1673,10 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
 	}
 
 	if (unlikely(usr == SOCK_WAKEUP)) {
+		onode = msg_orignode(hdr);
 		kfree_skb(skb);
-		tsk->link_cong = 0;
+		u32_del(&tsk->cong_links, onode);
+		tsk->cong_link_cnt--;
 		sk->sk_write_space(sk);
 		return false;
 	}
@@ -2114,7 +2084,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
 		struct msghdr m = {NULL,};
 
 		tsk_advance_rx_queue(sk);
-		__tipc_send_stream(new_sock, &m, 0);
+		__tipc_sendstream(new_sock, &m, 0);
 	} else {
 		__skb_dequeue(&sk->sk_receive_queue);
 		__skb_queue_head(&new_sk->sk_receive_queue, buf);
@@ -2269,24 +2239,27 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
 void tipc_sk_reinit(struct net *net)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	const struct bucket_table *tbl;
-	struct rhash_head *pos;
+	struct rhashtable_iter iter;
 	struct tipc_sock *tsk;
 	struct tipc_msg *msg;
-	int i;
 
-	rcu_read_lock();
-	tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht);
-	for (i = 0; i < tbl->size; i++) {
-		rht_for_each_entry_rcu(tsk, pos, tbl, i, node) {
+	rhashtable_walk_enter(&tn->sk_rht, &iter);
+
+	do {
+		tsk = ERR_PTR(rhashtable_walk_start(&iter));
+		if (tsk)
+			continue;
+
+		while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
 			spin_lock_bh(&tsk->sk.sk_lock.slock);
 			msg = &tsk->phdr;
 			msg_set_prevnode(msg, tn->own_addr);
 			msg_set_orignode(msg, tn->own_addr);
 			spin_unlock_bh(&tsk->sk.sk_lock.slock);
 		}
-	}
-	rcu_read_unlock();
+
+		rhashtable_walk_stop(&iter);
+	} while (tsk == ERR_PTR(-EAGAIN));
 }
 
 static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)
@@ -2382,18 +2355,29 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
-	u32 value;
-	int res;
+	u32 value = 0;
+	int res = 0;
 
 	if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM))
 		return 0;
 	if (lvl != SOL_TIPC)
 		return -ENOPROTOOPT;
-	if (ol < sizeof(value))
-		return -EINVAL;
-	res = get_user(value, (u32 __user *)ov);
-	if (res)
-		return res;
+
+	switch (opt) {
+	case TIPC_IMPORTANCE:
+	case TIPC_SRC_DROPPABLE:
+	case TIPC_DEST_DROPPABLE:
+	case TIPC_CONN_TIMEOUT:
+		if (ol < sizeof(value))
+			return -EINVAL;
+		res = get_user(value, (u32 __user *)ov);
+		if (res)
+			return res;
+		break;
+	default:
+		if (ov || ol)
+			return -EINVAL;
+	}
 
 	lock_sock(sk);
 
@@ -2412,7 +2396,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 		break;
 	case TIPC_CONN_TIMEOUT:
 		tipc_sk(sk)->conn_timeout = value;
-		/* no need to set "res", since already 0 at this point */
+		break;
+	case TIPC_MCAST_BROADCAST:
+		tsk->mc_method.rcast = false;
+		tsk->mc_method.mandatory = true;
+		break;
+	case TIPC_MCAST_REPLICAST:
+		tsk->mc_method.rcast = true;
+		tsk->mc_method.mandatory = true;
 		break;
 	default:
 		res = -EINVAL;
@@ -2575,7 +2566,7 @@ static const struct proto_ops stream_ops = {
 	.shutdown	= tipc_shutdown,
 	.setsockopt	= tipc_setsockopt,
 	.getsockopt	= tipc_getsockopt,
-	.sendmsg	= tipc_send_stream,
+	.sendmsg	= tipc_sendstream,
 	.recvmsg	= tipc_recv_stream,
 	.mmap		= sock_no_mmap,
 	.sendpage	= sock_no_sendpage
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index b58dc95f3d35..46061cf48cd1 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -113,7 +113,7 @@ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
 	memcpy(addr->value, ua, sizeof(struct udp_media_addr));
 
 	if (tipc_udp_is_mcast_addr(ua))
-		addr->broadcast = 1;
+		addr->broadcast = TIPC_BROADCAST_SUPPORT;
 }
 
 /* tipc_udp_addr2str - convert ip/udp address to string */
@@ -229,7 +229,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
 		goto out;
 	}
 
-	if (!addr->broadcast || list_empty(&ub->rcast.list))
+	if (addr->broadcast != TIPC_REPLICAST_SUPPORT)
 		return tipc_udp_xmit(net, skb, ub, src, dst);
 
 	/* Replicast, send an skb to each configured IP address */
@@ -296,7 +296,7 @@ static int tipc_udp_rcast_add(struct tipc_bearer *b,
 	else if (ntohs(addr->proto) == ETH_P_IPV6)
 		pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6);
 #endif
-
+	b->bcast_addr.broadcast = TIPC_REPLICAST_SUPPORT;
 	list_add_rcu(&rcast->list, &ub->rcast.list);
 	return 0;
 }
@@ -681,7 +681,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 		goto err;
 
 	b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP;
-	b->bcast_addr.broadcast = 1;
+	b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
 	rcu_assign_pointer(b->media_ptr, ub);
 	rcu_assign_pointer(ub->bearer, b);
 	tipc_udp_media_addr_set(&b->addr, &local);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index cef79873b09d..e2d18b9f910f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -117,6 +117,7 @@
 #include <net/checksum.h>
 #include <linux/security.h>
 #include <linux/freezer.h>
+#include <linux/file.h>
 
 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 EXPORT_SYMBOL_GPL(unix_socket_table);
@@ -2592,6 +2593,43 @@ long unix_outq_len(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(unix_outq_len);
 
+static int unix_open_file(struct sock *sk)
+{
+	struct path path;
+	struct file *f;
+	int fd;
+
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	unix_state_lock(sk);
+	path = unix_sk(sk)->path;
+	if (!path.dentry) {
+		unix_state_unlock(sk);
+		return -ENOENT;
+	}
+
+	path_get(&path);
+	unix_state_unlock(sk);
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		goto out;
+
+	f = dentry_open(&path, O_PATH, current_cred());
+	if (IS_ERR(f)) {
+		put_unused_fd(fd);
+		fd = PTR_ERR(f);
+		goto out;
+	}
+
+	fd_install(fd, f);
+out:
+	path_put(&path);
+
+	return fd;
+}
+
 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
 	struct sock *sk = sock->sk;
@@ -2610,6 +2648,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		else
 			err = put_user(amount, (int __user *)arg);
 		break;
+	case SIOCUNIXFILE:
+		err = unix_open_file(sk);
+		break;
 	default:
 		err = -ENOIOCTLCMD;
 		break;
diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index 816c9331c8d2..d06e5015751a 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_WEXT_PRIV) += wext-priv.o
 
 cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o
 cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o
+cfg80211-$(CONFIG_OF) += of.o
 cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o
 cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o
 cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 158c59ecf90a..e55e05bc4805 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -626,7 +626,8 @@ int wiphy_register(struct wiphy *wiphy)
 
 	if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) &&
 		    (!rdev->ops->start_nan || !rdev->ops->stop_nan ||
-		     !rdev->ops->add_nan_func || !rdev->ops->del_nan_func)))
+		     !rdev->ops->add_nan_func || !rdev->ops->del_nan_func ||
+		     !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ)))))
 		return -EINVAL;
 
 #ifndef CONFIG_WIRELESS_WDS
@@ -1142,6 +1143,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 		     wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
 			dev->priv_flags |= IFF_DONT_BRIDGE;
 
+		INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk);
+
 		nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
 		break;
 	case NETDEV_GOING_DOWN:
@@ -1230,6 +1233,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 #ifdef CONFIG_CFG80211_WEXT
 			kzfree(wdev->wext.keys);
 #endif
+			flush_work(&wdev->disconnect_wk);
 		}
 		/*
 		 * synchronise (so that we won't find this netdev
diff --git a/net/wireless/core.h b/net/wireless/core.h
index af6e023020b1..58ca206982fe 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -228,6 +228,7 @@ struct cfg80211_event {
 			size_t resp_ie_len;
 			struct cfg80211_bss *bss;
 			int status; /* -1 = failed; 0..65535 = status code */
+			enum nl80211_timeout_reason timeout_reason;
 		} cr;
 		struct {
 			const u8 *req_ie;
@@ -388,7 +389,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 			       const u8 *req_ie, size_t req_ie_len,
 			       const u8 *resp_ie, size_t resp_ie_len,
 			       int status, bool wextev,
-			       struct cfg80211_bss *bss);
+			       struct cfg80211_bss *bss,
+			       enum nl80211_timeout_reason timeout_reason);
 void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 			     size_t ie_len, u16 reason, bool from_ap);
 int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
@@ -400,6 +402,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev,
 		       const u8 *resp_ie, size_t resp_ie_len);
 int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
 			      struct wireless_dev *wdev);
+void cfg80211_autodisconnect_wk(struct work_struct *work);
 
 /* SME implementation */
 void cfg80211_conn_work(struct work_struct *work);
@@ -430,6 +433,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
 void cfg80211_process_wdev_events(struct wireless_dev *wdev);
 
+bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
+				u32 center_freq_khz, u32 bw_khz);
+
 /**
  * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable
  * @wiphy: the wiphy to validate against
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index 5d453916a417..30fc6eb352bc 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -17,7 +17,7 @@
 static ssize_t name## _read(struct file *file, char __user *userbuf,	\
 			    size_t count, loff_t *ppos)			\
 {									\
-	struct wiphy *wiphy= file->private_data;		\
+	struct wiphy *wiphy = file->private_data;			\
 	char buf[buflen];						\
 	int res;							\
 									\
@@ -29,14 +29,14 @@ static const struct file_operations name## _ops = {			\
 	.read = name## _read,						\
 	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
-};
+}
 
 DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
-		      wiphy->rts_threshold)
+		      wiphy->rts_threshold);
 DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d",
 		      wiphy->frag_threshold);
 DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d",
-		      wiphy->retry_short)
+		      wiphy->retry_short);
 DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d",
 		      wiphy->retry_long);
 
@@ -103,7 +103,7 @@ static const struct file_operations ht40allow_map_ops = {
 };
 
 #define DEBUGFS_ADD(name)						\
-	debugfs_create_file(#name, S_IRUGO, phyd, &rdev->wiphy, &name## _ops);
+	debugfs_create_file(#name, 0444, phyd, &rdev->wiphy, &name## _ops)
 
 void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev)
 {
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 4646cf5695b9..22b3d9990065 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -48,7 +48,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss,
 	/* update current_bss etc., consumes the bss reference */
 	__cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs,
 				  status_code,
-				  status_code == WLAN_STATUS_SUCCESS, bss);
+				  status_code == WLAN_STATUS_SUCCESS, bss,
+				  NL80211_TIMEOUT_UNSPECIFIED);
 }
 EXPORT_SYMBOL(cfg80211_rx_assoc_resp);
 
@@ -345,6 +346,11 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
 	     !ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
 		return 0;
 
+	if (ether_addr_equal(wdev->disconnect_bssid, bssid) ||
+	    (wdev->current_bss &&
+	     ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
+		wdev->conn_owner_nlportid = 0;
+
 	return rdev_deauth(rdev, dev, &req);
 }
 
@@ -657,8 +663,25 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 			return err;
 	}
 
-	if (!ether_addr_equal(mgmt->sa, wdev_address(wdev)))
-		return -EINVAL;
+	if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) {
+		/* Allow random TA to be used with Public Action frames if the
+		 * driver has indicated support for this. Otherwise, only allow
+		 * the local address to be used.
+		 */
+		if (!ieee80211_is_action(mgmt->frame_control) ||
+		    mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
+			return -EINVAL;
+		if (!wdev->current_bss &&
+		    !wiphy_ext_feature_isset(
+			    &rdev->wiphy,
+			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA))
+			return -EINVAL;
+		if (wdev->current_bss &&
+		    !wiphy_ext_feature_isset(
+			    &rdev->wiphy,
+			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED))
+			return -EINVAL;
+	}
 
 	/* Transmit the Action frame as requested by user space */
 	return rdev_mgmt_tx(rdev, wdev, params, cookie);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5c1b267e22be..d7f8be4e321a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2015-2016	Intel Deutschland GmbH
+ * Copyright 2015-2017	Intel Deutschland GmbH
  */
 
 #include <linux/if.h>
@@ -398,13 +398,18 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	},
 	[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
 	[NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 },
-	[NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 },
+	[NL80211_ATTR_BANDS] = { .type = NLA_U32 },
 	[NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
 	[NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY,
 				    .len = FILS_MAX_KEK_LEN },
 	[NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN },
 	[NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, },
 	[NL80211_ATTR_BSSID] = { .len = ETH_ALEN },
+	[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 },
+	[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = {
+		.len = sizeof(struct nl80211_bss_select_rssi_adjust)
+	},
+	[NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 },
 };
 
 /* policy for the key attributes */
@@ -1881,6 +1886,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 			}
 		}
 
+		if (nla_put_u32(msg, NL80211_ATTR_BANDS,
+				rdev->wiphy.nan_supported_bands))
+			goto nla_put_failure;
+
 		/* done */
 		state->split_start = 0;
 		break;
@@ -3738,6 +3747,49 @@ static int nl80211_parse_beacon(struct nlattr *attrs[],
 	return 0;
 }
 
+static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
+					    const u8 *rates)
+{
+	int i;
+
+	if (!rates)
+		return;
+
+	for (i = 0; i < rates[1]; i++) {
+		if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY)
+			params->ht_required = true;
+		if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY)
+			params->vht_required = true;
+	}
+}
+
+/*
+ * Since the nl80211 API didn't include, from the beginning, attributes about
+ * HT/VHT requirements/capabilities, we parse them out of the IEs for the
+ * benefit of drivers that rebuild IEs in the firmware.
+ */
+static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
+{
+	const struct cfg80211_beacon_data *bcn = &params->beacon;
+	size_t ies_len = bcn->beacon_ies_len;
+	const u8 *ies = bcn->beacon_ies;
+	const u8 *rates;
+	const u8 *cap;
+
+	rates = cfg80211_find_ie(WLAN_EID_SUPP_RATES, ies, ies_len);
+	nl80211_check_ap_rate_selectors(params, rates);
+
+	rates = cfg80211_find_ie(WLAN_EID_EXT_SUPP_RATES, ies, ies_len);
+	nl80211_check_ap_rate_selectors(params, rates);
+
+	cap = cfg80211_find_ie(WLAN_EID_HT_CAPABILITY, ies, ies_len);
+	if (cap && cap[1] >= sizeof(*params->ht_cap))
+		params->ht_cap = (void *)(cap + 2);
+	cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len);
+	if (cap && cap[1] >= sizeof(*params->vht_cap))
+		params->vht_cap = (void *)(cap + 2);
+}
+
 static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
 				   struct cfg80211_ap_settings *params)
 {
@@ -3966,6 +4018,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			return PTR_ERR(params.acl);
 	}
 
+	nl80211_calculate_ap_params(&params);
+
 	wdev_lock(wdev);
 	err = rdev_start_ap(rdev, dev, &params);
 	if (!err) {
@@ -5916,6 +5970,7 @@ do {									    \
 			break;
 		}
 		cfg->ht_opmode = ht_opmode;
+		mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1));
 	}
 	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout,
 				  1, 65535, mask,
@@ -6790,13 +6845,10 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
 
 		/*
 		 * If scan plans are not specified,
-		 * %NL80211_ATTR_SCHED_SCAN_INTERVAL must be specified. In this
+		 * %NL80211_ATTR_SCHED_SCAN_INTERVAL will be specified. In this
 		 * case one scan plan will be set with the specified scan
 		 * interval and infinite number of iterations.
 		 */
-		if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
-			return -EINVAL;
-
 		interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]);
 		if (!interval)
 			return -EINVAL;
@@ -6865,7 +6917,7 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
 
 static struct cfg80211_sched_scan_request *
 nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
-			 struct nlattr **attrs)
+			 struct nlattr **attrs, int max_match_sets)
 {
 	struct cfg80211_sched_scan_request *request;
 	struct nlattr *attr;
@@ -6930,7 +6982,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
 	if (!n_match_sets && default_match_rssi != NL80211_SCAN_RSSI_THOLD_OFF)
 		n_match_sets = 1;
 
-	if (n_match_sets > wiphy->max_match_sets)
+	if (n_match_sets > max_match_sets)
 		return ERR_PTR(-EINVAL);
 
 	if (attrs[NL80211_ATTR_IE])
@@ -6968,6 +7020,12 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
 	if (!n_plans || n_plans > wiphy->max_sched_scan_plans)
 		return ERR_PTR(-EINVAL);
 
+	if (!wiphy_ext_feature_isset(
+		    wiphy, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI) &&
+	    (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] ||
+	     attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]))
+		return ERR_PTR(-EINVAL);
+
 	request = kzalloc(sizeof(*request)
 			+ sizeof(*request->ssids) * n_ssids
 			+ sizeof(*request->match_sets) * n_match_sets
@@ -7174,6 +7232,26 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
 		request->delay =
 			nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]);
 
+	if (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]) {
+		request->relative_rssi = nla_get_s8(
+			attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]);
+		request->relative_rssi_set = true;
+	}
+
+	if (request->relative_rssi_set &&
+	    attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]) {
+		struct nl80211_bss_select_rssi_adjust *rssi_adjust;
+
+		rssi_adjust = nla_data(
+			attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]);
+		request->rssi_adjust.band = rssi_adjust->band;
+		request->rssi_adjust.delta = rssi_adjust->delta;
+		if (!is_band_valid(wiphy, request->rssi_adjust.band)) {
+			err = -EINVAL;
+			goto out_free;
+		}
+	}
+
 	err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs);
 	if (err)
 		goto out_free;
@@ -7204,7 +7282,8 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
 		return -EINPROGRESS;
 
 	sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev,
-						  info->attrs);
+						  info->attrs,
+						  rdev->wiphy.max_match_sets);
 
 	err = PTR_ERR_OR_ZERO(sched_scan_req);
 	if (err)
@@ -8068,8 +8147,17 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 	err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
 	if (!err) {
 		wdev_lock(dev->ieee80211_ptr);
+
 		err = cfg80211_mlme_assoc(rdev, dev, chan, bssid,
 					  ssid, ssid_len, &req);
+
+		if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
+			dev->ieee80211_ptr->conn_owner_nlportid =
+				info->snd_portid;
+			memcpy(dev->ieee80211_ptr->disconnect_bssid,
+			       bssid, ETH_ALEN);
+		}
+
 		wdev_unlock(dev->ieee80211_ptr);
 	}
 
@@ -8548,6 +8636,12 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
 		 * so we need to offset by 1.
 		 */
 		phy_idx = cb->args[0] - 1;
+
+		rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
+		if (!rdev) {
+			err = -ENOENT;
+			goto out_err;
+		}
 	} else {
 		struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
 
@@ -8562,7 +8656,6 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
 			goto out_err;
 		}
 		phy_idx = rdev->wiphy_idx;
-		rdev = NULL;
 
 		if (attrbuf[NL80211_ATTR_TESTDATA])
 			cb->args[1] = (long)attrbuf[NL80211_ATTR_TESTDATA];
@@ -8573,12 +8666,6 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
 		data_len = nla_len((void *)cb->args[1]);
 	}
 
-	rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
-	if (!rdev) {
-		err = -ENOENT;
-		goto out_err;
-	}
-
 	if (!rdev->ops->testmode_dump) {
 		err = -EOPNOTSUPP;
 		goto out_err;
@@ -8788,11 +8875,24 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	wdev_lock(dev->ieee80211_ptr);
+
 	err = cfg80211_connect(rdev, dev, &connect, connkeys,
 			       connect.prev_bssid);
-	wdev_unlock(dev->ieee80211_ptr);
 	if (err)
 		kzfree(connkeys);
+
+	if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
+		dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
+		if (connect.bssid)
+			memcpy(dev->ieee80211_ptr->disconnect_bssid,
+			       connect.bssid, ETH_ALEN);
+		else
+			memset(dev->ieee80211_ptr->disconnect_bssid,
+			       0, ETH_ALEN);
+	}
+
+	wdev_unlock(dev->ieee80211_ptr);
+
 	return err;
 }
 
@@ -9379,6 +9479,7 @@ nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] = {
 	[NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 },
 	[NL80211_ATTR_CQM_TXE_PKTS] = { .type = NLA_U32 },
 	[NL80211_ATTR_CQM_TXE_INTVL] = { .type = NLA_U32 },
+	[NL80211_ATTR_CQM_RSSI_LEVEL] = { .type = NLA_S32 },
 };
 
 static int nl80211_set_cqm_txe(struct genl_info *info,
@@ -9688,6 +9789,20 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
 	if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay))
 		return -ENOBUFS;
 
+	if (req->relative_rssi_set) {
+		struct nl80211_bss_select_rssi_adjust rssi_adjust;
+
+		if (nla_put_s8(msg, NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI,
+			       req->relative_rssi))
+			return -ENOBUFS;
+
+		rssi_adjust.band = req->rssi_adjust.band;
+		rssi_adjust.delta = req->rssi_adjust.delta;
+		if (nla_put(msg, NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST,
+			    sizeof(rssi_adjust), &rssi_adjust))
+			return -ENOBUFS;
+	}
+
 	freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES);
 	if (!freqs)
 		return -ENOBUFS;
@@ -9981,7 +10096,8 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev,
 	if (err)
 		goto out;
 
-	trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb);
+	trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb,
+						   wowlan->max_nd_match_sets);
 	err = PTR_ERR_OR_ZERO(trig->nd_config);
 	if (err)
 		trig->nd_config = NULL;
@@ -10666,15 +10782,22 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
 	if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
 		return -EINVAL;
 
-	if (!info->attrs[NL80211_ATTR_NAN_DUAL])
-		return -EINVAL;
-
 	conf.master_pref =
 		nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
 	if (!conf.master_pref)
 		return -EINVAL;
 
-	conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
+	if (info->attrs[NL80211_ATTR_BANDS]) {
+		u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
+
+		if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
+			return -EOPNOTSUPP;
+
+		if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
+			return -EINVAL;
+
+		conf.bands = bands;
+	}
 
 	err = rdev_start_nan(rdev, wdev, &conf);
 	if (err)
@@ -11039,9 +11162,17 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
 		changed |= CFG80211_NAN_CONF_CHANGED_PREF;
 	}
 
-	if (info->attrs[NL80211_ATTR_NAN_DUAL]) {
-		conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
-		changed |= CFG80211_NAN_CONF_CHANGED_DUAL;
+	if (info->attrs[NL80211_ATTR_BANDS]) {
+		u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
+
+		if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
+			return -EOPNOTSUPP;
+
+		if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
+			return -EINVAL;
+
+		conf.bands = bands;
+		changed |= CFG80211_NAN_CONF_CHANGED_BANDS;
 	}
 
 	if (!changed)
@@ -11822,9 +11953,6 @@ static int nl80211_set_multicast_to_unicast(struct sk_buff *skb,
 	const struct nlattr *nla;
 	bool enabled;
 
-	if (netif_running(dev))
-		return -EBUSY;
-
 	if (!rdev->ops->set_multicast_to_unicast)
 		return -EOPNOTSUPP;
 
@@ -12825,7 +12953,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg,
 	return -ENOBUFS;
 }
 
-static int nl80211_send_scan_msg(struct sk_buff *msg,
+static int nl80211_prep_scan_msg(struct sk_buff *msg,
 				 struct cfg80211_registered_device *rdev,
 				 struct wireless_dev *wdev,
 				 u32 portid, u32 seq, int flags,
@@ -12856,7 +12984,7 @@ static int nl80211_send_scan_msg(struct sk_buff *msg,
 }
 
 static int
-nl80211_send_sched_scan_msg(struct sk_buff *msg,
+nl80211_prep_sched_scan_msg(struct sk_buff *msg,
 			    struct cfg80211_registered_device *rdev,
 			    struct net_device *netdev,
 			    u32 portid, u32 seq, int flags, u32 cmd)
@@ -12888,7 +13016,7 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
 	if (!msg)
 		return;
 
-	if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0,
+	if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0,
 				  NL80211_CMD_TRIGGER_SCAN) < 0) {
 		nlmsg_free(msg);
 		return;
@@ -12907,7 +13035,7 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
 	if (!msg)
 		return NULL;
 
-	if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0,
+	if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0,
 				  aborted ? NL80211_CMD_SCAN_ABORTED :
 					    NL80211_CMD_NEW_SCAN_RESULTS) < 0) {
 		nlmsg_free(msg);
@@ -12917,8 +13045,9 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
 	return msg;
 }
 
-void nl80211_send_scan_result(struct cfg80211_registered_device *rdev,
-			      struct sk_buff *msg)
+/* send message created by nl80211_build_scan_msg() */
+void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
+			   struct sk_buff *msg)
 {
 	if (!msg)
 		return;
@@ -12927,25 +13056,6 @@ void nl80211_send_scan_result(struct cfg80211_registered_device *rdev,
 				NL80211_MCGRP_SCAN, GFP_KERNEL);
 }
 
-void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
-				     struct net_device *netdev)
-{
-	struct sk_buff *msg;
-
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return;
-
-	if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0,
-					NL80211_CMD_SCHED_SCAN_RESULTS) < 0) {
-		nlmsg_free(msg);
-		return;
-	}
-
-	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				NL80211_MCGRP_SCAN, GFP_KERNEL);
-}
-
 void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
 			     struct net_device *netdev, u32 cmd)
 {
@@ -12955,7 +13065,7 @@ void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
 	if (!msg)
 		return;
 
-	if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) {
+	if (nl80211_prep_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) {
 		nlmsg_free(msg);
 		return;
 	}
@@ -13057,7 +13167,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
 	struct sk_buff *msg;
 	void *hdr;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + len, gfp);
 	if (!msg)
 		return;
 
@@ -13204,12 +13314,14 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 				 struct net_device *netdev, const u8 *bssid,
 				 const u8 *req_ie, size_t req_ie_len,
 				 const u8 *resp_ie, size_t resp_ie_len,
-				 int status, gfp_t gfp)
+				 int status,
+				 enum nl80211_timeout_reason timeout_reason,
+				 gfp_t gfp)
 {
 	struct sk_buff *msg;
 	void *hdr;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp);
 	if (!msg)
 		return;
 
@@ -13225,7 +13337,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 	    nla_put_u16(msg, NL80211_ATTR_STATUS_CODE,
 			status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE :
 			status) ||
-	    (status < 0 && nla_put_flag(msg, NL80211_ATTR_TIMED_OUT)) ||
+	    (status < 0 &&
+	     (nla_put_flag(msg, NL80211_ATTR_TIMED_OUT) ||
+	      nla_put_u32(msg, NL80211_ATTR_TIMEOUT_REASON, timeout_reason))) ||
 	    (req_ie &&
 	     nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) ||
 	    (resp_ie &&
@@ -13251,7 +13365,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 	struct sk_buff *msg;
 	void *hdr;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp);
 	if (!msg)
 		return;
 
@@ -13288,7 +13402,7 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
 	struct sk_buff *msg;
 	void *hdr;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	msg = nlmsg_new(100 + ie_len, GFP_KERNEL);
 	if (!msg)
 		return;
 
@@ -13364,7 +13478,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
 
 	trace_cfg80211_notify_new_peer_candidate(dev, addr);
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + ie_len, gfp);
 	if (!msg)
 		return;
 
@@ -13735,7 +13849,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 	struct sk_buff *msg;
 	void *hdr;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + len, gfp);
 	if (!msg)
 		return -ENOMEM;
 
@@ -13779,7 +13893,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
 
 	trace_cfg80211_mgmt_tx_status(wdev, cookie, ack);
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	msg = nlmsg_new(100 + len, gfp);
 	if (!msg)
 		return;
 
@@ -13866,11 +13980,11 @@ static void cfg80211_send_cqm(struct sk_buff *msg, gfp_t gfp)
 
 void cfg80211_cqm_rssi_notify(struct net_device *dev,
 			      enum nl80211_cqm_rssi_threshold_event rssi_event,
-			      gfp_t gfp)
+			      s32 rssi_level, gfp_t gfp)
 {
 	struct sk_buff *msg;
 
-	trace_cfg80211_cqm_rssi_notify(dev, rssi_event);
+	trace_cfg80211_cqm_rssi_notify(dev, rssi_event, rssi_level);
 
 	if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW &&
 		    rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH))
@@ -13884,6 +13998,10 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev,
 			rssi_event))
 		goto nla_put_failure;
 
+	if (rssi_level && nla_put_s32(msg, NL80211_ATTR_CQM_RSSI_LEVEL,
+				      rssi_level))
+		goto nla_put_failure;
+
 	cfg80211_send_cqm(msg, gfp);
 
 	return;
@@ -14534,6 +14652,8 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
 
 			if (wdev->owner_nlportid == notify->portid)
 				schedule_destroy_work = true;
+			else if (wdev->conn_owner_nlportid == notify->portid)
+				schedule_work(&wdev->disconnect_wk);
 		}
 
 		spin_lock_bh(&rdev->beacon_registrations_lock);
@@ -14588,7 +14708,7 @@ void cfg80211_ft_event(struct net_device *netdev,
 	if (!ft_event->target_ap)
 		return;
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	msg = nlmsg_new(100 + ft_event->ric_ies_len, GFP_KERNEL);
 	if (!msg)
 		return;
 
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 7e3821d7fcc5..e488dca87423 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -14,12 +14,10 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
 			     struct wireless_dev *wdev);
 struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
 				       struct wireless_dev *wdev, bool aborted);
-void nl80211_send_scan_result(struct cfg80211_registered_device *rdev,
-			      struct sk_buff *msg);
+void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
+			   struct sk_buff *msg);
 void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
 			     struct net_device *netdev, u32 cmd);
-void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
-				     struct net_device *netdev);
 void nl80211_common_reg_change_event(enum nl80211_commands cmd_id,
 				     struct regulatory_request *request);
 
@@ -58,7 +56,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 				 struct net_device *netdev, const u8 *bssid,
 				 const u8 *req_ie, size_t req_ie_len,
 				 const u8 *resp_ie, size_t resp_ie_len,
-				 int status, gfp_t gfp);
+				 int status,
+				 enum nl80211_timeout_reason timeout_reason,
+				 gfp_t gfp);
 void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 			 struct net_device *netdev, const u8 *bssid,
 			 const u8 *req_ie, size_t req_ie_len,
diff --git a/net/wireless/of.c b/net/wireless/of.c
new file mode 100644
index 000000000000..de221f0edca5
--- /dev/null
+++ b/net/wireless/of.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2017 Rafał Miłecki <rafal@milecki.pl>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/of.h>
+#include <net/cfg80211.h>
+#include "core.h"
+
+static bool wiphy_freq_limits_valid_chan(struct wiphy *wiphy,
+					 struct ieee80211_freq_range *freq_limits,
+					 unsigned int n_freq_limits,
+					 struct ieee80211_channel *chan)
+{
+	u32 bw = MHZ_TO_KHZ(20);
+	int i;
+
+	for (i = 0; i < n_freq_limits; i++) {
+		struct ieee80211_freq_range *limit = &freq_limits[i];
+
+		if (cfg80211_does_bw_fit_range(limit,
+					       MHZ_TO_KHZ(chan->center_freq),
+					       bw))
+			return true;
+	}
+
+	return false;
+}
+
+static void wiphy_freq_limits_apply(struct wiphy *wiphy,
+				    struct ieee80211_freq_range *freq_limits,
+				    unsigned int n_freq_limits)
+{
+	enum nl80211_band band;
+	int i;
+
+	if (WARN_ON(!n_freq_limits))
+		return;
+
+	for (band = 0; band < NUM_NL80211_BANDS; band++) {
+		struct ieee80211_supported_band *sband = wiphy->bands[band];
+
+		if (!sband)
+			continue;
+
+		for (i = 0; i < sband->n_channels; i++) {
+			struct ieee80211_channel *chan = &sband->channels[i];
+
+			if (chan->flags & IEEE80211_CHAN_DISABLED)
+				continue;
+
+			if (!wiphy_freq_limits_valid_chan(wiphy, freq_limits,
+							  n_freq_limits,
+							  chan)) {
+				pr_debug("Disabling freq %d MHz as it's out of OF limits\n",
+					 chan->center_freq);
+				chan->flags |= IEEE80211_CHAN_DISABLED;
+			}
+		}
+	}
+}
+
+void wiphy_read_of_freq_limits(struct wiphy *wiphy)
+{
+	struct device *dev = wiphy_dev(wiphy);
+	struct device_node *np;
+	struct property *prop;
+	struct ieee80211_freq_range *freq_limits;
+	unsigned int n_freq_limits;
+	const __be32 *p;
+	int len, i;
+	int err = 0;
+
+	if (!dev)
+		return;
+	np = dev_of_node(dev);
+	if (!np)
+		return;
+
+	prop = of_find_property(np, "ieee80211-freq-limit", &len);
+	if (!prop)
+		return;
+
+	if (!len || len % sizeof(u32) || len / sizeof(u32) % 2) {
+		dev_err(dev, "ieee80211-freq-limit wrong format");
+		return;
+	}
+	n_freq_limits = len / sizeof(u32) / 2;
+
+	freq_limits = kcalloc(n_freq_limits, sizeof(*freq_limits), GFP_KERNEL);
+	if (!freq_limits) {
+		err = -ENOMEM;
+		goto out_kfree;
+	}
+
+	p = NULL;
+	for (i = 0; i < n_freq_limits; i++) {
+		struct ieee80211_freq_range *limit = &freq_limits[i];
+
+		p = of_prop_next_u32(prop, p, &limit->start_freq_khz);
+		if (!p) {
+			err = -EINVAL;
+			goto out_kfree;
+		}
+
+		p = of_prop_next_u32(prop, p, &limit->end_freq_khz);
+		if (!p) {
+			err = -EINVAL;
+			goto out_kfree;
+		}
+
+		if (!limit->start_freq_khz ||
+		    !limit->end_freq_khz ||
+		    limit->start_freq_khz >= limit->end_freq_khz) {
+			err = -EINVAL;
+			goto out_kfree;
+		}
+	}
+
+	wiphy_freq_limits_apply(wiphy, freq_limits, n_freq_limits);
+
+out_kfree:
+	kfree(freq_limits);
+	if (err)
+		dev_err(dev, "Failed to get limits: %d\n", err);
+}
+EXPORT_SYMBOL(wiphy_read_of_freq_limits);
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 5dbac3749738..753efcd51fa3 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -748,21 +748,6 @@ static bool is_valid_rd(const struct ieee80211_regdomain *rd)
 	return true;
 }
 
-static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range,
-			    u32 center_freq_khz, u32 bw_khz)
-{
-	u32 start_freq_khz, end_freq_khz;
-
-	start_freq_khz = center_freq_khz - (bw_khz/2);
-	end_freq_khz = center_freq_khz + (bw_khz/2);
-
-	if (start_freq_khz >= freq_range->start_freq_khz &&
-	    end_freq_khz <= freq_range->end_freq_khz)
-		return true;
-
-	return false;
-}
-
 /**
  * freq_in_rule_band - tells us if a frequency is in a frequency band
  * @freq_range: frequency rule we want to query
@@ -1070,7 +1055,7 @@ freq_reg_info_regd(u32 center_freq,
 		if (!band_rule_found)
 			band_rule_found = freq_in_rule_band(fr, center_freq);
 
-		bw_fits = reg_does_bw_fit(fr, center_freq, bw);
+		bw_fits = cfg80211_does_bw_fit_range(fr, center_freq, bw);
 
 		if (band_rule_found && bw_fits)
 			return rr;
@@ -1138,11 +1123,13 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd
 		max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
 
 	/* If we get a reg_rule we can assume that at least 5Mhz fit */
-	if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
-			     MHZ_TO_KHZ(10)))
+	if (!cfg80211_does_bw_fit_range(freq_range,
+					MHZ_TO_KHZ(chan->center_freq),
+					MHZ_TO_KHZ(10)))
 		bw_flags |= IEEE80211_CHAN_NO_10MHZ;
-	if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
-			     MHZ_TO_KHZ(20)))
+	if (!cfg80211_does_bw_fit_range(freq_range,
+					MHZ_TO_KHZ(chan->center_freq),
+					MHZ_TO_KHZ(20)))
 		bw_flags |= IEEE80211_CHAN_NO_20MHZ;
 
 	if (max_bandwidth_khz < MHZ_TO_KHZ(10))
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 35ad69fd0838..21be56b3128e 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -227,7 +227,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
 	ASSERT_RTNL();
 
 	if (rdev->scan_msg) {
-		nl80211_send_scan_result(rdev, rdev->scan_msg);
+		nl80211_send_scan_msg(rdev, rdev->scan_msg);
 		rdev->scan_msg = NULL;
 		return;
 	}
@@ -273,7 +273,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
 	if (!send_message)
 		rdev->scan_msg = msg;
 	else
-		nl80211_send_scan_result(rdev, msg);
+		nl80211_send_scan_msg(rdev, msg);
 }
 
 void __cfg80211_scan_done(struct work_struct *wk)
@@ -321,7 +321,8 @@ void __cfg80211_sched_scan_results(struct work_struct *wk)
 			spin_unlock_bh(&rdev->bss_lock);
 			request->scan_start = jiffies;
 		}
-		nl80211_send_sched_scan_results(rdev, request->dev);
+		nl80211_send_sched_scan(rdev, request->dev,
+					NL80211_CMD_SCHED_SCAN_RESULTS);
 	}
 
 	rtnl_unlock();
@@ -1147,7 +1148,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
 	else
 		rcu_assign_pointer(tmp.pub.beacon_ies, ies);
 	rcu_assign_pointer(tmp.pub.ies, ies);
-	
+
 	memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN);
 	tmp.pub.channel = channel;
 	tmp.pub.scan_width = data->scan_width;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 5e0d19380302..b347e63d7aaa 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -34,10 +34,11 @@ struct cfg80211_conn {
 		CFG80211_CONN_SCAN_AGAIN,
 		CFG80211_CONN_AUTHENTICATE_NEXT,
 		CFG80211_CONN_AUTHENTICATING,
-		CFG80211_CONN_AUTH_FAILED,
+		CFG80211_CONN_AUTH_FAILED_TIMEOUT,
 		CFG80211_CONN_ASSOCIATE_NEXT,
 		CFG80211_CONN_ASSOCIATING,
 		CFG80211_CONN_ASSOC_FAILED,
+		CFG80211_CONN_ASSOC_FAILED_TIMEOUT,
 		CFG80211_CONN_DEAUTH,
 		CFG80211_CONN_ABANDON,
 		CFG80211_CONN_CONNECTED,
@@ -140,7 +141,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
 	return err;
 }
 
-static int cfg80211_conn_do_work(struct wireless_dev *wdev)
+static int cfg80211_conn_do_work(struct wireless_dev *wdev,
+				 enum nl80211_timeout_reason *treason)
 {
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
 	struct cfg80211_connect_params *params;
@@ -171,7 +173,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
 					  NULL, 0,
 					  params->key, params->key_len,
 					  params->key_idx, NULL, 0);
-	case CFG80211_CONN_AUTH_FAILED:
+	case CFG80211_CONN_AUTH_FAILED_TIMEOUT:
+		*treason = NL80211_TIMEOUT_AUTH;
 		return -ENOTCONN;
 	case CFG80211_CONN_ASSOCIATE_NEXT:
 		if (WARN_ON(!rdev->ops->assoc))
@@ -198,6 +201,9 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
 					     WLAN_REASON_DEAUTH_LEAVING,
 					     false);
 		return err;
+	case CFG80211_CONN_ASSOC_FAILED_TIMEOUT:
+		*treason = NL80211_TIMEOUT_ASSOC;
+		/* fall through */
 	case CFG80211_CONN_ASSOC_FAILED:
 		cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid,
 				     NULL, 0,
@@ -223,6 +229,7 @@ void cfg80211_conn_work(struct work_struct *work)
 		container_of(work, struct cfg80211_registered_device, conn_work);
 	struct wireless_dev *wdev;
 	u8 bssid_buf[ETH_ALEN], *bssid = NULL;
+	enum nl80211_timeout_reason treason;
 
 	rtnl_lock();
 
@@ -244,10 +251,12 @@ void cfg80211_conn_work(struct work_struct *work)
 			memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN);
 			bssid = bssid_buf;
 		}
-		if (cfg80211_conn_do_work(wdev)) {
+		treason = NL80211_TIMEOUT_UNSPECIFIED;
+		if (cfg80211_conn_do_work(wdev, &treason)) {
 			__cfg80211_connect_result(
 					wdev->netdev, bssid,
-					NULL, 0, NULL, 0, -1, false, NULL);
+					NULL, 0, NULL, 0, -1, false, NULL,
+					treason);
 		}
 		wdev_unlock(wdev);
 	}
@@ -352,7 +361,8 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len)
 	} else if (status_code != WLAN_STATUS_SUCCESS) {
 		__cfg80211_connect_result(wdev->netdev, mgmt->bssid,
 					  NULL, 0, NULL, 0,
-					  status_code, false, NULL);
+					  status_code, false, NULL,
+					  NL80211_TIMEOUT_UNSPECIFIED);
 	} else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) {
 		wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT;
 		schedule_work(&rdev->conn_work);
@@ -400,7 +410,7 @@ void cfg80211_sme_auth_timeout(struct wireless_dev *wdev)
 	if (!wdev->conn)
 		return;
 
-	wdev->conn->state = CFG80211_CONN_AUTH_FAILED;
+	wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT;
 	schedule_work(&rdev->conn_work);
 }
 
@@ -422,7 +432,7 @@ void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev)
 	if (!wdev->conn)
 		return;
 
-	wdev->conn->state = CFG80211_CONN_ASSOC_FAILED;
+	wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT;
 	schedule_work(&rdev->conn_work);
 }
 
@@ -564,7 +574,9 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev,
 
 	/* we're good if we have a matching bss struct */
 	if (bss) {
-		err = cfg80211_conn_do_work(wdev);
+		enum nl80211_timeout_reason treason;
+
+		err = cfg80211_conn_do_work(wdev, &treason);
 		cfg80211_put_bss(wdev->wiphy, bss);
 	} else {
 		/* otherwise we'll need to scan for the AP first */
@@ -661,7 +673,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 			       const u8 *req_ie, size_t req_ie_len,
 			       const u8 *resp_ie, size_t resp_ie_len,
 			       int status, bool wextev,
-			       struct cfg80211_bss *bss)
+			       struct cfg80211_bss *bss,
+			       enum nl80211_timeout_reason timeout_reason)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	const u8 *country_ie;
@@ -680,7 +693,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 	nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev,
 				    bssid, req_ie, req_ie_len,
 				    resp_ie, resp_ie_len,
-				    status, GFP_KERNEL);
+				    status, timeout_reason, GFP_KERNEL);
 
 #ifdef CONFIG_CFG80211_WEXT
 	if (wextev) {
@@ -727,6 +740,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 		kzfree(wdev->connect_keys);
 		wdev->connect_keys = NULL;
 		wdev->ssid_len = 0;
+		wdev->conn_owner_nlportid = 0;
 		if (bss) {
 			cfg80211_unhold_bss(bss_from_pub(bss));
 			cfg80211_put_bss(wdev->wiphy, bss);
@@ -770,7 +784,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
 			  struct cfg80211_bss *bss, const u8 *req_ie,
 			  size_t req_ie_len, const u8 *resp_ie,
-			  size_t resp_ie_len, int status, gfp_t gfp)
+			  size_t resp_ie_len, int status, gfp_t gfp,
+			  enum nl80211_timeout_reason timeout_reason)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -810,6 +825,7 @@ void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
 		cfg80211_hold_bss(bss_from_pub(bss));
 	ev->cr.bss = bss;
 	ev->cr.status = status;
+	ev->cr.timeout_reason = timeout_reason;
 
 	spin_lock_irqsave(&wdev->event_lock, flags);
 	list_add_tail(&ev->list, &wdev->event_list);
@@ -955,6 +971,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 
 	wdev->current_bss = NULL;
 	wdev->ssid_len = 0;
+	wdev->conn_owner_nlportid = 0;
 
 	nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
 
@@ -1098,6 +1115,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
 	kzfree(wdev->connect_keys);
 	wdev->connect_keys = NULL;
 
+	wdev->conn_owner_nlportid = 0;
+
 	if (wdev->conn)
 		err = cfg80211_sme_disconnect(wdev, reason);
 	else if (!rdev->ops->disconnect)
@@ -1107,3 +1126,32 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
 
 	return err;
 }
+
+/*
+ * Used to clean up after the connection / connection attempt owner socket
+ * disconnects
+ */
+void cfg80211_autodisconnect_wk(struct work_struct *work)
+{
+	struct wireless_dev *wdev =
+		container_of(work, struct wireless_dev, disconnect_wk);
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+
+	wdev_lock(wdev);
+
+	if (wdev->conn_owner_nlportid) {
+		/*
+		 * Use disconnect_bssid if still connecting and ops->disconnect
+		 * not implemented.  Otherwise we can use cfg80211_disconnect.
+		 */
+		if (rdev->ops->disconnect || wdev->current_bss)
+			cfg80211_disconnect(rdev, wdev->netdev,
+					    WLAN_REASON_DEAUTH_LEAVING, true);
+		else
+			cfg80211_mlme_deauth(rdev, wdev->netdev,
+					     wdev->disconnect_bssid, NULL, 0,
+					     WLAN_REASON_DEAUTH_LEAVING, false);
+	}
+
+	wdev_unlock(wdev);
+}
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 14b3f007826d..16b6b5988be9 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -39,9 +39,11 @@ SHOW_FMT(address_mask, "%pM", wiphy.addr_mask);
 
 static ssize_t name_show(struct device *dev,
 			 struct device_attribute *attr,
-			 char *buf) {
+			 char *buf)
+{
 	struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy;
-	return sprintf(buf, "%s\n", dev_name(&wiphy->dev));
+
+	return sprintf(buf, "%s\n", wiphy_name(wiphy));
 }
 static DEVICE_ATTR_RO(name);
 
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index ea1b47e04fa4..776e80cef9b4 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1915,18 +1915,18 @@ TRACE_EVENT(rdev_start_nan,
 		WIPHY_ENTRY
 		WDEV_ENTRY
 		__field(u8, master_pref)
-		__field(u8, dual);
+		__field(u8, bands);
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		WDEV_ASSIGN;
 		__entry->master_pref = conf->master_pref;
-		__entry->dual = conf->dual;
+		__entry->bands = conf->bands;
 	),
 	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
-		  ", master preference: %u, dual: %d",
+		  ", master preference: %u, bands: 0x%0x",
 		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
-		  __entry->dual)
+		  __entry->bands)
 );
 
 TRACE_EVENT(rdev_nan_change_conf,
@@ -1937,20 +1937,20 @@ TRACE_EVENT(rdev_nan_change_conf,
 		WIPHY_ENTRY
 		WDEV_ENTRY
 		__field(u8, master_pref)
-		__field(u8, dual);
+		__field(u8, bands);
 		__field(u32, changes);
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		WDEV_ASSIGN;
 		__entry->master_pref = conf->master_pref;
-		__entry->dual = conf->dual;
+		__entry->bands = conf->bands;
 		__entry->changes = changes;
 	),
 	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
-		  ", master preference: %u, dual: %d, changes: %x",
+		  ", master preference: %u, bands: 0x%0x, changes: %x",
 		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
-		  __entry->dual, __entry->changes)
+		  __entry->bands, __entry->changes)
 );
 
 DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
@@ -2490,18 +2490,21 @@ TRACE_EVENT(cfg80211_mgmt_tx_status,
 
 TRACE_EVENT(cfg80211_cqm_rssi_notify,
 	TP_PROTO(struct net_device *netdev,
-		 enum nl80211_cqm_rssi_threshold_event rssi_event),
-	TP_ARGS(netdev, rssi_event),
+		 enum nl80211_cqm_rssi_threshold_event rssi_event,
+		 s32 rssi_level),
+	TP_ARGS(netdev, rssi_event, rssi_level),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
 		__field(enum nl80211_cqm_rssi_threshold_event, rssi_event)
+		__field(s32, rssi_level)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
 		__entry->rssi_event = rssi_event;
+		__entry->rssi_level = rssi_level;
 	),
-	TP_printk(NETDEV_PR_FMT ", rssi event: %d",
-		  NETDEV_PR_ARG, __entry->rssi_event)
+	TP_printk(NETDEV_PR_FMT ", rssi event: %d, level: %d",
+		  NETDEV_PR_ARG, __entry->rssi_event, __entry->rssi_level)
 );
 
 TRACE_EVENT(cfg80211_reg_can_beacon,
diff --git a/net/wireless/util.c b/net/wireless/util.c
index e9d040d29846..68e5f2ecee1a 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -114,8 +114,7 @@ int ieee80211_frequency_to_channel(int freq)
 }
 EXPORT_SYMBOL(ieee80211_frequency_to_channel);
 
-struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
-						  int freq)
+struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq)
 {
 	enum nl80211_band band;
 	struct ieee80211_supported_band *sband;
@@ -135,14 +134,13 @@ struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
 
 	return NULL;
 }
-EXPORT_SYMBOL(__ieee80211_get_channel);
+EXPORT_SYMBOL(ieee80211_get_channel);
 
-static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
-				     enum nl80211_band band)
+static void set_mandatory_flags_band(struct ieee80211_supported_band *sband)
 {
 	int i, want;
 
-	switch (band) {
+	switch (sband->band) {
 	case NL80211_BAND_5GHZ:
 		want = 3;
 		for (i = 0; i < sband->n_bitrates; i++) {
@@ -192,6 +190,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
 		WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e);
 		break;
 	case NUM_NL80211_BANDS:
+	default:
 		WARN_ON(1);
 		break;
 	}
@@ -203,7 +202,7 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
 
 	for (band = 0; band < NUM_NL80211_BANDS; band++)
 		if (wiphy->bands[band])
-			set_mandatory_flags_band(wiphy->bands[band], band);
+			set_mandatory_flags_band(wiphy->bands[band]);
 }
 
 bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher)
@@ -619,8 +618,6 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
 
 		if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC))
 			return -ENOMEM;
-
-		skb->truesize += head_need;
 	}
 
 	if (encaps_data) {
@@ -952,7 +949,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
 				ev->cr.resp_ie, ev->cr.resp_ie_len,
 				ev->cr.status,
 				ev->cr.status == WLAN_STATUS_SUCCESS,
-				ev->cr.bss);
+				ev->cr.bss, ev->cr.timeout_reason);
 			break;
 		case EVENT_ROAMED:
 			__cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie,
@@ -1848,6 +1845,21 @@ void cfg80211_free_nan_func(struct cfg80211_nan_func *f)
 }
 EXPORT_SYMBOL(cfg80211_free_nan_func);
 
+bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
+				u32 center_freq_khz, u32 bw_khz)
+{
+	u32 start_freq_khz, end_freq_khz;
+
+	start_freq_khz = center_freq_khz - (bw_khz / 2);
+	end_freq_khz = center_freq_khz + (bw_khz / 2);
+
+	if (start_freq_khz >= freq_range->start_freq_khz &&
+	    end_freq_khz <= freq_range->end_freq_khz)
+		return true;
+
+	return false;
+}
+
 /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
 /* Ethernet-II snap header (RFC1042 for most EtherTypes) */
 const unsigned char rfc1042_header[] __aligned(2) =
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 6250b1cfcde5..1a4db6790e20 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -1119,3 +1119,70 @@ int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
 	return ret;
 }
 #endif
+
+char *iwe_stream_add_event(struct iw_request_info *info, char *stream,
+			   char *ends, struct iw_event *iwe, int event_len)
+{
+	int lcp_len = iwe_stream_lcp_len(info);
+
+	event_len = iwe_stream_event_len_adjust(info, event_len);
+
+	/* Check if it's possible */
+	if (likely((stream + event_len) < ends)) {
+		iwe->len = event_len;
+		/* Beware of alignement issues on 64 bits */
+		memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
+		memcpy(stream + lcp_len, &iwe->u,
+		       event_len - lcp_len);
+		stream += event_len;
+	}
+
+	return stream;
+}
+EXPORT_SYMBOL(iwe_stream_add_event);
+
+char *iwe_stream_add_point(struct iw_request_info *info, char *stream,
+			   char *ends, struct iw_event *iwe, char *extra)
+{
+	int event_len = iwe_stream_point_len(info) + iwe->u.data.length;
+	int point_len = iwe_stream_point_len(info);
+	int lcp_len   = iwe_stream_lcp_len(info);
+
+	/* Check if it's possible */
+	if (likely((stream + event_len) < ends)) {
+		iwe->len = event_len;
+		memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
+		memcpy(stream + lcp_len,
+		       ((char *) &iwe->u) + IW_EV_POINT_OFF,
+		       IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN);
+		if (iwe->u.data.length && extra)
+			memcpy(stream + point_len, extra, iwe->u.data.length);
+		stream += event_len;
+	}
+
+	return stream;
+}
+EXPORT_SYMBOL(iwe_stream_add_point);
+
+char *iwe_stream_add_value(struct iw_request_info *info, char *event,
+			   char *value, char *ends, struct iw_event *iwe,
+			   int event_len)
+{
+	int lcp_len = iwe_stream_lcp_len(info);
+
+	/* Don't duplicate LCP */
+	event_len -= IW_EV_LCP_LEN;
+
+	/* Check if it's possible */
+	if (likely((value + event_len) < ends)) {
+		/* Add new value */
+		memcpy(value, &iwe->u, event_len);
+		value += event_len;
+		/* Patch LCP */
+		iwe->len = value - event;
+		memcpy(event, (char *) iwe, lcp_len);
+	}
+
+	return value;
+}
+EXPORT_SYMBOL(iwe_stream_add_value);
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index 995163830a61..c434f193f39a 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -105,30 +105,7 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
 			goto out;
 	}
 
-
 	wdev->wext.connect.channel = chan;
-
-	/*
-	 * SSID is not set, we just want to switch monitor channel,
-	 * this is really just backward compatibility, if the SSID
-	 * is set then we use the channel to select the BSS to use
-	 * to connect to instead. If we were connected on another
-	 * channel we disconnected above and reconnect below.
-	 */
-	if (chan && !wdev->wext.connect.ssid_len) {
-		struct cfg80211_chan_def chandef = {
-			.width = NL80211_CHAN_WIDTH_20_NOHT,
-			.center_freq1 = freq,
-		};
-
-		chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq);
-		if (chandef.chan)
-			err = cfg80211_set_monitor_channel(rdev, &chandef);
-		else
-			err = -EINVAL;
-		goto out;
-	}
-
 	err = cfg80211_mgd_wext_connect(rdev, wdev);
  out:
 	wdev_unlock(wdev);
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index bda1a13628a8..286ed25c1a69 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -4,6 +4,11 @@
 config XFRM
        bool
        depends on NET
+       select GRO_CELLS
+
+config XFRM_OFFLOAD
+       bool
+       depends on XFRM
 
 config XFRM_ALGO
 	tristate
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 6e3f0254d8a1..46bdb4fbed0b 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -19,16 +19,18 @@
 static struct kmem_cache *secpath_cachep __read_mostly;
 
 static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
-static struct xfrm_input_afinfo __rcu *xfrm_input_afinfo[NPROTO];
+static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1];
 
-int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
+static struct gro_cells gro_cells;
+static struct net_device xfrm_napi_dev;
+
+int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo)
 {
 	int err = 0;
 
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
-	if (unlikely(afinfo->family >= NPROTO))
+	if (WARN_ON(afinfo->family >= ARRAY_SIZE(xfrm_input_afinfo)))
 		return -EAFNOSUPPORT;
+
 	spin_lock_bh(&xfrm_input_afinfo_lock);
 	if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL))
 		err = -EEXIST;
@@ -39,14 +41,10 @@ int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_input_register_afinfo);
 
-int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo)
+int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo)
 {
 	int err = 0;
 
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
-	if (unlikely(afinfo->family >= NPROTO))
-		return -EAFNOSUPPORT;
 	spin_lock_bh(&xfrm_input_afinfo_lock);
 	if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) {
 		if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo))
@@ -60,12 +58,13 @@ int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_input_unregister_afinfo);
 
-static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
+static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
 {
-	struct xfrm_input_afinfo *afinfo;
+	const struct xfrm_input_afinfo *afinfo;
 
-	if (unlikely(family >= NPROTO))
+	if (WARN_ON_ONCE(family >= ARRAY_SIZE(xfrm_input_afinfo)))
 		return NULL;
+
 	rcu_read_lock();
 	afinfo = rcu_dereference(xfrm_input_afinfo[family]);
 	if (unlikely(!afinfo))
@@ -73,22 +72,17 @@ static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
 	return afinfo;
 }
 
-static void xfrm_input_put_afinfo(struct xfrm_input_afinfo *afinfo)
-{
-	rcu_read_unlock();
-}
-
 static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol,
 		       int err)
 {
 	int ret;
-	struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family);
+	const struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family);
 
 	if (!afinfo)
 		return -EAFNOSUPPORT;
 
 	ret = afinfo->callback(skb, protocol, err);
-	xfrm_input_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -111,6 +105,8 @@ struct sec_path *secpath_dup(struct sec_path *src)
 		return NULL;
 
 	sp->len = 0;
+	sp->olen = 0;
+
 	if (src) {
 		int i;
 
@@ -123,6 +119,24 @@ struct sec_path *secpath_dup(struct sec_path *src)
 }
 EXPORT_SYMBOL(secpath_dup);
 
+int secpath_set(struct sk_buff *skb)
+{
+	struct sec_path *sp;
+
+	/* Allocate new secpath or COW existing one. */
+	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
+		sp = secpath_dup(skb->sp);
+		if (!sp)
+			return -ENOMEM;
+
+		if (skb->sp)
+			secpath_put(skb->sp);
+		skb->sp = sp;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(secpath_set);
+
 /* Fetch spi and seq from ipsec header */
 
 int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
@@ -158,6 +172,7 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
 	*seq = *(__be32 *)(skb_transport_header(skb) + offset_seq);
 	return 0;
 }
+EXPORT_SYMBOL(xfrm_parse_spi);
 
 int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
 {
@@ -192,14 +207,23 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 	unsigned int family;
 	int decaps = 0;
 	int async = 0;
+	struct xfrm_offload *xo;
+	bool xfrm_gro = false;
 
-	/* A negative encap_type indicates async resumption. */
 	if (encap_type < 0) {
-		async = 1;
 		x = xfrm_input_state(skb);
-		seq = XFRM_SKB_CB(skb)->seq.input.low;
 		family = x->outer_mode->afinfo->family;
-		goto resume;
+
+		/* An encap_type of -1 indicates async resumption. */
+		if (encap_type == -1) {
+			async = 1;
+			seq = XFRM_SKB_CB(skb)->seq.input.low;
+			goto resume;
+		}
+		/* encap_type < -1 indicates a GRO call. */
+		encap_type = 0;
+		seq = XFRM_SPI_SKB_CB(skb)->seq;
+		goto lock;
 	}
 
 	daddr = (xfrm_address_t *)(skb_network_header(skb) +
@@ -218,18 +242,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 		break;
 	}
 
-	/* Allocate new secpath or COW existing one. */
-	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
-		struct sec_path *sp;
-
-		sp = secpath_dup(skb->sp);
-		if (!sp) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
-			goto drop;
-		}
-		if (skb->sp)
-			secpath_put(skb->sp);
-		skb->sp = sp;
+	err = secpath_set(skb);
+	if (err) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
+		goto drop;
 	}
 
 	seq = 0;
@@ -253,6 +269,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 
 		skb->sp->xvec[skb->sp->len++] = x;
 
+lock:
 		spin_lock(&x->lock);
 
 		if (unlikely(x->km.state != XFRM_STATE_VALID)) {
@@ -371,10 +388,21 @@ resume:
 
 	if (decaps) {
 		skb_dst_drop(skb);
-		netif_rx(skb);
+		gro_cells_receive(&gro_cells, skb);
 		return 0;
 	} else {
-		return x->inner_mode->afinfo->transport_finish(skb, async);
+		xo = xfrm_offload(skb);
+		if (xo)
+			xfrm_gro = xo->flags & XFRM_GRO;
+
+		err = x->inner_mode->afinfo->transport_finish(skb, async);
+		if (xfrm_gro) {
+			skb_dst_drop(skb);
+			gro_cells_receive(&gro_cells, skb);
+			return err;
+		}
+
+		return err;
 	}
 
 drop_unlock:
@@ -394,6 +422,13 @@ EXPORT_SYMBOL(xfrm_input_resume);
 
 void __init xfrm_input_init(void)
 {
+	int err;
+
+	init_dummy_netdev(&xfrm_napi_dev);
+	err = gro_cells_init(&gro_cells, &xfrm_napi_dev);
+	if (err)
+		gro_cells.cells = NULL;
+
 	secpath_cachep = kmem_cache_create("secpath_cache",
 					   sizeof(struct sec_path),
 					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 637387bbaaea..8ba29fe58352 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -246,10 +246,8 @@ void xfrm_local_error(struct sk_buff *skb, int mtu)
 		return;
 
 	afinfo = xfrm_state_get_afinfo(proto);
-	if (!afinfo)
-		return;
-
-	afinfo->local_error(skb, mtu);
-	xfrm_state_put_afinfo(afinfo);
+	if (afinfo)
+		afinfo->local_error(skb, mtu);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(xfrm_local_error);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 177e208e8ff5..0806dccdf507 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,7 +45,7 @@ struct xfrm_flo {
 };
 
 static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
-static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
+static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
 						__read_mostly;
 
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
@@ -103,11 +103,11 @@ bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl
 	return false;
 }
 
-static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
+static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
 {
-	struct xfrm_policy_afinfo *afinfo;
+	const struct xfrm_policy_afinfo *afinfo;
 
-	if (unlikely(family >= NPROTO))
+	if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
 		return NULL;
 	rcu_read_lock();
 	afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
@@ -116,18 +116,13 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
 	return afinfo;
 }
 
-static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
-{
-	rcu_read_unlock();
-}
-
 static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
 						  int tos, int oif,
 						  const xfrm_address_t *saddr,
 						  const xfrm_address_t *daddr,
 						  int family)
 {
-	struct xfrm_policy_afinfo *afinfo;
+	const struct xfrm_policy_afinfo *afinfo;
 	struct dst_entry *dst;
 
 	afinfo = xfrm_policy_get_afinfo(family);
@@ -136,7 +131,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
 
 	dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr);
 
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return dst;
 }
@@ -330,7 +325,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 }
 EXPORT_SYMBOL(xfrm_policy_destroy);
 
-/* Rule must be locked. Release descentant resources, announce
+/* Rule must be locked. Release descendant resources, announce
  * entry dead. The rule must be unlinked from lists to the moment.
  */
 
@@ -1431,12 +1426,12 @@ xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
 	       xfrm_address_t *remote, unsigned short family)
 {
 	int err;
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 
 	if (unlikely(afinfo == NULL))
 		return -EINVAL;
 	err = afinfo->get_saddr(net, oif, local, remote);
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 
@@ -1538,21 +1533,15 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
 
 }
 
-/* Check that the bundle accepts the flow and its components are
- * still valid.
- */
-
-static inline int xfrm_get_tos(const struct flowi *fl, int family)
+static int xfrm_get_tos(const struct flowi *fl, int family)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
-	int tos;
-
-	if (!afinfo)
-		return -EINVAL;
+	const struct xfrm_policy_afinfo *afinfo;
+	int tos = 0;
 
-	tos = afinfo->get_tos(fl);
+	afinfo = xfrm_policy_get_afinfo(family);
+	tos = afinfo ? afinfo->get_tos(fl) : 0;
 
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return tos;
 }
@@ -1609,7 +1598,7 @@ static const struct flow_cache_ops xfrm_bundle_fc_ops = {
 
 static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 	struct dst_ops *dst_ops;
 	struct xfrm_dst *xdst;
 
@@ -1638,7 +1627,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 	} else
 		xdst = ERR_PTR(-ENOBUFS);
 
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return xdst;
 }
@@ -1646,7 +1635,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
 				 int nfheader_len)
 {
-	struct xfrm_policy_afinfo *afinfo =
+	const struct xfrm_policy_afinfo *afinfo =
 		xfrm_policy_get_afinfo(dst->ops->family);
 	int err;
 
@@ -1655,7 +1644,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
 
 	err = afinfo->init_path(path, dst, nfheader_len);
 
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return err;
 }
@@ -1663,7 +1652,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
 static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 				const struct flowi *fl)
 {
-	struct xfrm_policy_afinfo *afinfo =
+	const struct xfrm_policy_afinfo *afinfo =
 		xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
 	int err;
 
@@ -1672,7 +1661,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 
 	err = afinfo->fill_dst(xdst, dev, fl);
 
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return err;
 }
@@ -1705,9 +1694,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 	xfrm_flowi_addr_get(fl, &saddr, &daddr, family);
 
 	tos = xfrm_get_tos(fl, family);
-	err = tos;
-	if (tos < 0)
-		goto put_states;
 
 	dst_hold(dst);
 
@@ -2215,7 +2201,7 @@ error:
 static struct dst_entry *make_blackhole(struct net *net, u16 family,
 					struct dst_entry *dst_orig)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 	struct dst_entry *ret;
 
 	if (!afinfo) {
@@ -2224,7 +2210,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family,
 	} else {
 		ret = afinfo->blackhole_route(net, dst_orig);
 	}
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -2466,7 +2452,7 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star
 int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
 			  unsigned int family, int reverse)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 	int err;
 
 	if (unlikely(afinfo == NULL))
@@ -2474,7 +2460,7 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
 
 	afinfo->decode_session(skb, fl, reverse);
 	err = security_xfrm_decode_session(skb, &fl->flowi_secid);
-	xfrm_policy_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(__xfrm_decode_session);
@@ -2742,10 +2728,11 @@ void xfrm_garbage_collect(struct net *net)
 }
 EXPORT_SYMBOL(xfrm_garbage_collect);
 
-static void xfrm_garbage_collect_deferred(struct net *net)
+void xfrm_garbage_collect_deferred(struct net *net)
 {
 	flow_cache_flush_deferred(net);
 }
+EXPORT_SYMBOL(xfrm_garbage_collect_deferred);
 
 static void xfrm_init_pmtu(struct dst_entry *dst)
 {
@@ -2849,22 +2836,52 @@ static unsigned int xfrm_mtu(const struct dst_entry *dst)
 	return mtu ? : dst_mtu(dst->path);
 }
 
+static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
+					const void *daddr)
+{
+	const struct dst_entry *path = dst->path;
+
+	for (; dst != path; dst = dst->child) {
+		const struct xfrm_state *xfrm = dst->xfrm;
+
+		if (xfrm->props.mode == XFRM_MODE_TRANSPORT)
+			continue;
+		if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR)
+			daddr = xfrm->coaddr;
+		else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
+			daddr = &xfrm->id.daddr;
+	}
+	return daddr;
+}
+
 static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
 					   struct sk_buff *skb,
 					   const void *daddr)
 {
-	return dst->path->ops->neigh_lookup(dst, skb, daddr);
+	const struct dst_entry *path = dst->path;
+
+	if (!skb)
+		daddr = xfrm_get_dst_nexthop(dst, daddr);
+	return path->ops->neigh_lookup(path, skb, daddr);
+}
+
+static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+	const struct dst_entry *path = dst->path;
+
+	daddr = xfrm_get_dst_nexthop(dst, daddr);
+	path->ops->confirm_neigh(path, daddr);
 }
 
-int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
+int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family)
 {
 	int err = 0;
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
-	if (unlikely(afinfo->family >= NPROTO))
+
+	if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
 		return -EAFNOSUPPORT;
+
 	spin_lock(&xfrm_policy_afinfo_lock);
-	if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
+	if (unlikely(xfrm_policy_afinfo[family] != NULL))
 		err = -EEXIST;
 	else {
 		struct dst_ops *dst_ops = afinfo->dst_ops;
@@ -2882,9 +2899,9 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 			dst_ops->link_failure = xfrm_link_failure;
 		if (likely(dst_ops->neigh_lookup == NULL))
 			dst_ops->neigh_lookup = xfrm_neigh_lookup;
-		if (likely(afinfo->garbage_collect == NULL))
-			afinfo->garbage_collect = xfrm_garbage_collect_deferred;
-		rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo);
+		if (likely(!dst_ops->confirm_neigh))
+			dst_ops->confirm_neigh = xfrm_confirm_neigh;
+		rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo);
 	}
 	spin_unlock(&xfrm_policy_afinfo_lock);
 
@@ -2892,34 +2909,24 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_policy_register_afinfo);
 
-int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
+void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
 {
-	int err = 0;
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
-	if (unlikely(afinfo->family >= NPROTO))
-		return -EAFNOSUPPORT;
-	spin_lock(&xfrm_policy_afinfo_lock);
-	if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
-		if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
-			err = -EINVAL;
-		else
-			RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family],
-					 NULL);
+	struct dst_ops *dst_ops = afinfo->dst_ops;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) {
+		if (xfrm_policy_afinfo[i] != afinfo)
+			continue;
+		RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL);
+		break;
 	}
-	spin_unlock(&xfrm_policy_afinfo_lock);
-	if (!err) {
-		struct dst_ops *dst_ops = afinfo->dst_ops;
 
-		synchronize_rcu();
+	synchronize_rcu();
 
-		dst_ops->kmem_cachep = NULL;
-		dst_ops->check = NULL;
-		dst_ops->negative_advice = NULL;
-		dst_ops->link_failure = NULL;
-		afinfo->garbage_collect = NULL;
-	}
-	return err;
+	dst_ops->kmem_cachep = NULL;
+	dst_ops->check = NULL;
+	dst_ops->negative_advice = NULL;
+	dst_ops->link_failure = NULL;
 }
 EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 64e3c82eedf6..5a597dbbe564 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -192,7 +192,7 @@ int xfrm_register_type(const struct xfrm_type *type, unsigned short family)
 	else
 		err = -EEXIST;
 	spin_unlock_bh(&xfrm_type_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_register_type);
@@ -213,7 +213,7 @@ int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
 	else
 		typemap[type->proto] = NULL;
 	spin_unlock_bh(&xfrm_type_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_unregister_type);
@@ -231,17 +231,18 @@ retry:
 		return NULL;
 	typemap = afinfo->type_map;
 
-	type = typemap[proto];
+	type = READ_ONCE(typemap[proto]);
 	if (unlikely(type && !try_module_get(type->owner)))
 		type = NULL;
+
+	rcu_read_unlock();
+
 	if (!type && !modload_attempted) {
-		xfrm_state_put_afinfo(afinfo);
 		request_module("xfrm-type-%d-%d", family, proto);
 		modload_attempted = 1;
 		goto retry;
 	}
 
-	xfrm_state_put_afinfo(afinfo);
 	return type;
 }
 
@@ -280,7 +281,7 @@ int xfrm_register_mode(struct xfrm_mode *mode, int family)
 
 out:
 	spin_unlock_bh(&xfrm_mode_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_register_mode);
@@ -308,7 +309,7 @@ int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
 	}
 
 	spin_unlock_bh(&xfrm_mode_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_unregister_mode);
@@ -327,17 +328,17 @@ retry:
 	if (unlikely(afinfo == NULL))
 		return NULL;
 
-	mode = afinfo->mode_map[encap];
+	mode = READ_ONCE(afinfo->mode_map[encap]);
 	if (unlikely(mode && !try_module_get(mode->owner)))
 		mode = NULL;
+
+	rcu_read_unlock();
 	if (!mode && !modload_attempted) {
-		xfrm_state_put_afinfo(afinfo);
 		request_module("xfrm-mode-%d-%d", family, encap);
 		modload_attempted = 1;
 		goto retry;
 	}
 
-	xfrm_state_put_afinfo(afinfo);
 	return mode;
 }
 
@@ -409,7 +410,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
 			if (x->xflags & XFRM_SOFT_EXPIRE) {
 				/* enter hard expire without soft expire first?!
 				 * setting a new date could trigger this.
-				 * workarbound: fix x->curflt.add_time by below:
+				 * workaround: fix x->curflt.add_time by below:
 				 */
 				x->curlft.add_time = now - x->saved_tmo - 1;
 				tmo = x->lft.hard_add_expires_seconds - x->saved_tmo;
@@ -639,26 +640,25 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
 }
 EXPORT_SYMBOL(xfrm_sad_getinfo);
 
-static int
+static void
 xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
 		    const struct xfrm_tmpl *tmpl,
 		    const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 		    unsigned short family)
 {
-	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
+	struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family);
+
 	if (!afinfo)
-		return -1;
+		return;
+
 	afinfo->init_tempsel(&x->sel, fl);
 
 	if (family != tmpl->encap_family) {
-		xfrm_state_put_afinfo(afinfo);
-		afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
+		afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family);
 		if (!afinfo)
-			return -1;
+			return;
 	}
 	afinfo->init_temprop(x, tmpl, daddr, saddr);
-	xfrm_state_put_afinfo(afinfo);
-	return 0;
 }
 
 static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
@@ -1474,7 +1474,7 @@ xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
 	if (afinfo->tmpl_sort)
 		err = afinfo->tmpl_sort(dst, src, n);
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_tmpl_sort);
@@ -1494,7 +1494,7 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
 	if (afinfo->state_sort)
 		err = afinfo->state_sort(dst, src, n);
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_state_sort);
@@ -1932,10 +1932,10 @@ EXPORT_SYMBOL(xfrm_unregister_km);
 int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
 {
 	int err = 0;
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
-	if (unlikely(afinfo->family >= NPROTO))
+
+	if (WARN_ON(afinfo->family >= NPROTO))
 		return -EAFNOSUPPORT;
+
 	spin_lock_bh(&xfrm_state_afinfo_lock);
 	if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL))
 		err = -EEXIST;
@@ -1948,14 +1948,14 @@ EXPORT_SYMBOL(xfrm_state_register_afinfo);
 
 int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
 {
-	int err = 0;
-	if (unlikely(afinfo == NULL))
-		return -EINVAL;
-	if (unlikely(afinfo->family >= NPROTO))
+	int err = 0, family = afinfo->family;
+
+	if (WARN_ON(family >= NPROTO))
 		return -EAFNOSUPPORT;
+
 	spin_lock_bh(&xfrm_state_afinfo_lock);
 	if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) {
-		if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo))
+		if (rcu_access_pointer(xfrm_state_afinfo[family]) != afinfo)
 			err = -EINVAL;
 		else
 			RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL);
@@ -1966,6 +1966,14 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_state_unregister_afinfo);
 
+struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family)
+{
+	if (unlikely(family >= NPROTO))
+		return NULL;
+
+	return rcu_dereference(xfrm_state_afinfo[family]);
+}
+
 struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
 {
 	struct xfrm_state_afinfo *afinfo;
@@ -1978,11 +1986,6 @@ struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
 	return afinfo;
 }
 
-void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
-{
-	rcu_read_unlock();
-}
-
 /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
 void xfrm_state_delete_tunnel(struct xfrm_state *x)
 {
@@ -2000,16 +2003,13 @@ EXPORT_SYMBOL(xfrm_state_delete_tunnel);
 
 int xfrm_state_mtu(struct xfrm_state *x, int mtu)
 {
-	int res;
+	const struct xfrm_type *type = READ_ONCE(x->type);
 
-	spin_lock_bh(&x->lock);
 	if (x->km.state == XFRM_STATE_VALID &&
-	    x->type && x->type->get_mtu)
-		res = x->type->get_mtu(x, mtu);
-	else
-		res = mtu - x->props.header_len;
-	spin_unlock_bh(&x->lock);
-	return res;
+	    type && type->get_mtu)
+		return type->get_mtu(x, mtu);
+
+	return mtu - x->props.header_len;
 }
 
 int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
@@ -2028,7 +2028,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
 	if (afinfo->init_flags)
 		err = afinfo->init_flags(x);
 
-	xfrm_state_put_afinfo(afinfo);
+	rcu_read_unlock();
 
 	if (err)
 		goto error;