133 files changed, 4455 insertions, 1663 deletions
diff --git a/net/atm/common.c b/net/atm/common.c
index 8a4f99114cd2..5763fd241dc3 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -14,7 +14,7 @@
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
-#include <linux/time.h>		/* struct timeval */
+#include <linux/time64.h>	/* 64-bit time for seconds */
 #include <linux/skbuff.h>
 #include <linux/bitops.h>
 #include <linux/init.h>
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 7c6a1cc760a2..31e0dcb970f8 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -1089,7 +1089,7 @@ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc)
 		msg->type = SND_MPOA_RES_RQST;
 		msg->content.in_info = entry->ctrl_info;
 		msg_to_mpoad(msg, mpc);
-		do_gettimeofday(&(entry->reply_wait));
+		entry->reply_wait = ktime_get_seconds();
 		mpc->in_ops->put(entry);
 		return;
 	}
@@ -1099,7 +1099,7 @@ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc)
 		msg->type = SND_MPOA_RES_RQST;
 		msg->content.in_info = entry->ctrl_info;
 		msg_to_mpoad(msg, mpc);
-		do_gettimeofday(&(entry->reply_wait));
+		entry->reply_wait = ktime_get_seconds();
 		mpc->in_ops->put(entry);
 		return;
 	}
@@ -1175,8 +1175,9 @@ static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc)
 	}
 
 	entry->ctrl_info = msg->content.in_info;
-	do_gettimeofday(&(entry->tv));
-	do_gettimeofday(&(entry->reply_wait)); /* Used in refreshing func from now on */
+	entry->time = ktime_get_seconds();
+	/* Used in refreshing func from now on */
+	entry->reply_wait = ktime_get_seconds();
 	entry->refresh_time = 0;
 	ddprintk_cont("entry->shortcut = %p\n", entry->shortcut);
 
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
index e01450bb32d6..4bb418313720 100644
--- a/net/atm/mpoa_caches.c
+++ b/net/atm/mpoa_caches.c
@@ -117,7 +117,7 @@ static in_cache_entry *in_cache_add_entry(__be32 dst_ip,
 
 	memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
 	entry->ctrl_info.in_dst_ip = dst_ip;
-	do_gettimeofday(&(entry->tv));
+	entry->time = ktime_get_seconds();
 	entry->retry_time = client->parameters.mpc_p4;
 	entry->count = 1;
 	entry->entry_state = INGRESS_INVALID;
@@ -148,7 +148,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
 			if (qos != NULL)
 				msg.qos = qos->qos;
 			msg_to_mpoad(&msg, mpc);
-			do_gettimeofday(&(entry->reply_wait));
+			entry->reply_wait = ktime_get_seconds();
 			entry->entry_state = INGRESS_RESOLVING;
 		}
 		if (entry->shortcut != NULL)
@@ -171,7 +171,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
 		if (qos != NULL)
 			msg.qos = qos->qos;
 		msg_to_mpoad(&msg, mpc);
-		do_gettimeofday(&(entry->reply_wait));
+		entry->reply_wait = ktime_get_seconds();
 	}
 
 	return CLOSED;
@@ -227,17 +227,16 @@ static void in_cache_remove_entry(in_cache_entry *entry,
 static void clear_count_and_expired(struct mpoa_client *client)
 {
 	in_cache_entry *entry, *next_entry;
-	struct timeval now;
+	time64_t now;
 
-	do_gettimeofday(&now);
+	now = ktime_get_seconds();
 
 	write_lock_bh(&client->ingress_lock);
 	entry = client->in_cache;
 	while (entry != NULL) {
 		entry->count = 0;
 		next_entry = entry->next;
-		if ((now.tv_sec - entry->tv.tv_sec)
-		   > entry->ctrl_info.holding_time) {
+		if ((now - entry->time) > entry->ctrl_info.holding_time) {
 			dprintk("holding time expired, ip = %pI4\n",
 				&entry->ctrl_info.in_dst_ip);
 			client->in_ops->remove_entry(entry, client);
@@ -253,35 +252,35 @@ static void check_resolving_entries(struct mpoa_client *client)
 
 	struct atm_mpoa_qos *qos;
 	in_cache_entry *entry;
-	struct timeval now;
+	time64_t now;
 	struct k_message msg;
 
-	do_gettimeofday(&now);
+	now = ktime_get_seconds();
 
 	read_lock_bh(&client->ingress_lock);
 	entry = client->in_cache;
 	while (entry != NULL) {
 		if (entry->entry_state == INGRESS_RESOLVING) {
-			if ((now.tv_sec - entry->hold_down.tv_sec) <
-			    client->parameters.mpc_p6) {
+
+			if ((now - entry->hold_down)
+					< client->parameters.mpc_p6) {
 				entry = entry->next;	/* Entry in hold down */
 				continue;
 			}
-			if ((now.tv_sec - entry->reply_wait.tv_sec) >
-			    entry->retry_time) {
+			if ((now - entry->reply_wait) > entry->retry_time) {
 				entry->retry_time = MPC_C1 * (entry->retry_time);
 				/*
 				 * Retry time maximum exceeded,
 				 * put entry in hold down.
 				 */
 				if (entry->retry_time > client->parameters.mpc_p5) {
-					do_gettimeofday(&(entry->hold_down));
+					entry->hold_down = ktime_get_seconds();
 					entry->retry_time = client->parameters.mpc_p4;
 					entry = entry->next;
 					continue;
 				}
 				/* Ask daemon to send a resolution request. */
-				memset(&(entry->hold_down), 0, sizeof(struct timeval));
+				memset(&entry->hold_down, 0, sizeof(time64_t));
 				msg.type = SND_MPOA_RES_RTRY;
 				memcpy(msg.MPS_ctrl, client->mps_ctrl_addr, ATM_ESA_LEN);
 				msg.content.in_info = entry->ctrl_info;
@@ -289,7 +288,7 @@ static void check_resolving_entries(struct mpoa_client *client)
 				if (qos != NULL)
 					msg.qos = qos->qos;
 				msg_to_mpoad(&msg, client);
-				do_gettimeofday(&(entry->reply_wait));
+				entry->reply_wait = ktime_get_seconds();
 			}
 		}
 		entry = entry->next;
@@ -300,18 +299,18 @@ static void check_resolving_entries(struct mpoa_client *client)
 /* Call this every MPC-p5 seconds. */
 static void refresh_entries(struct mpoa_client *client)
 {
-	struct timeval now;
+	time64_t now;
 	struct in_cache_entry *entry = client->in_cache;
 
 	ddprintk("refresh_entries\n");
-	do_gettimeofday(&now);
+	now = ktime_get_seconds();
 
 	read_lock_bh(&client->ingress_lock);
 	while (entry != NULL) {
 		if (entry->entry_state == INGRESS_RESOLVED) {
 			if (!(entry->refresh_time))
 				entry->refresh_time = (2 * (entry->ctrl_info.holding_time))/3;
-			if ((now.tv_sec - entry->reply_wait.tv_sec) >
+			if ((now - entry->reply_wait) >
 			    entry->refresh_time) {
 				dprintk("refreshing an entry.\n");
 				entry->entry_state = INGRESS_REFRESHING;
@@ -480,7 +479,7 @@ static eg_cache_entry *eg_cache_add_entry(struct k_message *msg,
 
 	memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
 	entry->ctrl_info = msg->content.eg_info;
-	do_gettimeofday(&(entry->tv));
+	entry->time = ktime_get_seconds();
 	entry->entry_state = EGRESS_RESOLVED;
 	dprintk("new_eg_cache_entry cache_id %u\n",
 		ntohl(entry->ctrl_info.cache_id));
@@ -495,7 +494,7 @@ static eg_cache_entry *eg_cache_add_entry(struct k_message *msg,
 
 static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time)
 {
-	do_gettimeofday(&(entry->tv));
+	entry->time = ktime_get_seconds();
 	entry->entry_state = EGRESS_RESOLVED;
 	entry->ctrl_info.holding_time = holding_time;
 }
@@ -503,17 +502,16 @@ static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time)
 static void clear_expired(struct mpoa_client *client)
 {
 	eg_cache_entry *entry, *next_entry;
-	struct timeval now;
+	time64_t now;
 	struct k_message msg;
 
-	do_gettimeofday(&now);
+	now = ktime_get_seconds();
 
 	write_lock_irq(&client->egress_lock);
 	entry = client->eg_cache;
 	while (entry != NULL) {
 		next_entry = entry->next;
-		if ((now.tv_sec - entry->tv.tv_sec)
-		   > entry->ctrl_info.holding_time) {
+		if ((now - entry->time) > entry->ctrl_info.holding_time) {
 			msg.type = SND_EGRESS_PURGE;
 			msg.content.eg_info = entry->ctrl_info;
 			dprintk("egress_cache: holding time expired, cache_id = %u.\n",
diff --git a/net/atm/mpoa_caches.h b/net/atm/mpoa_caches.h
index 6a266669ebf4..464c4c7f8d1f 100644
--- a/net/atm/mpoa_caches.h
+++ b/net/atm/mpoa_caches.h
@@ -2,6 +2,7 @@
 #ifndef MPOA_CACHES_H
 #define MPOA_CACHES_H
 
+#include <linux/time64.h>
 #include <linux/netdevice.h>
 #include <linux/types.h>
 #include <linux/atm.h>
@@ -16,9 +17,9 @@ void atm_mpoa_init_cache(struct mpoa_client *mpc);
 typedef struct in_cache_entry {
 	struct in_cache_entry *next;
 	struct in_cache_entry *prev;
-	struct timeval  tv;
-	struct timeval  reply_wait;
-	struct timeval  hold_down;
+	time64_t  time;
+	time64_t  reply_wait;
+	time64_t  hold_down;
 	uint32_t  packets_fwded;
 	uint16_t  entry_state;
 	uint32_t retry_time;
@@ -53,7 +54,7 @@ struct in_cache_ops{
 typedef struct eg_cache_entry{
 	struct               eg_cache_entry *next;
 	struct               eg_cache_entry *prev;
-	struct               timeval  tv;
+	time64_t	     time;
 	uint8_t              MPS_ctrl_ATM_addr[ATM_ESA_LEN];
 	struct atm_vcc       *shortcut;
 	uint32_t             packets_rcvd;
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index 8a0c17e1c203..2212da9c2da2 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -8,7 +8,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
-#include <linux/time.h>
+#include <linux/ktime.h>
 #include <linux/seq_file.h>
 #include <linux/uaccess.h>
 #include <linux/atmmpc.h>
@@ -138,7 +138,7 @@ static int mpc_show(struct seq_file *m, void *v)
 	int i;
 	in_cache_entry *in_entry;
 	eg_cache_entry *eg_entry;
-	struct timeval now;
+	time64_t now;
 	unsigned char ip_string[16];
 
 	if (v == SEQ_START_TOKEN) {
@@ -148,15 +148,17 @@ static int mpc_show(struct seq_file *m, void *v)
 
 	seq_printf(m, "\nInterface %d:\n\n", mpc->dev_num);
 	seq_printf(m, "Ingress Entries:\nIP address      State      Holding time  Packets fwded  VPI  VCI\n");
-	do_gettimeofday(&now);
+	now = ktime_get_seconds();
 
 	for (in_entry = mpc->in_cache; in_entry; in_entry = in_entry->next) {
+		unsigned long seconds_delta = now - in_entry->time;
+
 		sprintf(ip_string, "%pI4", &in_entry->ctrl_info.in_dst_ip);
 		seq_printf(m, "%-16s%s%-14lu%-12u",
 			   ip_string,
 			   ingress_state_string(in_entry->entry_state),
 			   in_entry->ctrl_info.holding_time -
-			   (now.tv_sec-in_entry->tv.tv_sec),
+			   seconds_delta,
 			   in_entry->packets_fwded);
 		if (in_entry->shortcut)
 			seq_printf(m, "   %-3d  %-3d",
@@ -169,13 +171,14 @@ static int mpc_show(struct seq_file *m, void *v)
 	seq_printf(m, "Egress Entries:\nIngress MPC ATM addr\nCache-id        State      Holding time  Packets recvd  Latest IP addr   VPI VCI\n");
 	for (eg_entry = mpc->eg_cache; eg_entry; eg_entry = eg_entry->next) {
 		unsigned char *p = eg_entry->ctrl_info.in_MPC_data_ATM_addr;
+		unsigned long seconds_delta = now - eg_entry->time;
+
 		for (i = 0; i < ATM_ESA_LEN; i++)
 			seq_printf(m, "%02x", p[i]);
 		seq_printf(m, "\n%-16lu%s%-14lu%-15u",
 			   (unsigned long)ntohl(eg_entry->ctrl_info.cache_id),
 			   egress_state_string(eg_entry->entry_state),
-			   (eg_entry->ctrl_info.holding_time -
-			    (now.tv_sec-eg_entry->tv.tv_sec)),
+			   (eg_entry->ctrl_info.holding_time - seconds_delta),
 			   eg_entry->packets_rcvd);
 
 		/* latest IP address */
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index af5b8c87f590..1285ca30ab0a 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -125,9 +125,16 @@ static int br_dev_init(struct net_device *dev)
 	if (!br->stats)
 		return -ENOMEM;
 
+	err = br_fdb_hash_init(br);
+	if (err) {
+		free_percpu(br->stats);
+		return err;
+	}
+
 	err = br_vlan_init(br);
 	if (err) {
 		free_percpu(br->stats);
+		br_fdb_hash_fini(br);
 		return err;
 	}
 
@@ -135,6 +142,7 @@ static int br_dev_init(struct net_device *dev)
 	if (err) {
 		free_percpu(br->stats);
 		br_vlan_flush(br);
+		br_fdb_hash_fini(br);
 	}
 	br_set_lockdep_class(dev);
 
@@ -148,6 +156,7 @@ static void br_dev_uninit(struct net_device *dev)
 	br_multicast_dev_del(br);
 	br_multicast_uninit_stats(br);
 	br_vlan_flush(br);
+	br_fdb_hash_fini(br);
 	free_percpu(br->stats);
 }
 
@@ -416,6 +425,7 @@ void br_dev_setup(struct net_device *dev)
 	br->dev = dev;
 	spin_lock_init(&br->lock);
 	INIT_LIST_HEAD(&br->port_list);
+	INIT_HLIST_HEAD(&br->fdb_list);
 	spin_lock_init(&br->hash_lock);
 
 	br->bridge_id.prio[0] = 0x80;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 4ea5c8bbe286..dc87fbc9a23b 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -28,14 +28,20 @@
 #include <trace/events/bridge.h>
 #include "br_private.h"
 
+static const struct rhashtable_params br_fdb_rht_params = {
+	.head_offset = offsetof(struct net_bridge_fdb_entry, rhnode),
+	.key_offset = offsetof(struct net_bridge_fdb_entry, key),
+	.key_len = sizeof(struct net_bridge_fdb_key),
+	.automatic_shrinking = true,
+	.locks_mul = 1,
+};
+
 static struct kmem_cache *br_fdb_cache __read_mostly;
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		      const unsigned char *addr, u16 vid);
 static void fdb_notify(struct net_bridge *br,
 		       const struct net_bridge_fdb_entry *, int);
 
-static u32 fdb_salt __read_mostly;
-
 int __init br_fdb_init(void)
 {
 	br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
@@ -45,7 +51,6 @@ int __init br_fdb_init(void)
 	if (!br_fdb_cache)
 		return -ENOMEM;
 
-	get_random_bytes(&fdb_salt, sizeof(fdb_salt));
 	return 0;
 }
 
@@ -54,6 +59,15 @@ void br_fdb_fini(void)
 	kmem_cache_destroy(br_fdb_cache);
 }
 
+int br_fdb_hash_init(struct net_bridge *br)
+{
+	return rhashtable_init(&br->fdb_hash_tbl, &br_fdb_rht_params);
+}
+
+void br_fdb_hash_fini(struct net_bridge *br)
+{
+	rhashtable_destroy(&br->fdb_hash_tbl);
+}
 
 /* if topology_changing then use forward_delay (default 15 sec)
  * otherwise keep longer (default 5 minutes)
@@ -70,13 +84,6 @@ static inline int has_expired(const struct net_bridge *br,
 		time_before_eq(fdb->updated + hold_time(br), jiffies);
 }
 
-static inline int br_mac_hash(const unsigned char *mac, __u16 vid)
-{
-	/* use 1 byte of OUI and 3 bytes of NIC */
-	u32 key = get_unaligned((u32 *)(mac + 2));
-	return jhash_2words(key, vid, fdb_salt) & (BR_HASH_SIZE - 1);
-}
-
 static void fdb_rcu_free(struct rcu_head *head)
 {
 	struct net_bridge_fdb_entry *ent
@@ -84,19 +91,18 @@ static void fdb_rcu_free(struct rcu_head *head)
 	kmem_cache_free(br_fdb_cache, ent);
 }
 
-static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
+static struct net_bridge_fdb_entry *fdb_find_rcu(struct rhashtable *tbl,
 						 const unsigned char *addr,
 						 __u16 vid)
 {
-	struct net_bridge_fdb_entry *f;
+	struct net_bridge_fdb_key key;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	hlist_for_each_entry_rcu(f, head, hlist)
-		if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid)
-			break;
+	key.vlan_id = vid;
+	memcpy(key.addr.addr, addr, sizeof(key.addr.addr));
 
-	return f;
+	return rhashtable_lookup(tbl, &key, br_fdb_rht_params);
 }
 
 /* requires bridge hash_lock */
@@ -104,13 +110,12 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
 						const unsigned char *addr,
 						__u16 vid)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
 
 	lockdep_assert_held_once(&br->hash_lock);
 
 	rcu_read_lock();
-	fdb = fdb_find_rcu(head, addr, vid);
+	fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
 	rcu_read_unlock();
 
 	return fdb;
@@ -120,9 +125,7 @@ struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
 					     const unsigned char *addr,
 					     __u16 vid)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
-
-	return fdb_find_rcu(head, addr, vid);
+	return fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
 }
 
 /* When a static FDB entry is added, the mac address from the entry is
@@ -175,9 +178,11 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
 	trace_fdb_delete(br, f);
 
 	if (f->is_static)
-		fdb_del_hw_addr(br, f->addr.addr);
+		fdb_del_hw_addr(br, f->key.addr.addr);
 
-	hlist_del_init_rcu(&f->hlist);
+	hlist_del_init_rcu(&f->fdb_node);
+	rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode,
+			       br_fdb_rht_params);
 	fdb_notify(br, f, RTM_DELNEIGH);
 	call_rcu(&f->rcu, fdb_rcu_free);
 }
@@ -187,11 +192,11 @@ static void fdb_delete_local(struct net_bridge *br,
 			     const struct net_bridge_port *p,
 			     struct net_bridge_fdb_entry *f)
 {
-	const unsigned char *addr = f->addr.addr;
+	const unsigned char *addr = f->key.addr.addr;
 	struct net_bridge_vlan_group *vg;
 	const struct net_bridge_vlan *v;
 	struct net_bridge_port *op;
-	u16 vid = f->vlan_id;
+	u16 vid = f->key.vlan_id;
 
 	/* Maybe another port has same hw addr? */
 	list_for_each_entry(op, &br->port_list, list) {
@@ -233,31 +238,23 @@ void br_fdb_find_delete_local(struct net_bridge *br,
 void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
 {
 	struct net_bridge_vlan_group *vg;
+	struct net_bridge_fdb_entry *f;
 	struct net_bridge *br = p->br;
 	struct net_bridge_vlan *v;
-	int i;
 
 	spin_lock_bh(&br->hash_lock);
-
 	vg = nbp_vlan_group(p);
-	/* Search all chains since old address/hash is unknown */
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		struct hlist_node *h;
-		hlist_for_each(h, &br->hash[i]) {
-			struct net_bridge_fdb_entry *f;
-
-			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
-			if (f->dst == p && f->is_local && !f->added_by_user) {
-				/* delete old one */
-				fdb_delete_local(br, p, f);
-
-				/* if this port has no vlan information
-				 * configured, we can safely be done at
-				 * this point.
-				 */
-				if (!vg || !vg->num_vlans)
-					goto insert;
-			}
+	hlist_for_each_entry(f, &br->fdb_list, fdb_node) {
+		if (f->dst == p && f->is_local && !f->added_by_user) {
+			/* delete old one */
+			fdb_delete_local(br, p, f);
+
+			/* if this port has no vlan information
+			 * configured, we can safely be done at
+			 * this point.
+			 */
+			if (!vg || !vg->num_vlans)
+				goto insert;
 		}
 	}
 
@@ -316,35 +313,32 @@ void br_fdb_cleanup(struct work_struct *work)
 {
 	struct net_bridge *br = container_of(work, struct net_bridge,
 					     gc_work.work);
+	struct net_bridge_fdb_entry *f = NULL;
 	unsigned long delay = hold_time(br);
 	unsigned long work_delay = delay;
 	unsigned long now = jiffies;
-	int i;
 
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		struct net_bridge_fdb_entry *f;
-		struct hlist_node *n;
+	/* this part is tricky, in order to avoid blocking learning and
+	 * consequently forwarding, we rely on rcu to delete objects with
+	 * delayed freeing allowing us to continue traversing
+	 */
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+		unsigned long this_timer;
 
-		if (!br->hash[i].first)
+		if (f->is_static || f->added_by_external_learn)
 			continue;
-
-		spin_lock_bh(&br->hash_lock);
-		hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
-			unsigned long this_timer;
-
-			if (f->is_static)
-				continue;
-			if (f->added_by_external_learn)
-				continue;
-			this_timer = f->updated + delay;
-			if (time_after(this_timer, now))
-				work_delay = min(work_delay, this_timer - now);
-			else
+		this_timer = f->updated + delay;
+		if (time_after(this_timer, now)) {
+			work_delay = min(work_delay, this_timer - now);
+		} else {
+			spin_lock_bh(&br->hash_lock);
+			if (!hlist_unhashed(&f->fdb_node))
 				fdb_delete(br, f);
+			spin_unlock_bh(&br->hash_lock);
 		}
-		spin_unlock_bh(&br->hash_lock);
-		cond_resched();
 	}
+	rcu_read_unlock();
 
 	/* Cleanup minimum 10 milliseconds apart */
 	work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10));
@@ -354,16 +348,13 @@ void br_fdb_cleanup(struct work_struct *work)
 /* Completely flush all dynamic entries in forwarding database.*/
 void br_fdb_flush(struct net_bridge *br)
 {
-	int i;
+	struct net_bridge_fdb_entry *f;
+	struct hlist_node *tmp;
 
 	spin_lock_bh(&br->hash_lock);
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		struct net_bridge_fdb_entry *f;
-		struct hlist_node *n;
-		hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
-			if (!f->is_static)
-				fdb_delete(br, f);
-		}
+	hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
+		if (!f->is_static)
+			fdb_delete(br, f);
 	}
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -377,27 +368,22 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 			   u16 vid,
 			   int do_all)
 {
-	int i;
+	struct net_bridge_fdb_entry *f;
+	struct hlist_node *tmp;
 
 	spin_lock_bh(&br->hash_lock);
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		struct hlist_node *h, *g;
+	hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
+		if (f->dst != p)
+			continue;
 
-		hlist_for_each_safe(h, g, &br->hash[i]) {
-			struct net_bridge_fdb_entry *f
-				= hlist_entry(h, struct net_bridge_fdb_entry, hlist);
-			if (f->dst != p)
+		if (!do_all)
+			if (f->is_static || (vid && f->key.vlan_id != vid))
 				continue;
 
-			if (!do_all)
-				if (f->is_static || (vid && f->vlan_id != vid))
-					continue;
-
-			if (f->is_local)
-				fdb_delete_local(br, p, f);
-			else
-				fdb_delete(br, f);
-		}
+		if (f->is_local)
+			fdb_delete_local(br, p, f);
+		else
+			fdb_delete(br, f);
 	}
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -433,52 +419,48 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
 int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 		   unsigned long maxnum, unsigned long skip)
 {
-	struct __fdb_entry *fe = buf;
-	int i, num = 0;
 	struct net_bridge_fdb_entry *f;
+	struct __fdb_entry *fe = buf;
+	int num = 0;
 
 	memset(buf, 0, maxnum*sizeof(struct __fdb_entry));
 
 	rcu_read_lock();
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
-			if (num >= maxnum)
-				goto out;
+	hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+		if (num >= maxnum)
+			break;
 
-			if (has_expired(br, f))
-				continue;
+		if (has_expired(br, f))
+			continue;
 
-			/* ignore pseudo entry for local MAC address */
-			if (!f->dst)
-				continue;
+		/* ignore pseudo entry for local MAC address */
+		if (!f->dst)
+			continue;
 
-			if (skip) {
-				--skip;
-				continue;
-			}
+		if (skip) {
+			--skip;
+			continue;
+		}
 
-			/* convert from internal format to API */
-			memcpy(fe->mac_addr, f->addr.addr, ETH_ALEN);
+		/* convert from internal format to API */
+		memcpy(fe->mac_addr, f->key.addr.addr, ETH_ALEN);
 
-			/* due to ABI compat need to split into hi/lo */
-			fe->port_no = f->dst->port_no;
-			fe->port_hi = f->dst->port_no >> 8;
+		/* due to ABI compat need to split into hi/lo */
+		fe->port_no = f->dst->port_no;
+		fe->port_hi = f->dst->port_no >> 8;
 
-			fe->is_local = f->is_local;
-			if (!f->is_static)
-				fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated);
-			++fe;
-			++num;
-		}
+		fe->is_local = f->is_local;
+		if (!f->is_static)
+			fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated);
+		++fe;
+		++num;
 	}
-
- out:
 	rcu_read_unlock();
 
 	return num;
 }
 
-static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
+static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br,
 					       struct net_bridge_port *source,
 					       const unsigned char *addr,
 					       __u16 vid,
@@ -489,16 +471,23 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 
 	fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
 	if (fdb) {
-		memcpy(fdb->addr.addr, addr, ETH_ALEN);
+		memcpy(fdb->key.addr.addr, addr, ETH_ALEN);
 		fdb->dst = source;
-		fdb->vlan_id = vid;
+		fdb->key.vlan_id = vid;
 		fdb->is_local = is_local;
 		fdb->is_static = is_static;
 		fdb->added_by_user = 0;
 		fdb->added_by_external_learn = 0;
 		fdb->offloaded = 0;
 		fdb->updated = fdb->used = jiffies;
-		hlist_add_head_rcu(&fdb->hlist, head);
+		if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl,
+						  &fdb->rhnode,
+						  br_fdb_rht_params)) {
+			kmem_cache_free(br_fdb_cache, fdb);
+			fdb = NULL;
+		} else {
+			hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list);
+		}
 	}
 	return fdb;
 }
@@ -506,7 +495,6 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		  const unsigned char *addr, u16 vid)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
 
 	if (!is_valid_ether_addr(addr))
@@ -524,7 +512,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		fdb_delete(br, fdb);
 	}
 
-	fdb = fdb_create(head, source, addr, vid, 1, 1);
+	fdb = fdb_create(br, source, addr, vid, 1, 1);
 	if (!fdb)
 		return -ENOMEM;
 
@@ -548,7 +536,6 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 		   const unsigned char *addr, u16 vid, bool added_by_user)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
 	bool fdb_modified = false;
 
@@ -561,7 +548,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 	      source->state == BR_STATE_FORWARDING))
 		return;
 
-	fdb = fdb_find_rcu(head, addr, vid);
+	fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
 	if (likely(fdb)) {
 		/* attempt to update an entry for a local interface */
 		if (unlikely(fdb->is_local)) {
@@ -590,14 +577,13 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 		}
 	} else {
 		spin_lock(&br->hash_lock);
-		if (likely(!fdb_find_rcu(head, addr, vid))) {
-			fdb = fdb_create(head, source, addr, vid, 0, 0);
-			if (fdb) {
-				if (unlikely(added_by_user))
-					fdb->added_by_user = 1;
-				trace_br_fdb_update(br, source, addr, vid, added_by_user);
-				fdb_notify(br, fdb, RTM_NEWNEIGH);
-			}
+		fdb = fdb_create(br, source, addr, vid, 0, 0);
+		if (fdb) {
+			if (unlikely(added_by_user))
+				fdb->added_by_user = 1;
+			trace_br_fdb_update(br, source, addr, vid,
+					    added_by_user);
+			fdb_notify(br, fdb, RTM_NEWNEIGH);
 		}
 		/* else  we lose race and someone else inserts
 		 * it first, don't bother updating
@@ -646,7 +632,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
 	if (fdb->added_by_external_learn)
 		ndm->ndm_flags |= NTF_EXT_LEARNED;
 
-	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr))
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
 		goto nla_put_failure;
 	if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex))
 		goto nla_put_failure;
@@ -657,7 +643,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
 	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
 		goto nla_put_failure;
 
-	if (fdb->vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id))
+	if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16),
+					&fdb->key.vlan_id))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
@@ -711,54 +698,48 @@ int br_fdb_dump(struct sk_buff *skb,
 		int *idx)
 {
 	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_fdb_entry *f;
 	int err = 0;
-	int i;
 
 	if (!(dev->priv_flags & IFF_EBRIDGE))
-		goto out;
+		return err;
 
 	if (!filter_dev) {
 		err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
 		if (err < 0)
-			goto out;
+			return err;
 	}
 
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		struct net_bridge_fdb_entry *f;
-
-		hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
-
-			if (*idx < cb->args[2])
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+		if (*idx < cb->args[2])
+			goto skip;
+		if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) {
+			if (filter_dev != dev)
 				goto skip;
-
-			if (filter_dev &&
-			    (!f->dst || f->dst->dev != filter_dev)) {
-				if (filter_dev != dev)
-					goto skip;
-				/* !f->dst is a special case for bridge
-				 * It means the MAC belongs to the bridge
-				 * Therefore need a little more filtering
-				 * we only want to dump the !f->dst case
-				 */
-				if (f->dst)
-					goto skip;
-			}
-			if (!filter_dev && f->dst)
+			/* !f->dst is a special case for bridge
+			 * It means the MAC belongs to the bridge
+			 * Therefore need a little more filtering
+			 * we only want to dump the !f->dst case
+			 */
+			if (f->dst)
 				goto skip;
-
-			err = fdb_fill_info(skb, br, f,
-					    NETLINK_CB(cb->skb).portid,
-					    cb->nlh->nlmsg_seq,
-					    RTM_NEWNEIGH,
-					    NLM_F_MULTI);
-			if (err < 0)
-				goto out;
-skip:
-			*idx += 1;
 		}
+		if (!filter_dev && f->dst)
+			goto skip;
+
+		err = fdb_fill_info(skb, br, f,
+				    NETLINK_CB(cb->skb).portid,
+				    cb->nlh->nlmsg_seq,
+				    RTM_NEWNEIGH,
+				    NLM_F_MULTI);
+		if (err < 0)
+			break;
+skip:
+		*idx += 1;
 	}
+	rcu_read_unlock();
 
-out:
 	return err;
 }
 
@@ -766,7 +747,6 @@ out:
 static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 			 const __u8 *addr, __u16 state, __u16 flags, __u16 vid)
 {
-	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
 	bool modified = false;
 
@@ -787,7 +767,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 		if (!(flags & NLM_F_CREATE))
 			return -ENOENT;
 
-		fdb = fdb_create(head, source, addr, vid, 0, 0);
+		fdb = fdb_create(br, source, addr, vid, 0, 0);
 		if (!fdb)
 			return -ENOMEM;
 
@@ -1012,65 +992,60 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 
 int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p)
 {
-	struct net_bridge_fdb_entry *fdb, *tmp;
-	int i;
+	struct net_bridge_fdb_entry *f, *tmp;
 	int err;
 
 	ASSERT_RTNL();
 
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		hlist_for_each_entry(fdb, &br->hash[i], hlist) {
-			/* We only care for static entries */
-			if (!fdb->is_static)
-				continue;
-
-			err = dev_uc_add(p->dev, fdb->addr.addr);
-			if (err)
-				goto rollback;
-		}
+	/* the key here is that static entries change only under rtnl */
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+		/* We only care for static entries */
+		if (!f->is_static)
+			continue;
+		err = dev_uc_add(p->dev, f->key.addr.addr);
+		if (err)
+			goto rollback;
 	}
-	return 0;
+done:
+	rcu_read_unlock();
 
-rollback:
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		hlist_for_each_entry(tmp, &br->hash[i], hlist) {
-			/* If we reached the fdb that failed, we can stop */
-			if (tmp == fdb)
-				break;
-
-			/* We only care for static entries */
-			if (!tmp->is_static)
-				continue;
+	return err;
 
-			dev_uc_del(p->dev, tmp->addr.addr);
-		}
+rollback:
+	hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) {
+		/* We only care for static entries */
+		if (!tmp->is_static)
+			continue;
+		if (tmp == f)
+			break;
+		dev_uc_del(p->dev, tmp->key.addr.addr);
 	}
-	return err;
+
+	goto done;
 }
 
 void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
 {
-	struct net_bridge_fdb_entry *fdb;
-	int i;
+	struct net_bridge_fdb_entry *f;
 
 	ASSERT_RTNL();
 
-	for (i = 0; i < BR_HASH_SIZE; i++) {
-		hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) {
-			/* We only care for static entries */
-			if (!fdb->is_static)
-				continue;
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+		/* We only care for static entries */
+		if (!f->is_static)
+			continue;
 
-			dev_uc_del(p->dev, fdb->addr.addr);
-		}
+		dev_uc_del(p->dev, f->key.addr.addr);
 	}
+	rcu_read_unlock();
 }
 
 int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 			      const unsigned char *addr, u16 vid)
 {
 	struct net_bridge_fdb_entry *fdb;
-	struct hlist_head *head;
 	bool modified = false;
 	int err = 0;
 
@@ -1078,10 +1053,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 
 	spin_lock_bh(&br->hash_lock);
 
-	head = &br->hash[br_mac_hash(addr, vid)];
 	fdb = br_fdb_find(br, addr, vid);
 	if (!fdb) {
-		fdb = fdb_create(head, p, addr, vid, 0, 0);
+		fdb = fdb_create(br, p, addr, vid, 0, 0);
 		if (!fdb) {
 			err = -ENOMEM;
 			goto err_unlock;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index b0f4c734900b..6d9f48bd374a 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -760,9 +760,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 void br_mdb_init(void)
 {
-	rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0);
-	rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0);
-	rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0);
+	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0);
 }
 
 void br_mdb_uninit(void)
diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
index 20cbb727df4d..8e2d7cfa4e16 100644
--- a/net/bridge/br_nf_core.c
+++ b/net/bridge/br_nf_core.c
@@ -78,7 +78,6 @@ void br_netfilter_rtable_init(struct net_bridge *br)
 
 	atomic_set(&rt->dst.__refcnt, 1);
 	rt->dst.dev = br->dev;
-	rt->dst.path = &rt->dst;
 	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
 	rt->dst.flags	= DST_NOXFRM | DST_FAKE_RTABLE;
 	rt->dst.ops = &fake_dst_ops;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1312b8d20ec3..80559fd11b7e 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -168,12 +168,17 @@ struct net_bridge_vlan_group {
 	u16				pvid;
 };
 
+struct net_bridge_fdb_key {
+	mac_addr addr;
+	u16 vlan_id;
+};
+
 struct net_bridge_fdb_entry {
-	struct hlist_node		hlist;
+	struct rhash_head		rhnode;
 	struct net_bridge_port		*dst;
 
-	mac_addr			addr;
-	__u16				vlan_id;
+	struct net_bridge_fdb_key	key;
+	struct hlist_node		fdb_node;
 	unsigned char			is_local:1,
 					is_static:1,
 					added_by_user:1,
@@ -315,7 +320,7 @@ struct net_bridge {
 	struct net_bridge_vlan_group	__rcu *vlgrp;
 #endif
 
-	struct hlist_head		hash[BR_HASH_SIZE];
+	struct rhashtable		fdb_hash_tbl;
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	union {
 		struct rtable		fake_rtable;
@@ -405,6 +410,7 @@ struct net_bridge {
 	int offload_fwd_mark;
 #endif
 	bool				neigh_suppress_enabled;
+	struct hlist_head		fdb_list;
 };
 
 struct br_input_skb_cb {
@@ -515,6 +521,8 @@ static inline void br_netpoll_disable(struct net_bridge_port *p)
 /* br_fdb.c */
 int br_fdb_init(void);
 void br_fdb_fini(void);
+int br_fdb_hash_init(struct net_bridge *br);
+void br_fdb_hash_fini(struct net_bridge *br);
 void br_fdb_flush(struct net_bridge *br);
 void br_fdb_find_delete_local(struct net_bridge *br,
 			      const struct net_bridge_port *p,
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 9700e0f3307b..ee775f4ff76c 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -121,13 +121,13 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
 
 	switch (type) {
 	case RTM_DELNEIGH:
-		br_switchdev_fdb_call_notifiers(false, fdb->addr.addr,
-						fdb->vlan_id,
+		br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
+						fdb->key.vlan_id,
 						fdb->dst->dev);
 		break;
 	case RTM_NEWNEIGH:
-		br_switchdev_fdb_call_notifiers(true, fdb->addr.addr,
-						fdb->vlan_id,
+		br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
+						fdb->key.vlan_id,
 						fdb->dst->dev);
 		break;
 	}
diff --git a/net/can/gw.c b/net/can/gw.c
index 73a02af4b5d7..398dd0395ad9 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -1014,6 +1014,8 @@ static struct pernet_operations cangw_pernet_ops = {
 
 static __init int cgw_module_init(void)
 {
+	int ret;
+
 	/* sanitize given module parameter */
 	max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS);
 
@@ -1031,15 +1033,19 @@ static __init int cgw_module_init(void)
 	notifier.notifier_call = cgw_notifier;
 	register_netdevice_notifier(&notifier);
 
-	if (__rtnl_register(PF_CAN, RTM_GETROUTE, NULL, cgw_dump_jobs, 0)) {
+	ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_GETROUTE,
+				   NULL, cgw_dump_jobs, 0);
+	if (ret) {
 		unregister_netdevice_notifier(&notifier);
 		kmem_cache_destroy(cgw_cache);
 		return -ENOBUFS;
 	}
 
-	/* Only the first call to __rtnl_register can fail */
-	__rtnl_register(PF_CAN, RTM_NEWROUTE, cgw_create_job, NULL, 0);
-	__rtnl_register(PF_CAN, RTM_DELROUTE, cgw_remove_job, NULL, 0);
+	/* Only the first call to rtnl_register_module can fail */
+	rtnl_register_module(THIS_MODULE, PF_CAN, RTM_NEWROUTE,
+			     cgw_create_job, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_CAN, RTM_DELROUTE,
+			     cgw_remove_job, NULL, 0);
 
 	return 0;
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index f47e96b62308..b0eee49a2489 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2803,7 +2803,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 
 	segs = skb_mac_gso_segment(skb, features);
 
-	if (unlikely(skb_needs_check(skb, tx_path)))
+	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 		skb_warn_bad_offload(skb);
 
 	return segs;
@@ -3162,6 +3162,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	int rc;
 
 	qdisc_calculate_pkt_len(skb, q);
+
+	if (q->flags & TCQ_F_NOLOCK) {
+		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+			__qdisc_drop(skb, &to_free);
+			rc = NET_XMIT_DROP;
+		} else {
+			rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+			__qdisc_run(q);
+		}
+
+		if (unlikely(to_free))
+			kfree_skb_list(to_free);
+		return rc;
+	}
+
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
 	 * separate lock before trying to get qdisc main lock.
@@ -3192,9 +3207,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				contended = false;
 			}
 			__qdisc_run(q);
-		} else
-			qdisc_run_end(q);
+		}
 
+		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
@@ -3204,6 +3219,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				contended = false;
 			}
 			__qdisc_run(q);
+			qdisc_run_end(q);
 		}
 	}
 	spin_unlock(root_lock);
@@ -4143,19 +4159,22 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 
 		while (head) {
 			struct Qdisc *q = head;
-			spinlock_t *root_lock;
+			spinlock_t *root_lock = NULL;
 
 			head = head->next_sched;
 
-			root_lock = qdisc_lock(q);
-			spin_lock(root_lock);
+			if (!(q->flags & TCQ_F_NOLOCK)) {
+				root_lock = qdisc_lock(q);
+				spin_lock(root_lock);
+			}
 			/* We need to make sure head->next_sched is read
 			 * before clearing __QDISC_STATE_SCHED
 			 */
 			smp_mb__before_atomic();
 			clear_bit(__QDISC_STATE_SCHED, &q->state);
 			qdisc_run(q);
-			spin_unlock(root_lock);
+			if (root_lock)
+				spin_unlock(root_lock);
 		}
 	}
 }
@@ -7073,17 +7092,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
-u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id)
+void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+		     struct netdev_bpf *xdp)
 {
-	struct netdev_bpf xdp;
-
-	memset(&xdp, 0, sizeof(xdp));
-	xdp.command = XDP_QUERY_PROG;
+	memset(xdp, 0, sizeof(*xdp));
+	xdp->command = XDP_QUERY_PROG;
 
 	/* Query must always succeed. */
-	WARN_ON(bpf_op(dev, &xdp) < 0);
-	if (prog_id)
-		*prog_id = xdp.prog_id;
+	WARN_ON(bpf_op(dev, xdp) < 0);
+}
+
+static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
+{
+	struct netdev_bpf xdp;
+
+	__dev_xdp_query(dev, bpf_op, &xdp);
 
 	return xdp.prog_attached;
 }
@@ -7106,6 +7129,27 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
 	return bpf_op(dev, &xdp);
 }
 
+static void dev_xdp_uninstall(struct net_device *dev)
+{
+	struct netdev_bpf xdp;
+	bpf_op_t ndo_bpf;
+
+	/* Remove generic XDP */
+	WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
+
+	/* Remove from the driver */
+	ndo_bpf = dev->netdev_ops->ndo_bpf;
+	if (!ndo_bpf)
+		return;
+
+	__dev_xdp_query(dev, ndo_bpf, &xdp);
+	if (xdp.prog_attached == XDP_ATTACHED_NONE)
+		return;
+
+	/* Program removal should always succeed */
+	WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
+}
+
 /**
  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
  *	@dev: device
@@ -7134,10 +7178,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		bpf_chk = generic_xdp_install;
 
 	if (fd >= 0) {
-		if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL))
+		if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
 			return -EEXIST;
 		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
-		    __dev_xdp_attached(dev, bpf_op, NULL))
+		    __dev_xdp_attached(dev, bpf_op))
 			return -EBUSY;
 
 		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
@@ -7236,6 +7280,7 @@ static void rollback_registered_many(struct list_head *head)
 		/* Shutdown queueing discipline. */
 		dev_shutdown(dev);
 
+		dev_xdp_uninstall(dev);
 
 		/* Notify protocols, that we are about to destroy
 		 * this device. They should clean all the things.
@@ -8195,7 +8240,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
 void free_netdev(struct net_device *dev)
 {
 	struct napi_struct *p, *n;
-	struct bpf_prog *prog;
 
 	might_sleep();
 	netif_free_tx_queues(dev);
@@ -8214,12 +8258,6 @@ void free_netdev(struct net_device *dev)
 	free_percpu(dev->pcpu_refcnt);
 	dev->pcpu_refcnt = NULL;
 
-	prog = rcu_dereference_protected(dev->xdp_prog, 1);
-	if (prog) {
-		bpf_prog_put(prog);
-		static_key_slow_dec(&generic_xdp_needed);
-	}
-
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		netdev_freemem(dev);
diff --git a/net/core/dst.c b/net/core/dst.c
index 662a2d4a3d19..007aa0b08291 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -21,6 +21,7 @@
 #include <linux/sched.h>
 #include <linux/prefetch.h>
 #include <net/lwtunnel.h>
+#include <net/xfrm.h>
 
 #include <net/dst.h>
 #include <net/dst_metadata.h>
@@ -62,15 +63,12 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	      struct net_device *dev, int initial_ref, int initial_obsolete,
 	      unsigned short flags)
 {
-	dst->child = NULL;
 	dst->dev = dev;
 	if (dev)
 		dev_hold(dev);
 	dst->ops = ops;
 	dst_init_metrics(dst, dst_default_metrics.metrics, true);
 	dst->expires = 0UL;
-	dst->path = dst;
-	dst->from = NULL;
 #ifdef CONFIG_XFRM
 	dst->xfrm = NULL;
 #endif
@@ -88,7 +86,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst->__use = 0;
 	dst->lastuse = jiffies;
 	dst->flags = flags;
-	dst->next = NULL;
 	if (!(flags & DST_NOCOUNT))
 		dst_entries_add(ops, 1);
 }
@@ -116,12 +113,17 @@ EXPORT_SYMBOL(dst_alloc);
 
 struct dst_entry *dst_destroy(struct dst_entry * dst)
 {
-	struct dst_entry *child;
+	struct dst_entry *child = NULL;
 
 	smp_rmb();
 
-	child = dst->child;
+#ifdef CONFIG_XFRM
+	if (dst->xfrm) {
+		struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
 
+		child = xdst->child;
+	}
+#endif
 	if (!(dst->flags & DST_NOCOUNT))
 		dst_entries_add(dst->ops, -1);
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 6a85e67fafce..754abe1041b7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3013,6 +3013,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
 	if (flags & BPF_F_DONT_FRAGMENT)
 		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+	if (flags & BPF_F_ZERO_CSUM_TX)
+		info->key.tun_flags &= ~TUNNEL_CSUM;
 
 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
 	info->key.tos = from->tunnel_tos;
@@ -3026,8 +3028,6 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 				  IPV6_FLOWLABEL_MASK;
 	} else {
 		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
-		if (flags & BPF_F_ZERO_CSUM_TX)
-			info->key.tun_flags &= ~TUNNEL_CSUM;
 	}
 
 	return 0;
@@ -4437,6 +4437,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_num));
 		break;
+
+	case offsetof(struct bpf_sock_ops, is_fullsock):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern,
+						is_fullsock),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern,
+					       is_fullsock));
+		break;
+
+/* Helper macro for adding read access to tcp_sock fields. */
+#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
+	do {								      \
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern,     \
+						is_fullsock),		      \
+				      si->dst_reg, si->src_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       is_fullsock));		      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern, sk),\
+				      si->dst_reg, si->src_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern, sk));\
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
+				      offsetof(struct tcp_sock, FIELD_NAME)); \
+	} while (0)
+
+	case offsetof(struct bpf_sock_ops, snd_cwnd):
+		SOCK_OPS_GET_TCP32(snd_cwnd);
+		break;
+
+	case offsetof(struct bpf_sock_ops, srtt_us):
+		SOCK_OPS_GET_TCP32(srtt_us);
+		break;
 	}
 	return insn - insn_buf;
 }
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 15ce30063765..cc75488d3653 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -133,10 +133,10 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
 	ctrl->addr_type = type;
 }
 
-static void
-__skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
-			       struct flow_dissector *flow_dissector,
-			       void *target_container)
+void
+skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
+			     struct flow_dissector *flow_dissector,
+			     void *target_container)
 {
 	struct ip_tunnel_info *info;
 	struct ip_tunnel_key *key;
@@ -212,6 +212,7 @@ __skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 		tp->dst = key->tp_dst;
 	}
 }
+EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
 
 static enum flow_dissect_ret
 __skb_flow_dissect_mpls(const struct sk_buff *skb,
@@ -576,9 +577,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 					      FLOW_DISSECTOR_KEY_BASIC,
 					      target_container);
 
-	__skb_flow_dissect_tunnel_info(skb, flow_dissector,
-				       target_container);
-
 	if (dissector_uses_key(flow_dissector,
 			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
 		struct ethhdr *eth = eth_hdr(skb);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 87f28557b329..b2b2323bdc84 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -252,10 +252,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
 	}
 }
 
-static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
-				    const struct gnet_stats_queue __percpu *cpu,
-				    const struct gnet_stats_queue *q,
-				    __u32 qlen)
+void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+			     const struct gnet_stats_queue __percpu *cpu,
+			     const struct gnet_stats_queue *q,
+			     __u32 qlen)
 {
 	if (cpu) {
 		__gnet_stats_copy_queue_cpu(qstats, cpu);
@@ -269,6 +269,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
 
 	qstats->qlen = qlen;
 }
+EXPORT_SYMBOL(__gnet_stats_copy_queue);
 
 /**
  * gnet_stats_copy_queue - copy queue statistics into statistics TLV
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f95a15086225..b9ce241cd28c 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -399,7 +399,7 @@ struct pktgen_dev {
 	__u8	ipsmode;		/* IPSEC mode (config) */
 	__u8	ipsproto;		/* IPSEC type (config) */
 	__u32	spi;
-	struct dst_entry dst;
+	struct xfrm_dst xdst;
 	struct dst_ops dstops;
 #endif
 	char result[512];
@@ -2609,7 +2609,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
 	 * supports both transport/tunnel mode + ESP/AH type.
 	 */
 	if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0))
-		skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF;
+		skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF;
 
 	rcu_read_lock_bh();
 	err = x->outer_mode->output(x, skb);
@@ -3742,10 +3742,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 	 * performance under such circumstance.
 	 */
 	pkt_dev->dstops.family = AF_INET;
-	pkt_dev->dst.dev = pkt_dev->odev;
-	dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false);
-	pkt_dev->dst.child = &pkt_dev->dst;
-	pkt_dev->dst.ops = &pkt_dev->dstops;
+	pkt_dev->xdst.u.dst.dev = pkt_dev->odev;
+	dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false);
+	pkt_dev->xdst.child = &pkt_dev->xdst.u.dst;
+	pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops;
 #endif
 
 	return add_dev_to_thread(t, pkt_dev);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a91fc8..c688dc564b11 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -62,7 +62,9 @@
 struct rtnl_link {
 	rtnl_doit_func		doit;
 	rtnl_dumpit_func	dumpit;
+	struct module		*owner;
 	unsigned int		flags;
+	struct rcu_head		rcu;
 };
 
 static DEFINE_MUTEX(rtnl_mutex);
@@ -127,8 +129,7 @@ bool lockdep_rtnl_is_held(void)
 EXPORT_SYMBOL(lockdep_rtnl_is_held);
 #endif /* #ifdef CONFIG_PROVE_LOCKING */
 
-static struct rtnl_link __rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
-static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1];
+static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
 
 static inline int rtm_msgindex(int msgtype)
 {
@@ -144,72 +145,127 @@ static inline int rtm_msgindex(int msgtype)
 	return msgindex;
 }
 
-/**
- * __rtnl_register - Register a rtnetlink message type
- * @protocol: Protocol family or PF_UNSPEC
- * @msgtype: rtnetlink message type
- * @doit: Function pointer called for each request message
- * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
- *
- * Registers the specified function pointers (at least one of them has
- * to be non-NULL) to be called whenever a request message for the
- * specified protocol family and message type is received.
- *
- * The special protocol family PF_UNSPEC may be used to define fallback
- * function pointers for the case when no entry for the specific protocol
- * family exists.
- *
- * Returns 0 on success or a negative error code.
- */
-int __rtnl_register(int protocol, int msgtype,
-		    rtnl_doit_func doit, rtnl_dumpit_func dumpit,
-		    unsigned int flags)
+static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
+{
+	struct rtnl_link **tab;
+
+	if (protocol >= ARRAY_SIZE(rtnl_msg_handlers))
+		protocol = PF_UNSPEC;
+
+	tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]);
+	if (!tab)
+		tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]);
+
+	return tab[msgtype];
+}
+
+static int rtnl_register_internal(struct module *owner,
+				  int protocol, int msgtype,
+				  rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+				  unsigned int flags)
 {
-	struct rtnl_link *tab;
+	struct rtnl_link *link, *old;
+	struct rtnl_link __rcu **tab;
 	int msgindex;
+	int ret = -ENOBUFS;
 
 	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 	msgindex = rtm_msgindex(msgtype);
 
-	tab = rcu_dereference_raw(rtnl_msg_handlers[protocol]);
+	rtnl_lock();
+	tab = rtnl_msg_handlers[protocol];
 	if (tab == NULL) {
-		tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL);
-		if (tab == NULL)
-			return -ENOBUFS;
+		tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
+		if (!tab)
+			goto unlock;
 
+		/* ensures we see the 0 stores */
 		rcu_assign_pointer(rtnl_msg_handlers[protocol], tab);
 	}
 
+	old = rtnl_dereference(tab[msgindex]);
+	if (old) {
+		link = kmemdup(old, sizeof(*old), GFP_KERNEL);
+		if (!link)
+			goto unlock;
+	} else {
+		link = kzalloc(sizeof(*link), GFP_KERNEL);
+		if (!link)
+			goto unlock;
+	}
+
+	WARN_ON(link->owner && link->owner != owner);
+	link->owner = owner;
+
+	WARN_ON(doit && link->doit && link->doit != doit);
 	if (doit)
-		tab[msgindex].doit = doit;
+		link->doit = doit;
+	WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit);
 	if (dumpit)
-		tab[msgindex].dumpit = dumpit;
-	tab[msgindex].flags |= flags;
+		link->dumpit = dumpit;
 
-	return 0;
+	link->flags |= flags;
+
+	/* publish protocol:msgtype */
+	rcu_assign_pointer(tab[msgindex], link);
+	ret = 0;
+	if (old)
+		kfree_rcu(old, rcu);
+unlock:
+	rtnl_unlock();
+	return ret;
+}
+
+/**
+ * rtnl_register_module - Register a rtnetlink message type
+ *
+ * @owner: module registering the hook (THIS_MODULE)
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
+ *
+ * Like rtnl_register, but for use by removable modules.
+ */
+int rtnl_register_module(struct module *owner,
+			 int protocol, int msgtype,
+			 rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+			 unsigned int flags)
+{
+	return rtnl_register_internal(owner, protocol, msgtype,
+				      doit, dumpit, flags);
 }
-EXPORT_SYMBOL_GPL(__rtnl_register);
+EXPORT_SYMBOL_GPL(rtnl_register_module);
 
 /**
  * rtnl_register - Register a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
  *
- * Identical to __rtnl_register() but panics on failure. This is useful
- * as failure of this function is very unlikely, it can only happen due
- * to lack of memory when allocating the chain to store all message
- * handlers for a protocol. Meant for use in init functions where lack
- * of memory implies no sense in continuing.
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
  */
 void rtnl_register(int protocol, int msgtype,
 		   rtnl_doit_func doit, rtnl_dumpit_func dumpit,
 		   unsigned int flags)
 {
-	if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0)
-		panic("Unable to register rtnetlink message handler, "
-		      "protocol = %d, message type = %d\n",
-		      protocol, msgtype);
+	int err;
+
+	err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit,
+				     flags);
+	if (err)
+		pr_err("Unable to register rtnetlink message handler, "
+		       "protocol = %d, message type = %d\n", protocol, msgtype);
 }
-EXPORT_SYMBOL_GPL(rtnl_register);
 
 /**
  * rtnl_unregister - Unregister a rtnetlink message type
@@ -220,24 +276,25 @@ EXPORT_SYMBOL_GPL(rtnl_register);
  */
 int rtnl_unregister(int protocol, int msgtype)
 {
-	struct rtnl_link *handlers;
+	struct rtnl_link **tab, *link;
 	int msgindex;
 
 	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 	msgindex = rtm_msgindex(msgtype);
 
 	rtnl_lock();
-	handlers = rtnl_dereference(rtnl_msg_handlers[protocol]);
-	if (!handlers) {
+	tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
+	if (!tab) {
 		rtnl_unlock();
 		return -ENOENT;
 	}
 
-	handlers[msgindex].doit = NULL;
-	handlers[msgindex].dumpit = NULL;
-	handlers[msgindex].flags = 0;
+	link = tab[msgindex];
+	rcu_assign_pointer(tab[msgindex], NULL);
 	rtnl_unlock();
 
+	kfree_rcu(link, rcu);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(rtnl_unregister);
@@ -251,20 +308,27 @@ EXPORT_SYMBOL_GPL(rtnl_unregister);
  */
 void rtnl_unregister_all(int protocol)
 {
-	struct rtnl_link *handlers;
+	struct rtnl_link **tab, *link;
+	int msgindex;
 
 	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 
 	rtnl_lock();
-	handlers = rtnl_dereference(rtnl_msg_handlers[protocol]);
+	tab = rtnl_msg_handlers[protocol];
 	RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL);
+	for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
+		link = tab[msgindex];
+		if (!link)
+			continue;
+
+		rcu_assign_pointer(tab[msgindex], NULL);
+		kfree_rcu(link, rcu);
+	}
 	rtnl_unlock();
 
 	synchronize_net();
 
-	while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1)
-		schedule();
-	kfree(handlers);
+	kfree(tab);
 }
 EXPORT_SYMBOL_GPL(rtnl_unregister_all);
 
@@ -1261,6 +1325,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	const struct bpf_prog *generic_xdp_prog;
+	struct netdev_bpf xdp;
 
 	ASSERT_RTNL();
 
@@ -1273,7 +1338,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
 	if (!ops->ndo_bpf)
 		return XDP_ATTACHED_NONE;
 
-	return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id);
+	__dev_xdp_query(dev, ops->ndo_bpf, &xdp);
+	*prog_id = xdp.prog_id;
+
+	return xdp.prog_attached;
 }
 
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1569,6 +1637,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PROMISCUITY]	= { .type = NLA_U32 },
 	[IFLA_NUM_TX_QUEUES]	= { .type = NLA_U32 },
 	[IFLA_NUM_RX_QUEUES]	= { .type = NLA_U32 },
+	[IFLA_GSO_MAX_SEGS]	= { .type = NLA_U32 },
+	[IFLA_GSO_MAX_SIZE]	= { .type = NLA_U32 },
 	[IFLA_PHYS_PORT_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
 	[IFLA_CARRIER_CHANGES]	= { .type = NLA_U32 },  /* ignored */
 	[IFLA_PHYS_SWITCH_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
@@ -2219,6 +2289,34 @@ static int do_setlink(const struct sk_buff *skb,
 		}
 	}
 
+	if (tb[IFLA_GSO_MAX_SIZE]) {
+		u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);
+
+		if (max_size > GSO_MAX_SIZE) {
+			err = -EINVAL;
+			goto errout;
+		}
+
+		if (dev->gso_max_size ^ max_size) {
+			netif_set_gso_max_size(dev, max_size);
+			status |= DO_SETLINK_MODIFIED;
+		}
+	}
+
+	if (tb[IFLA_GSO_MAX_SEGS]) {
+		u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+
+		if (max_segs > GSO_MAX_SEGS) {
+			err = -EINVAL;
+			goto errout;
+		}
+
+		if (dev->gso_max_segs ^ max_segs) {
+			dev->gso_max_segs = max_segs;
+			status |= DO_SETLINK_MODIFIED;
+		}
+	}
+
 	if (tb[IFLA_OPERSTATE])
 		set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
 
@@ -2583,6 +2681,10 @@ struct net_device *rtnl_create_link(struct net *net,
 		dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
 	if (tb[IFLA_GROUP])
 		dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+	if (tb[IFLA_GSO_MAX_SIZE])
+		netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
+	if (tb[IFLA_GSO_MAX_SEGS])
+		dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
 
 	return dev;
 }
@@ -2973,18 +3075,26 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 		s_idx = 1;
 
 	for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
+		struct rtnl_link **tab;
 		int type = cb->nlh->nlmsg_type-RTM_BASE;
-		struct rtnl_link *handlers;
+		struct rtnl_link *link;
 		rtnl_dumpit_func dumpit;
 
 		if (idx < s_idx || idx == PF_PACKET)
 			continue;
 
-		handlers = rtnl_dereference(rtnl_msg_handlers[idx]);
-		if (!handlers)
+		if (type < 0 || type >= RTM_NR_MSGTYPES)
 			continue;
 
-		dumpit = READ_ONCE(handlers[type].dumpit);
+		tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]);
+		if (!tab)
+			continue;
+
+		link = tab[type];
+		if (!link)
+			continue;
+
+		dumpit = link->dumpit;
 		if (!dumpit)
 			continue;
 
@@ -4314,7 +4424,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
-	struct rtnl_link *handlers;
+	struct rtnl_link *link;
+	struct module *owner;
 	int err = -EOPNOTSUPP;
 	rtnl_doit_func doit;
 	unsigned int flags;
@@ -4338,79 +4449,85 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
-	if (family >= ARRAY_SIZE(rtnl_msg_handlers))
-		family = PF_UNSPEC;
-
 	rcu_read_lock();
-	handlers = rcu_dereference(rtnl_msg_handlers[family]);
-	if (!handlers) {
-		family = PF_UNSPEC;
-		handlers = rcu_dereference(rtnl_msg_handlers[family]);
-	}
-
 	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
 		struct sock *rtnl;
 		rtnl_dumpit_func dumpit;
 		u16 min_dump_alloc = 0;
 
-		dumpit = READ_ONCE(handlers[type].dumpit);
-		if (!dumpit) {
+		link = rtnl_get_link(family, type);
+		if (!link || !link->dumpit) {
 			family = PF_UNSPEC;
-			handlers = rcu_dereference(rtnl_msg_handlers[PF_UNSPEC]);
-			if (!handlers)
-				goto err_unlock;
-
-			dumpit = READ_ONCE(handlers[type].dumpit);
-			if (!dumpit)
+			link = rtnl_get_link(family, type);
+			if (!link || !link->dumpit)
 				goto err_unlock;
 		}
-
-		refcount_inc(&rtnl_msg_handlers_ref[family]);
+		owner = link->owner;
+		dumpit = link->dumpit;
 
 		if (type == RTM_GETLINK - RTM_BASE)
 			min_dump_alloc = rtnl_calcit(skb, nlh);
 
+		err = 0;
+		/* need to do this before rcu_read_unlock() */
+		if (!try_module_get(owner))
+			err = -EPROTONOSUPPORT;
+
 		rcu_read_unlock();
 
 		rtnl = net->rtnl;
-		{
+		if (err == 0) {
 			struct netlink_dump_control c = {
 				.dump		= dumpit,
 				.min_dump_alloc	= min_dump_alloc,
+				.module		= owner,
 			};
 			err = netlink_dump_start(rtnl, skb, nlh, &c);
+			/* netlink_dump_start() will keep a reference on
+			 * module if dump is still in progress.
+			 */
+			module_put(owner);
 		}
-		refcount_dec(&rtnl_msg_handlers_ref[family]);
 		return err;
 	}
 
-	doit = READ_ONCE(handlers[type].doit);
-	if (!doit) {
+	link = rtnl_get_link(family, type);
+	if (!link || !link->doit) {
 		family = PF_UNSPEC;
-		handlers = rcu_dereference(rtnl_msg_handlers[family]);
+		link = rtnl_get_link(PF_UNSPEC, type);
+		if (!link || !link->doit)
+			goto out_unlock;
 	}
 
-	flags = READ_ONCE(handlers[type].flags);
+	owner = link->owner;
+	if (!try_module_get(owner)) {
+		err = -EPROTONOSUPPORT;
+		goto out_unlock;
+	}
+
+	flags = link->flags;
 	if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
-		refcount_inc(&rtnl_msg_handlers_ref[family]);
-		doit = READ_ONCE(handlers[type].doit);
+		doit = link->doit;
 		rcu_read_unlock();
 		if (doit)
 			err = doit(skb, nlh, extack);
-		refcount_dec(&rtnl_msg_handlers_ref[family]);
+		module_put(owner);
 		return err;
 	}
-
 	rcu_read_unlock();
 
 	rtnl_lock();
-	handlers = rtnl_dereference(rtnl_msg_handlers[family]);
-	if (handlers) {
-		doit = READ_ONCE(handlers[type].doit);
-		if (doit)
-			err = doit(skb, nlh, extack);
-	}
+	link = rtnl_get_link(family, type);
+	if (link && link->doit)
+		err = link->doit(skb, nlh, extack);
 	rtnl_unlock();
+
+	module_put(owner);
+
+	return err;
+
+out_unlock:
+	rcu_read_unlock();
 	return err;
 
 err_unlock:
@@ -4498,11 +4615,6 @@ static struct pernet_operations rtnetlink_net_ops = {
 
 void __init rtnetlink_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(rtnl_msg_handlers_ref); i++)
-		refcount_set(&rtnl_msg_handlers_ref[i], 1);
-
 	if (register_pernet_subsys(&rtnetlink_net_ops))
 		panic("rtnetlink_init: cannot initialize rtnetlink\n");
 
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 5eeb1d20cc38..c5bb52bc73a1 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -235,7 +235,9 @@ struct sock *reuseport_select_sock(struct sock *sk,
 
 		if (prog && skb)
 			sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
-		else
+
+		/* no bpf or invalid bpf result: fall back to hash usage */
+		if (!sk2)
 			sk2 = reuse->socks[reciprocal_scale(hash, socks)];
 	}
 
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 178bb9833311..37ccbe62eb1a 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -63,9 +63,10 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
 		 */
 		local_bh_disable();
 		inet_twsk_schedule(tw, timeo);
-		/* Linkage updates. */
-		__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
-		inet_twsk_put(tw);
+		/* Linkage updates.
+		 * Note that access to tw after this point is illegal.
+		 */
+		inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
 		local_bh_enable();
 	} else {
 		/* Sorry, if we're out of memory, just CLOSE this
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 9153247dad28..d1885cf59319 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -1418,9 +1418,12 @@ void __init dn_dev_init(void)
 
 	dn_dev_devices_on();
 
-	rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL, 0);
-	rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL, 0);
-	rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWADDR,
+			     dn_nl_newaddr, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELADDR,
+			     dn_nl_deladdr, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR,
+			     NULL, dn_nl_dump_ifaddr, 0);
 
 	proc_create("decnet_dev", S_IRUGO, init_net.proc_net, &dn_dev_seq_fops);
 
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index b37a1b833c77..fce94cbd4378 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -792,8 +792,10 @@ void __init dn_fib_init(void)
 
 	register_dnaddr_notifier(&dn_fib_dnaddr_notifier);
 
-	rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL, 0);
-	rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWROUTE,
+			     dn_fib_rtm_newroute, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE,
+			     dn_fib_rtm_delroute, NULL, 0);
 }
 
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 324cb9f2f551..73160d4aebbe 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -199,11 +199,11 @@ static void dn_dst_check_expire(struct timer_list *unused)
 						lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
 			if (atomic_read(&rt->dst.__refcnt) > 1 ||
 			    (now - rt->dst.lastuse) < expire) {
-				rtp = &rt->dst.dn_next;
+				rtp = &rt->dn_next;
 				continue;
 			}
-			*rtp = rt->dst.dn_next;
-			rt->dst.dn_next = NULL;
+			*rtp = rt->dn_next;
+			rt->dn_next = NULL;
 			dst_dev_put(&rt->dst);
 			dst_release(&rt->dst);
 		}
@@ -233,11 +233,11 @@ static int dn_dst_gc(struct dst_ops *ops)
 						lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
 			if (atomic_read(&rt->dst.__refcnt) > 1 ||
 			    (now - rt->dst.lastuse) < expire) {
-				rtp = &rt->dst.dn_next;
+				rtp = &rt->dn_next;
 				continue;
 			}
-			*rtp = rt->dst.dn_next;
-			rt->dst.dn_next = NULL;
+			*rtp = rt->dn_next;
+			rt->dn_next = NULL;
 			dst_dev_put(&rt->dst);
 			dst_release(&rt->dst);
 			break;
@@ -333,8 +333,8 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
 						lockdep_is_held(&dn_rt_hash_table[hash].lock))) != NULL) {
 		if (compare_keys(&rth->fld, &rt->fld)) {
 			/* Put it first */
-			*rthp = rth->dst.dn_next;
-			rcu_assign_pointer(rth->dst.dn_next,
+			*rthp = rth->dn_next;
+			rcu_assign_pointer(rth->dn_next,
 					   dn_rt_hash_table[hash].chain);
 			rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth);
 
@@ -345,10 +345,10 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
 			*rp = rth;
 			return 0;
 		}
-		rthp = &rth->dst.dn_next;
+		rthp = &rth->dn_next;
 	}
 
-	rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain);
+	rcu_assign_pointer(rt->dn_next, dn_rt_hash_table[hash].chain);
 	rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt);
 
 	dst_hold_and_use(&rt->dst, now);
@@ -369,8 +369,8 @@ static void dn_run_flush(struct timer_list *unused)
 			goto nothing_to_declare;
 
 		for(; rt; rt = next) {
-			next = rcu_dereference_raw(rt->dst.dn_next);
-			RCU_INIT_POINTER(rt->dst.dn_next, NULL);
+			next = rcu_dereference_raw(rt->dn_next);
+			RCU_INIT_POINTER(rt->dn_next, NULL);
 			dst_dev_put(&rt->dst);
 			dst_release(&rt->dst);
 		}
@@ -1183,6 +1183,7 @@ make_route:
 	if (rt == NULL)
 		goto e_nobufs;
 
+	rt->dn_next = NULL;
 	memset(&rt->fld, 0, sizeof(rt->fld));
 	rt->fld.saddr        = oldflp->saddr;
 	rt->fld.daddr        = oldflp->daddr;
@@ -1252,7 +1253,7 @@ static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn *
 	if (!(flags & MSG_TRYHARD)) {
 		rcu_read_lock_bh();
 		for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt;
-			rt = rcu_dereference_bh(rt->dst.dn_next)) {
+			rt = rcu_dereference_bh(rt->dn_next)) {
 			if ((flp->daddr == rt->fld.daddr) &&
 			    (flp->saddr == rt->fld.saddr) &&
 			    (flp->flowidn_mark == rt->fld.flowidn_mark) &&
@@ -1448,6 +1449,7 @@ make_route:
 	if (rt == NULL)
 		goto e_nobufs;
 
+	rt->dn_next = NULL;
 	memset(&rt->fld, 0, sizeof(rt->fld));
 	rt->rt_saddr      = fld.saddr;
 	rt->rt_daddr      = fld.daddr;
@@ -1529,7 +1531,7 @@ static int dn_route_input(struct sk_buff *skb)
 
 	rcu_read_lock();
 	for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL;
-	    rt = rcu_dereference(rt->dst.dn_next)) {
+	    rt = rcu_dereference(rt->dn_next)) {
 		if ((rt->fld.saddr == cb->src) &&
 		    (rt->fld.daddr == cb->dst) &&
 		    (rt->fld.flowidn_oif == 0) &&
@@ -1749,7 +1751,7 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		rcu_read_lock_bh();
 		for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0;
 			rt;
-			rt = rcu_dereference_bh(rt->dst.dn_next), idx++) {
+			rt = rcu_dereference_bh(rt->dn_next), idx++) {
 			if (idx < s_idx)
 				continue;
 			skb_dst_set(skb, dst_clone(&rt->dst));
@@ -1795,7 +1797,7 @@ static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_rou
 {
 	struct dn_rt_cache_iter_state *s = seq->private;
 
-	rt = rcu_dereference_bh(rt->dst.dn_next);
+	rt = rcu_dereference_bh(rt->dn_next);
 	while (!rt) {
 		rcu_read_unlock_bh();
 		if (--s->bucket < 0)
@@ -1921,11 +1923,11 @@ void __init dn_route_init(void)
 		    &dn_rt_cache_seq_fops);
 
 #ifdef CONFIG_DECNET_ROUTER
-	rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute,
-		      dn_fib_dump, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE,
+			     dn_cache_getroute, dn_fib_dump, 0);
 #else
-	rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute,
-		      dn_cache_dump, 0);
+	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE,
+			     dn_cache_getroute, dn_cache_dump, 0);
 #endif
 }
 
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 03c3bdf25468..bbf2c82cf7b2 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -16,6 +16,15 @@ config NET_DSA
 
 if NET_DSA
 
+config NET_DSA_LEGACY
+	bool "Support for older platform device and Device Tree registration"
+	default y
+	---help---
+	  Say Y if you want to enable support for the older platform device and
+	  deprecated Device Tree binding registration.
+
+	  This feature is scheduled for removal in 4.17.
+
 # tagging formats
 config NET_DSA_TAG_BRCM
 	bool
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 0e13c1f95d13..9e4d3536f977 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
-dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o
+dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o
+dsa_core-$(CONFIG_NET_DSA_LEGACY) += legacy.o
 
 # tagging formats
 dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 1e287420ff49..21f9bed11988 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -241,7 +241,7 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst)
 		for (port = 0; port < ds->num_ports; port++) {
 			dp = &ds->ports[port];
 
-			if (dsa_port_is_user(dp))
+			if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
 				dp->cpu_dp = dst->cpu_dp;
 		}
 	}
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 7d036696e8c4..b03665e8fb4e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -97,8 +97,17 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
 bool dsa_schedule_work(struct work_struct *work);
 
 /* legacy.c */
+#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
 int dsa_legacy_register(void);
 void dsa_legacy_unregister(void);
+#else
+static inline int dsa_legacy_register(void)
+{
+	return -ENODEV;
+}
+
+static inline void dsa_legacy_unregister(void) { }
+#endif
 int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 		       struct net_device *dev,
 		       const unsigned char *addr, u16 vid,
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index 84611d7fcfa2..aa56d3fb5da4 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -718,26 +718,6 @@ static int dsa_resume(struct device *d)
 }
 #endif
 
-/* legacy way, bypassing the bridge *****************************************/
-int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
-		       struct net_device *dev,
-		       const unsigned char *addr, u16 vid,
-		       u16 flags)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return dsa_port_fdb_add(dp, addr, vid);
-}
-
-int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
-		       struct net_device *dev,
-		       const unsigned char *addr, u16 vid)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return dsa_port_fdb_del(dp, addr, vid);
-}
-
 static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume);
 
 static const struct of_device_id dsa_of_match_table[] = {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a95a55f79137..f52307296de4 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -708,14 +708,12 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev,
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_mall_tc_entry *mall_tc_entry;
 	__be16 protocol = cls->common.protocol;
-	struct net *net = dev_net(dev);
 	struct dsa_switch *ds = dp->ds;
 	struct net_device *to_dev;
 	const struct tc_action *a;
 	struct dsa_port *to_dp;
 	int err = -EOPNOTSUPP;
 	LIST_HEAD(actions);
-	int ifindex;
 
 	if (!ds->ops->port_mirror_add)
 		return err;
@@ -729,8 +727,7 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev,
 	if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
 		struct dsa_mall_mirror_tc_entry *mirror;
 
-		ifindex = tcf_mirred_ifindex(a);
-		to_dev = __dev_get_by_index(net, ifindex);
+		to_dev = tcf_mirred_dev(a);
 		if (!to_dev)
 			return -EINVAL;
 
@@ -943,6 +940,26 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.set_rxnfc		= dsa_slave_set_rxnfc,
 };
 
+/* legacy way, bypassing the bridge *****************************************/
+int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+		       struct net_device *dev,
+		       const unsigned char *addr, u16 vid,
+		       u16 flags)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+
+	return dsa_port_fdb_add(dp, addr, vid);
+}
+
+int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+		       struct net_device *dev,
+		       const unsigned char *addr, u16 vid)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+
+	return dsa_port_fdb_del(dp, addr, vid);
+}
+
 static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_open	 	= dsa_slave_open,
 	.ndo_stop		= dsa_slave_close,
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 29608d087a7c..b93511726069 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -83,29 +83,52 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
 static int dsa_switch_fdb_add(struct dsa_switch *ds,
 			      struct dsa_notifier_fdb_info *info)
 {
-	/* Do not care yet about other switch chips of the fabric */
-	if (ds->index != info->sw_index)
-		return 0;
+	int port = dsa_towards_port(ds, info->sw_index, info->port);
 
 	if (!ds->ops->port_fdb_add)
 		return -EOPNOTSUPP;
 
-	return ds->ops->port_fdb_add(ds, info->port, info->addr,
-				     info->vid);
+	return ds->ops->port_fdb_add(ds, port, info->addr, info->vid);
 }
 
 static int dsa_switch_fdb_del(struct dsa_switch *ds,
 			      struct dsa_notifier_fdb_info *info)
 {
-	/* Do not care yet about other switch chips of the fabric */
-	if (ds->index != info->sw_index)
-		return 0;
+	int port = dsa_towards_port(ds, info->sw_index, info->port);
 
 	if (!ds->ops->port_fdb_del)
 		return -EOPNOTSUPP;
 
-	return ds->ops->port_fdb_del(ds, info->port, info->addr,
-				     info->vid);
+	return ds->ops->port_fdb_del(ds, port, info->addr, info->vid);
+}
+
+static int
+dsa_switch_mdb_prepare_bitmap(struct dsa_switch *ds,
+			      const struct switchdev_obj_port_mdb *mdb,
+			      const unsigned long *bitmap)
+{
+	int port, err;
+
+	if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
+		return -EOPNOTSUPP;
+
+	for_each_set_bit(port, bitmap, ds->num_ports) {
+		err = ds->ops->port_mdb_prepare(ds, port, mdb);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static void dsa_switch_mdb_add_bitmap(struct dsa_switch *ds,
+				      const struct switchdev_obj_port_mdb *mdb,
+				      const unsigned long *bitmap)
+{
+	int port;
+
+	for_each_set_bit(port, bitmap, ds->num_ports)
+		ds->ops->port_mdb_add(ds, port, mdb);
 }
 
 static int dsa_switch_mdb_add(struct dsa_switch *ds,
@@ -114,7 +137,7 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
 	const struct switchdev_obj_port_mdb *mdb = info->mdb;
 	struct switchdev_trans *trans = info->trans;
 	DECLARE_BITMAP(group, ds->num_ports);
-	int port, err;
+	int port;
 
 	/* Build a mask of Multicast group members */
 	bitmap_zero(group, ds->num_ports);
@@ -124,21 +147,10 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
 		if (dsa_is_dsa_port(ds, port))
 			set_bit(port, group);
 
-	if (switchdev_trans_ph_prepare(trans)) {
-		if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
-			return -EOPNOTSUPP;
-
-		for_each_set_bit(port, group, ds->num_ports) {
-			err = ds->ops->port_mdb_prepare(ds, port, mdb, trans);
-			if (err)
-				return err;
-		}
-
-		return 0;
-	}
+	if (switchdev_trans_ph_prepare(trans))
+		return dsa_switch_mdb_prepare_bitmap(ds, mdb, group);
 
-	for_each_set_bit(port, group, ds->num_ports)
-		ds->ops->port_mdb_add(ds, port, mdb, trans);
+	dsa_switch_mdb_add_bitmap(ds, mdb, group);
 
 	return 0;
 }
@@ -157,13 +169,43 @@ static int dsa_switch_mdb_del(struct dsa_switch *ds,
 	return 0;
 }
 
+static int
+dsa_switch_vlan_prepare_bitmap(struct dsa_switch *ds,
+			       const struct switchdev_obj_port_vlan *vlan,
+			       const unsigned long *bitmap)
+{
+	int port, err;
+
+	if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
+		return -EOPNOTSUPP;
+
+	for_each_set_bit(port, bitmap, ds->num_ports) {
+		err = ds->ops->port_vlan_prepare(ds, port, vlan);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static void
+dsa_switch_vlan_add_bitmap(struct dsa_switch *ds,
+			   const struct switchdev_obj_port_vlan *vlan,
+			   const unsigned long *bitmap)
+{
+	int port;
+
+	for_each_set_bit(port, bitmap, ds->num_ports)
+		ds->ops->port_vlan_add(ds, port, vlan);
+}
+
 static int dsa_switch_vlan_add(struct dsa_switch *ds,
 			       struct dsa_notifier_vlan_info *info)
 {
 	const struct switchdev_obj_port_vlan *vlan = info->vlan;
 	struct switchdev_trans *trans = info->trans;
 	DECLARE_BITMAP(members, ds->num_ports);
-	int port, err;
+	int port;
 
 	/* Build a mask of VLAN members */
 	bitmap_zero(members, ds->num_ports);
@@ -173,21 +215,10 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
 		if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
 			set_bit(port, members);
 
-	if (switchdev_trans_ph_prepare(trans)) {
-		if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
-			return -EOPNOTSUPP;
-
-		for_each_set_bit(port, members, ds->num_ports) {
-			err = ds->ops->port_vlan_prepare(ds, port, vlan, trans);
-			if (err)
-				return err;
-		}
-
-		return 0;
-	}
+	if (switchdev_trans_ph_prepare(trans))
+		return dsa_switch_vlan_prepare_bitmap(ds, vlan, members);
 
-	for_each_set_bit(port, members, ds->num_ports)
-		ds->ops->port_vlan_add(ds, port, vlan, trans);
+	dsa_switch_vlan_add_bitmap(ds, vlan, members);
 
 	return 0;
 }
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index 8475434af7d5..11535bc70743 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -13,10 +13,13 @@
  */
 
 #include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
 
 #include "dsa_priv.h"
 
 #define MTK_HDR_LEN		4
+#define MTK_HDR_XMIT_UNTAGGED		0
+#define MTK_HDR_XMIT_TAGGED_TPID_8100	1
 #define MTK_HDR_RECV_SOURCE_PORT_MASK	GENMASK(2, 0)
 #define MTK_HDR_XMIT_DP_BIT_MASK	GENMASK(5, 0)
 
@@ -25,20 +28,37 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 {
 	struct dsa_port *dp = dsa_slave_to_port(dev);
 	u8 *mtk_tag;
+	bool is_vlan_skb = true;
 
-	if (skb_cow_head(skb, MTK_HDR_LEN) < 0)
-		return NULL;
-
-	skb_push(skb, MTK_HDR_LEN);
+	/* Build the special tag after the MAC Source Address. If VLAN header
+	 * is present, it's required that VLAN header and special tag is
+	 * being combined. Only in this way we can allow the switch can parse
+	 * the both special and VLAN tag at the same time and then look up VLAN
+	 * table with VID.
+	 */
+	if (!skb_vlan_tagged(skb)) {
+		if (skb_cow_head(skb, MTK_HDR_LEN) < 0)
+			return NULL;
 
-	memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN);
+		skb_push(skb, MTK_HDR_LEN);
+		memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN);
+		is_vlan_skb = false;
+	}
 
-	/* Build the tag after the MAC Source Address */
 	mtk_tag = skb->data + 2 * ETH_ALEN;
-	mtk_tag[0] = 0;
+
+	/* Mark tag attribute on special tag insertion to notify hardware
+	 * whether that's a combined special tag with 802.1Q header.
+	 */
+	mtk_tag[0] = is_vlan_skb ? MTK_HDR_XMIT_TAGGED_TPID_8100 :
+		     MTK_HDR_XMIT_UNTAGGED;
 	mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK;
-	mtk_tag[2] = 0;
-	mtk_tag[3] = 0;
+
+	/* Tag control information is kept for 802.1Q */
+	if (!is_vlan_skb) {
+		mtk_tag[2] = 0;
+		mtk_tag[3] = 0;
+	}
 
 	return skb;
 }
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e7d15fb0d94d..f6f58108b4c5 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/vmalloc.h>
+#include <linux/bootmem.h>
 
 #include <net/addrconf.h>
 #include <net/inet_connection_sock.h>
@@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
 }
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
+static struct inet_listen_hashbucket *
+inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
+{
+	u32 hash;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		hash = ipv6_portaddr_hash(sock_net(sk),
+					  &sk->sk_v6_rcv_saddr,
+					  inet_sk(sk)->inet_num);
+	else
+#endif
+		hash = ipv4_portaddr_hash(sock_net(sk),
+					  inet_sk(sk)->inet_rcv_saddr,
+					  inet_sk(sk)->inet_num);
+	return inet_lhash2_bucket(h, hash);
+}
+
+static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
+{
+	struct inet_listen_hashbucket *ilb2;
+
+	if (!h->lhash2)
+		return;
+
+	ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+	spin_lock(&ilb2->lock);
+	if (sk->sk_reuseport && sk->sk_family == AF_INET6)
+		hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+				   &ilb2->head);
+	else
+		hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+				   &ilb2->head);
+	ilb2->count++;
+	spin_unlock(&ilb2->lock);
+}
+
+static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
+{
+	struct inet_listen_hashbucket *ilb2;
+
+	if (!h->lhash2 ||
+	    WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
+		return;
+
+	ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+	spin_lock(&ilb2->lock);
+	hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
+	ilb2->count--;
+	spin_unlock(&ilb2->lock);
+}
+
 static inline int compute_score(struct sock *sk, struct net *net,
 				const unsigned short hnum, const __be32 daddr,
 				const int dif, const int sdif, bool exact_dif)
@@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
  */
 
 /* called with rcu_read_lock() : No refcount taken on the socket */
+static struct sock *inet_lhash2_lookup(struct net *net,
+				struct inet_listen_hashbucket *ilb2,
+				struct sk_buff *skb, int doff,
+				const __be32 saddr, __be16 sport,
+				const __be32 daddr, const unsigned short hnum,
+				const int dif, const int sdif)
+{
+	bool exact_dif = inet_exact_dif_match(net, skb);
+	struct inet_connection_sock *icsk;
+	struct sock *sk, *result = NULL;
+	int score, hiscore = 0;
+	u32 phash = 0;
+
+	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+		sk = (struct sock *)icsk;
+		score = compute_score(sk, net, hnum, daddr,
+				      dif, sdif, exact_dif);
+		if (score > hiscore) {
+			if (sk->sk_reuseport) {
+				phash = inet_ehashfn(net, daddr, hnum,
+						     saddr, sport);
+				result = reuseport_select_sock(sk, phash,
+							       skb, doff);
+				if (result)
+					return result;
+			}
+			result = sk;
+			hiscore = score;
+		}
+	}
+
+	return result;
+}
+
 struct sock *__inet_lookup_listener(struct net *net,
 				    struct inet_hashinfo *hashinfo,
 				    struct sk_buff *skb, int doff,
@@ -216,32 +305,57 @@ struct sock *__inet_lookup_listener(struct net *net,
 {
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
-	int score, hiscore = 0, matches = 0, reuseport = 0;
 	bool exact_dif = inet_exact_dif_match(net, skb);
+	struct inet_listen_hashbucket *ilb2;
 	struct sock *sk, *result = NULL;
+	int score, hiscore = 0;
+	unsigned int hash2;
 	u32 phash = 0;
 
+	if (ilb->count <= 10 || !hashinfo->lhash2)
+		goto port_lookup;
+
+	/* Too many sk in the ilb bucket (which is hashed by port alone).
+	 * Try lhash2 (which is hashed by port and addr) instead.
+	 */
+
+	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	result = inet_lhash2_lookup(net, ilb2, skb, doff,
+				    saddr, sport, daddr, hnum,
+				    dif, sdif);
+	if (result)
+		return result;
+
+	/* Lookup lhash2 with INADDR_ANY */
+
+	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	return inet_lhash2_lookup(net, ilb2, skb, doff,
+				  saddr, sport, daddr, hnum,
+				  dif, sdif);
+
+port_lookup:
 	sk_for_each_rcu(sk, &ilb->head) {
 		score = compute_score(sk, net, hnum, daddr,
 				      dif, sdif, exact_dif);
 		if (score > hiscore) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				phash = inet_ehashfn(net, daddr, hnum,
 						     saddr, sport);
 				result = reuseport_select_sock(sk, phash,
 							       skb, doff);
 				if (result)
 					return result;
-				matches = 1;
 			}
 			result = sk;
 			hiscore = score;
-		} else if (score == hiscore && reuseport) {
-			matches++;
-			if (reciprocal_scale(phash, matches) == 0)
-				result = sk;
-			phash = next_pseudo_random32(phash);
 		}
 	}
 	return result;
@@ -483,6 +597,8 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 		hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
 	else
 		hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+	inet_hash2(hashinfo, sk);
+	ilb->count++;
 	sock_set_flag(sk, SOCK_RCU_FREE);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
@@ -509,28 +625,35 @@ EXPORT_SYMBOL_GPL(inet_hash);
 void inet_unhash(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	struct inet_listen_hashbucket *ilb;
 	spinlock_t *lock;
 	bool listener = false;
-	int done;
 
 	if (sk_unhashed(sk))
 		return;
 
 	if (sk->sk_state == TCP_LISTEN) {
-		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+		lock = &ilb->lock;
 		listener = true;
 	} else {
 		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 	}
 	spin_lock_bh(lock);
+	if (sk_unhashed(sk))
+		goto unlock;
+
 	if (rcu_access_pointer(sk->sk_reuseport_cb))
 		reuseport_detach_sock(sk);
-	if (listener)
-		done = __sk_del_node_init(sk);
-	else
-		done = __sk_nulls_del_node_init_rcu(sk);
-	if (done)
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	if (listener) {
+		inet_unhash2(hashinfo, sk);
+		 __sk_del_node_init(sk);
+		 ilb->count--;
+	} else {
+		__sk_nulls_del_node_init_rcu(sk);
+	}
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+unlock:
 	spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
@@ -665,10 +788,37 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
 		spin_lock_init(&h->listening_hash[i].lock);
 		INIT_HLIST_HEAD(&h->listening_hash[i].head);
+		h->listening_hash[i].count = 0;
 	}
+
+	h->lhash2 = NULL;
 }
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
 
+void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+				unsigned long numentries, int scale,
+				unsigned long low_limit,
+				unsigned long high_limit)
+{
+	unsigned int i;
+
+	h->lhash2 = alloc_large_system_hash(name,
+					    sizeof(*h->lhash2),
+					    numentries,
+					    scale,
+					    0,
+					    NULL,
+					    &h->lhash2_mask,
+					    low_limit,
+					    high_limit);
+
+	for (i = 0; i <= h->lhash2_mask; i++) {
+		spin_lock_init(&h->lhash2[i].lock);
+		INIT_HLIST_HEAD(&h->lhash2[i].head);
+		h->lhash2[i].count = 0;
+	}
+}
+
 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
 {
 	unsigned int locksz = sizeof(spinlock_t);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index b563e0c46bac..277ff69a312d 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -97,7 +97,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
  * Essentially we whip up a timewait bucket, copy the relevant info into it
  * from the SK, and mess with hash chains and list linkage.
  */
-void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 			   struct inet_hashinfo *hashinfo)
 {
 	const struct inet_sock *inet = inet_sk(sk);
@@ -119,18 +119,6 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 
 	spin_lock(lock);
 
-	/*
-	 * Step 2: Hash TW into tcp ehash chain.
-	 * Notes :
-	 * - tw_refcnt is set to 4 because :
-	 * - We have one reference from bhash chain.
-	 * - We have one reference from ehash chain.
-	 * - We have one reference from timer.
-	 * - One reference for ourself (our caller will release it).
-	 * We can use atomic_set() because prior spin_lock()/spin_unlock()
-	 * committed into memory all tw fields.
-	 */
-	refcount_set(&tw->tw_refcnt, 4);
 	inet_twsk_add_node_rcu(tw, &ehead->chain);
 
 	/* Step 3: Remove SK from hash chain */
@@ -138,8 +126,19 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 
 	spin_unlock(lock);
+
+	/* tw_refcnt is set to 3 because we have :
+	 * - one reference for bhash chain.
+	 * - one reference for ehash chain.
+	 * - one reference for timer.
+	 * We can use atomic_set() because prior spin_lock()/spin_unlock()
+	 * committed into memory all tw fields.
+	 * Also note that after this point, we lost our implicit reference
+	 * so we are not allowed to use tw anymore.
+	 */
+	refcount_set(&tw->tw_refcnt, 3);
 }
-EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+EXPORT_SYMBOL_GPL(inet_twsk_hashdance);
 
 static void tw_timer_handler(struct timer_list *t)
 {
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9c1735632c8c..9a80d84fc182 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -114,7 +114,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 static int ipgre_tunnel_init(struct net_device *dev);
 static void erspan_build_header(struct sk_buff *skb,
-				__be32 id, u32 index, bool truncate);
+				__be32 id, u32 index,
+				bool truncate, bool is_ipv4);
 
 static unsigned int ipgre_net_id __read_mostly;
 static unsigned int gre_tap_net_id __read_mostly;
@@ -255,34 +256,41 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 {
 	struct net *net = dev_net(skb->dev);
 	struct metadata_dst *tun_dst = NULL;
+	struct erspan_base_hdr *ershdr;
+	struct erspan_metadata *pkt_md;
 	struct ip_tunnel_net *itn;
 	struct ip_tunnel *tunnel;
-	struct erspanhdr *ershdr;
 	const struct iphdr *iph;
-	__be32 index;
+	int ver;
 	int len;
 
 	itn = net_generic(net, erspan_net_id);
 	len = gre_hdr_len + sizeof(*ershdr);
 
+	/* Check based hdr len */
 	if (unlikely(!pskb_may_pull(skb, len)))
 		return PACKET_REJECT;
 
 	iph = ip_hdr(skb);
-	ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);
+	ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
+	ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
 
 	/* The original GRE header does not have key field,
 	 * Use ERSPAN 10-bit session ID as key.
 	 */
 	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
-	index = ershdr->md.index;
+	pkt_md = (struct erspan_metadata *)(ershdr + 1);
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
 				  tpi->flags | TUNNEL_KEY,
 				  iph->saddr, iph->daddr, tpi->key);
 
 	if (tunnel) {
+		len = gre_hdr_len + erspan_hdr_len(ver);
+		if (unlikely(!pskb_may_pull(skb, len)))
+			return -ENOMEM;
+
 		if (__iptunnel_pull_header(skb,
-					   gre_hdr_len + sizeof(*ershdr),
+					   len,
 					   htons(ETH_P_TEB),
 					   false, false) < 0)
 			goto drop;
@@ -306,12 +314,27 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 			if (!md)
 				return PACKET_REJECT;
 
-			md->index = index;
+			memcpy(md, pkt_md, sizeof(*md));
+			md->version = ver;
+
 			info = &tun_dst->u.tun_info;
 			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
 			info->options_len = sizeof(*md);
 		} else {
-			tunnel->index = ntohl(index);
+			tunnel->erspan_ver = ver;
+			if (ver == 1) {
+				tunnel->index = ntohl(pkt_md->u.index);
+			} else {
+				u16 md2_flags;
+				u16 dir, hwid;
+
+				md2_flags = ntohs(pkt_md->u.md2.flags);
+				dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+				hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+				tunnel->dir = dir;
+				tunnel->hwid = hwid;
+			}
+
 		}
 
 		skb_reset_mac_header(skb);
@@ -405,7 +428,8 @@ static int gre_rcv(struct sk_buff *skb)
 	if (hdr_len < 0)
 		goto drop;
 
-	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
+	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
+		     tpi.proto == htons(ETH_P_ERSPAN2))) {
 		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 			return 0;
 	}
@@ -560,6 +584,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 	bool truncate = false;
 	struct flowi4 fl;
 	int tunnel_hlen;
+	int version;
 	__be16 df;
 
 	tun_info = skb_tunnel_info(skb);
@@ -568,9 +593,13 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto err_free_skb;
 
 	key = &tun_info->key;
+	md = ip_tunnel_info_opts(tun_info);
+	if (!md)
+		goto err_free_rt;
 
 	/* ERSPAN has fixed 8 byte GRE header */
-	tunnel_hlen = 8 + sizeof(struct erspanhdr);
+	version = md->version;
+	tunnel_hlen = 8 + erspan_hdr_len(version);
 
 	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
 	if (!rt)
@@ -584,12 +613,23 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		truncate = true;
 	}
 
-	md = ip_tunnel_info_opts(tun_info);
-	if (!md)
-		goto err_free_rt;
+	if (version == 1) {
+		erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
+				    ntohl(md->u.index), truncate, true);
+	} else if (version == 2) {
+		u16 md2_flags;
+		u8 direction;
+		u16 hwid;
 
-	erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
-			    ntohl(md->index), truncate);
+		md2_flags = ntohs(md->u.md2.flags);
+		direction = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+		hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+
+		erspan_build_header_v2(skb, tunnel_id_to_key32(key->tun_id),
+				       direction, hwid,	truncate, true);
+	} else {
+		goto err_free_rt;
+	}
 
 	gre_build_header(skb, 8, TUNNEL_SEQ,
 			 htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
@@ -668,52 +708,6 @@ free_skb:
 	return NETDEV_TX_OK;
 }
 
-static inline u8 tos_to_cos(u8 tos)
-{
-	u8 dscp, cos;
-
-	dscp = tos >> 2;
-	cos = dscp >> 3;
-	return cos;
-}
-
-static void erspan_build_header(struct sk_buff *skb,
-				__be32 id, u32 index, bool truncate)
-{
-	struct iphdr *iphdr = ip_hdr(skb);
-	struct ethhdr *eth = eth_hdr(skb);
-	enum erspan_encap_type enc_type;
-	struct erspanhdr *ershdr;
-	struct qtag_prefix {
-		__be16 eth_type;
-		__be16 tci;
-	} *qp;
-	u16 vlan_tci = 0;
-
-	enc_type = ERSPAN_ENCAP_NOVLAN;
-
-	/* If mirrored packet has vlan tag, extract tci and
-	 *  perserve vlan header in the mirrored frame.
-	 */
-	if (eth->h_proto == htons(ETH_P_8021Q)) {
-		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
-		vlan_tci = ntohs(qp->tci);
-		enc_type = ERSPAN_ENCAP_INFRAME;
-	}
-
-	skb_push(skb, sizeof(*ershdr));
-	ershdr = (struct erspanhdr *)skb->data;
-	memset(ershdr, 0, sizeof(*ershdr));
-
-	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
-				 (ERSPAN_VERSION << VER_OFFSET));
-	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
-			   ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
-			   (enc_type << EN_OFFSET & EN_MASK) |
-			   ((truncate << T_OFFSET) & T_MASK));
-	ershdr->md.index = htonl(index & INDEX_MASK);
-}
-
 static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 			       struct net_device *dev)
 {
@@ -737,7 +731,14 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 	}
 
 	/* Push ERSPAN header */
-	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
+	if (tunnel->erspan_ver == 1)
+		erspan_build_header(skb, tunnel->parms.o_key, tunnel->index,
+				    truncate, true);
+	else
+		erspan_build_header_v2(skb, tunnel->parms.o_key,
+				       tunnel->dir, tunnel->hwid,
+				       truncate, true);
+
 	tunnel->parms.o_flags &= ~TUNNEL_KEY;
 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
 	return NETDEV_TX_OK;
@@ -1209,13 +1210,32 @@ static int ipgre_netlink_parms(struct net_device *dev,
 	if (data[IFLA_GRE_FWMARK])
 		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
 
-	if (data[IFLA_GRE_ERSPAN_INDEX]) {
-		t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+	if (data[IFLA_GRE_ERSPAN_VER]) {
+		t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
 
-		if (t->index & ~INDEX_MASK)
+		if (t->erspan_ver != 1 && t->erspan_ver != 2)
 			return -EINVAL;
 	}
 
+	if (t->erspan_ver == 1) {
+		if (data[IFLA_GRE_ERSPAN_INDEX]) {
+			t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+			if (t->index & ~INDEX_MASK)
+				return -EINVAL;
+		}
+	} else if (t->erspan_ver == 2) {
+		if (data[IFLA_GRE_ERSPAN_DIR]) {
+			t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+			if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
+				return -EINVAL;
+		}
+		if (data[IFLA_GRE_ERSPAN_HWID]) {
+			t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+			if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
+				return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -1282,7 +1302,7 @@ static int erspan_tunnel_init(struct net_device *dev)
 	tunnel->tun_hlen = 8;
 	tunnel->parms.iph.protocol = IPPROTO_GRE;
 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
-		       sizeof(struct erspanhdr);
+		       erspan_hdr_len(tunnel->erspan_ver);
 	t_hlen = tunnel->hlen + sizeof(struct iphdr);
 
 	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
@@ -1412,6 +1432,12 @@ static size_t ipgre_get_size(const struct net_device *dev)
 		nla_total_size(4) +
 		/* IFLA_GRE_ERSPAN_INDEX */
 		nla_total_size(4) +
+		/* IFLA_GRE_ERSPAN_VER */
+		nla_total_size(1) +
+		/* IFLA_GRE_ERSPAN_DIR */
+		nla_total_size(1) +
+		/* IFLA_GRE_ERSPAN_HWID */
+		nla_total_size(2) +
 		0;
 }
 
@@ -1454,9 +1480,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			goto nla_put_failure;
 	}
 
-	if (t->index)
+	if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
+		goto nla_put_failure;
+
+	if (t->erspan_ver == 1) {
 		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
 			goto nla_put_failure;
+	} else if (t->erspan_ver == 2) {
+		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
+			goto nla_put_failure;
+		if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
+			goto nla_put_failure;
+	}
 
 	return 0;
 
@@ -1492,6 +1527,9 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
 	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
 	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
+	[IFLA_GRE_ERSPAN_VER]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_DIR]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_HWID]	= { .type = NLA_U16 },
 };
 
 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 43b69af242e1..f0ed031f3594 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1106,7 +1106,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 		new = true;
 	}
 
-	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
+	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
 
 	if (!dst_check(&rt->dst, 0)) {
 		if (new)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f08eebe60446..c470fec9062f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3578,6 +3578,9 @@ void __init tcp_init(void)
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
 	inet_hashinfo_init(&tcp_hashinfo);
+	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+			    thash_entries, 21,  /* one slot per 2 MB*/
+			    0, 64 * 1024);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 78c192ee03a4..018a48477355 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -379,18 +379,9 @@ fastopen:
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			       struct tcp_fastopen_cookie *cookie)
 {
-	unsigned long last_syn_loss = 0;
 	const struct dst_entry *dst;
-	int syn_loss = 0;
 
-	tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
-
-	/* Recurring FO SYN losses: no cookie or data in SYN */
-	if (syn_loss > 1 &&
-	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
-		cookie->len = -1;
-		return false;
-	}
+	tcp_fastopen_cache_get(sk, mss, cookie);
 
 	/* Firewall blackhole issue check */
 	if (tcp_fastopen_active_should_disable(sk)) {
@@ -448,6 +439,8 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect);
  * following circumstances:
  *   1. client side TFO socket receives out of order FIN
  *   2. client side TFO socket receives out of order RST
+ *   3. client side TFO socket has timed out three times consecutively during
+ *      or after handshake
  * We disable active side TFO globally for 1hr at first. Then if it
  * happens again, we disable it for 2h, then 4h, 8h, ...
  * And we reset the timeout back to 1hr when we see a successful active
@@ -524,3 +517,20 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
 		dst_release(dst);
 	}
 }
+
+void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired)
+{
+	u32 timeouts = inet_csk(sk)->icsk_retransmits;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Broken middle-boxes may black-hole Fast Open connection during or
+	 * even after the handshake. Be extremely conservative and pause
+	 * Fast Open globally after hitting the third consecutive timeout or
+	 * exceeding the configured timeout limit.
+	 */
+	if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) &&
+	    (timeouts == 2 || (timeouts < 2 && expired))) {
+		tcp_fastopen_active_disable(sk);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+	}
+}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 45f750e85714..4d55c4b338ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -578,8 +578,8 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 void tcp_rcv_space_adjust(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 copied;
 	int time;
-	int copied;
 
 	tcp_mstamp_refresh(tp);
 	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
@@ -602,38 +602,31 @@ void tcp_rcv_space_adjust(struct sock *sk)
 
 	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		int rcvwin, rcvmem, rcvbuf;
+		int rcvmem, rcvbuf;
+		u64 rcvwin, grow;
 
 		/* minimal window to cope with packet losses, assuming
 		 * steady state. Add some cushion because of small variations.
 		 */
-		rcvwin = (copied << 1) + 16 * tp->advmss;
+		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
 
-		/* If rate increased by 25%,
-		 *	assume slow start, rcvwin = 3 * copied
-		 * If rate increased by 50%,
-		 *	assume sender can use 2x growth, rcvwin = 4 * copied
-		 */
-		if (copied >=
-		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
-			if (copied >=
-			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
-				rcvwin <<= 1;
-			else
-				rcvwin += (rcvwin >> 1);
-		}
+		/* Accommodate for sender rate increase (eg. slow start) */
+		grow = rcvwin * (copied - tp->rcvq_space.space);
+		do_div(grow, tp->rcvq_space.space);
+		rcvwin += (grow << 1);
 
 		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
 		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
 			rcvmem += 128;
 
-		rcvbuf = min(rcvwin / tp->advmss * rcvmem,
-			     sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+		do_div(rcvwin, tp->advmss);
+		rcvbuf = min_t(u64, rcvwin * rcvmem,
+			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
 		if (rcvbuf > sk->sk_rcvbuf) {
 			sk->sk_rcvbuf = rcvbuf;
 
 			/* Make the window clamp follow along.  */
-			tp->window_clamp = rcvwin;
+			tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
 		}
 	}
 	tp->rcvq_space.space = copied;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 7097f92d16e5..759e6bc8327b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -546,8 +546,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
 static DEFINE_SEQLOCK(fastopen_seqlock);
 
 void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
-			    struct tcp_fastopen_cookie *cookie,
-			    int *syn_loss, unsigned long *last_syn_loss)
+			    struct tcp_fastopen_cookie *cookie)
 {
 	struct tcp_metrics_block *tm;
 
@@ -564,8 +563,6 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
 			*cookie = tfom->cookie;
 			if (cookie->len <= 0 && tfom->try_exp == 1)
 				cookie->exp = true;
-			*syn_loss = tfom->syn_loss;
-			*last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
 		} while (read_seqretry(&fastopen_seqlock, seq));
 	}
 	rcu_read_unlock();
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b079b619b60c..a8384b0c11f8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -316,9 +316,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		 */
 		local_bh_disable();
 		inet_twsk_schedule(tw, timeo);
-		/* Linkage updates. */
-		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
-		inet_twsk_put(tw);
+		/* Linkage updates.
+		 * Note that access to tw after this point is illegal.
+		 */
+		inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
 		local_bh_enable();
 	} else {
 		/* Sorry, if we're out of memory, just CLOSE this
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a4d214c7b506..04be9f833927 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2414,15 +2414,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
 
 	early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
-	 * in Open state, that are either limited by cwnd or application.
+	 * not in loss recovery, that are either limited by cwnd or application.
 	 */
 	if ((early_retrans != 3 && early_retrans != 4) ||
 	    !tp->packets_out || !tcp_is_sack(tp) ||
-	    icsk->icsk_ca_state != TCP_CA_Open)
-		return false;
-
-	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
-	     !tcp_write_queue_empty(sk))
+	    (icsk->icsk_ca_state != TCP_CA_Open &&
+	     icsk->icsk_ca_state != TCP_CA_CWR))
 		return false;
 
 	/* Probe timeout is 2*rtt. Add minimum RTO to account
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 968fda198376..6db3124cdbda 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -183,11 +183,6 @@ static int tcp_write_timeout(struct sock *sk)
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 		if (icsk->icsk_retransmits) {
 			dst_negative_advice(sk);
-			if (tp->syn_fastopen || tp->syn_data)
-				tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
-			if (tp->syn_data && icsk->icsk_retransmits == 1)
-				NET_INC_STATS(sock_net(sk),
-					      LINUX_MIB_TCPFASTOPENACTIVEFAIL);
 		} else if (!tp->syn_data && !tp->syn_fastopen) {
 			sk_rethink_txhash(sk);
 		}
@@ -195,17 +190,6 @@ static int tcp_write_timeout(struct sock *sk)
 		expired = icsk->icsk_retransmits >= retry_until;
 	} else {
 		if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
-			/* Some middle-boxes may black-hole Fast Open _after_
-			 * the handshake. Therefore we conservatively disable
-			 * Fast Open on this path on recurring timeouts after
-			 * successful Fast Open.
-			 */
-			if (tp->syn_data_acked) {
-				tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
-				if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
-					NET_INC_STATS(sock_net(sk),
-						      LINUX_MIB_TCPFASTOPENACTIVEFAIL);
-			}
 			/* Black hole detection */
 			tcp_mtu_probing(icsk, sk);
 
@@ -228,6 +212,7 @@ static int tcp_write_timeout(struct sock *sk)
 		expired = retransmits_timed_out(sk, retry_until,
 						icsk->icsk_user_timeout);
 	}
+	tcp_fastopen_active_detect_blackhole(sk, expired);
 	if (expired) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e4ff25c947c5..e9c0d1e1772e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -357,18 +357,12 @@ fail:
 }
 EXPORT_SYMBOL(udp_lib_get_port);
 
-static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
-			      unsigned int port)
-{
-	return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
-}
-
 int udp_v4_get_port(struct sock *sk, unsigned short snum)
 {
 	unsigned int hash2_nulladdr =
-		udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
+		ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
 	unsigned int hash2_partial =
-		udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+		ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
 
 	/* precompute partial secondary hash */
 	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -445,7 +439,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 				     struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	int score, badness, matches = 0, reuseport = 0;
+	int score, badness;
 	u32 hash = 0;
 
 	result = NULL;
@@ -454,23 +448,16 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 		score = compute_score(sk, net, saddr, sport,
 				      daddr, hnum, dif, sdif, exact_dif);
 		if (score > badness) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				hash = udp_ehashfn(net, daddr, hnum,
 						   saddr, sport);
 				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
 				if (result)
 					return result;
-				matches = 1;
 			}
 			badness = score;
 			result = sk;
-		} else if (score == badness && reuseport) {
-			matches++;
-			if (reciprocal_scale(hash, matches) == 0)
-				result = sk;
-			hash = next_pseudo_random32(hash);
 		}
 	}
 	return result;
@@ -488,11 +475,11 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
 	bool exact_dif = udp_lib_exact_dif_match(net, skb);
-	int score, badness, matches = 0, reuseport = 0;
+	int score, badness;
 	u32 hash = 0;
 
 	if (hslot->count > 10) {
-		hash2 = udp4_portaddr_hash(net, daddr, hnum);
+		hash2 = ipv4_portaddr_hash(net, daddr, hnum);
 		slot2 = hash2 & udptable->mask;
 		hslot2 = &udptable->hash2[slot2];
 		if (hslot->count < hslot2->count)
@@ -503,7 +490,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 					  exact_dif, hslot2, skb);
 		if (!result) {
 			unsigned int old_slot2 = slot2;
-			hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+			hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
 			slot2 = hash2 & udptable->mask;
 			/* avoid searching the same slot again. */
 			if (unlikely(slot2 == old_slot2))
@@ -526,23 +513,16 @@ begin:
 		score = compute_score(sk, net, saddr, sport,
 				      daddr, hnum, dif, sdif, exact_dif);
 		if (score > badness) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				hash = udp_ehashfn(net, daddr, hnum,
 						   saddr, sport);
 				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
 				if (result)
 					return result;
-				matches = 1;
 			}
 			result = sk;
 			badness = score;
-		} else if (score == badness && reuseport) {
-			matches++;
-			if (reciprocal_scale(hash, matches) == 0)
-				result = sk;
-			hash = next_pseudo_random32(hash);
 		}
 	}
 	return result;
@@ -1775,7 +1755,7 @@ EXPORT_SYMBOL(udp_lib_rehash);
 
 static void udp_v4_rehash(struct sock *sk)
 {
-	u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+	u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
 					  inet_sk(sk)->inet_rcv_saddr,
 					  inet_sk(sk)->inet_num);
 	udp_lib_rehash(sk, new_hash);
@@ -1966,9 +1946,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	struct sk_buff *nskb;
 
 	if (use_hash2) {
-		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+		hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
 			    udptable->mask;
-		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask;
+		hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
 start_lookup:
 		hslot = &udptable->hash2[hash2];
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
@@ -2200,7 +2180,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
 					    int dif, int sdif)
 {
 	unsigned short hnum = ntohs(loc_port);
-	unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
+	unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
 	unsigned int slot2 = hash2 & udp_table.mask;
 	struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
 	INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index e6265e2c274e..7d885a44dc9d 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -62,7 +62,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
 		0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
 
-	top_iph->ttl = ip4_dst_hoplimit(dst->child);
+	top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst));
 
 	top_iph->saddr = x->props.saddr.a4;
 	top_iph->daddr = x->id.daddr.a4;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f49bd7897e95..ed06b1190f05 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6595,27 +6595,45 @@ int __init addrconf_init(void)
 
 	rtnl_af_register(&inet6_ops);
 
-	err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo,
-			      0);
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK,
+				   NULL, inet6_dump_ifinfo, 0);
 	if (err < 0)
 		goto errout;
 
-	/* Only the first call to __rtnl_register can fail */
-	__rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0);
-	__rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0);
-	__rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr,
-			inet6_dump_ifaddr, RTNL_FLAG_DOIT_UNLOCKED);
-	__rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL,
-			inet6_dump_ifmcaddr, 0);
-	__rtnl_register(PF_INET6, RTM_GETANYCAST, NULL,
-			inet6_dump_ifacaddr, 0);
-	__rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf,
-			inet6_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED);
-
-	ipv6_addr_label_rtnl_register();
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDR,
+				   inet6_rtm_newaddr, NULL, 0);
+	if (err < 0)
+		goto errout;
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDR,
+				   inet6_rtm_deladdr, NULL, 0);
+	if (err < 0)
+		goto errout;
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR,
+				   inet6_rtm_getaddr, inet6_dump_ifaddr,
+				   RTNL_FLAG_DOIT_UNLOCKED);
+	if (err < 0)
+		goto errout;
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST,
+				   NULL, inet6_dump_ifmcaddr, 0);
+	if (err < 0)
+		goto errout;
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST,
+				   NULL, inet6_dump_ifacaddr, 0);
+	if (err < 0)
+		goto errout;
+	err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF,
+				   inet6_netconf_get_devconf,
+				   inet6_netconf_dump_devconf,
+				   RTNL_FLAG_DOIT_UNLOCKED);
+	if (err < 0)
+		goto errout;
+	err = ipv6_addr_label_rtnl_register();
+	if (err < 0)
+		goto errout;
 
 	return 0;
 errout:
+	rtnl_unregister_all(PF_INET6);
 	rtnl_af_unregister(&inet6_ops);
 	unregister_netdevice_notifier(&ipv6_dev_notf);
 errlo:
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 00e1f8ee08f8..1d6ced37ad71 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -547,13 +547,22 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	return err;
 }
 
-void __init ipv6_addr_label_rtnl_register(void)
+int __init ipv6_addr_label_rtnl_register(void)
 {
-	__rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel,
-			NULL, RTNL_FLAG_DOIT_UNLOCKED);
-	__rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel,
-			NULL, RTNL_FLAG_DOIT_UNLOCKED);
-	__rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get,
-			ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED);
-}
+	int ret;
 
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL,
+				   ip6addrlbl_newdel,
+				   NULL, RTNL_FLAG_DOIT_UNLOCKED);
+	if (ret < 0)
+		return ret;
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL,
+				   ip6addrlbl_newdel,
+				   NULL, RTNL_FLAG_DOIT_UNLOCKED);
+	if (ret < 0)
+		return ret;
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL,
+				   ip6addrlbl_get,
+				   ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED);
+	return ret;
+}
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 6eb5e68f112a..44c39c5f0638 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -512,9 +512,7 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	struct ila_map *ila;
 	int ret;
 
-	ret = rhashtable_walk_start(rhiter);
-	if (ret && ret != -EAGAIN)
-		goto done;
+	rhashtable_walk_start(rhiter);
 
 	for (;;) {
 		ila = rhashtable_walk_next(rhiter);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b01858f5deb1..2febe26de6a1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
 }
 
 /* called with rcu_read_lock() */
+static struct sock *inet6_lhash2_lookup(struct net *net,
+		struct inet_listen_hashbucket *ilb2,
+		struct sk_buff *skb, int doff,
+		const struct in6_addr *saddr,
+		const __be16 sport, const struct in6_addr *daddr,
+		const unsigned short hnum, const int dif, const int sdif)
+{
+	bool exact_dif = inet6_exact_dif_match(net, skb);
+	struct inet_connection_sock *icsk;
+	struct sock *sk, *result = NULL;
+	int score, hiscore = 0;
+	u32 phash = 0;
+
+	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+		sk = (struct sock *)icsk;
+		score = compute_score(sk, net, hnum, daddr, dif, sdif,
+				      exact_dif);
+		if (score > hiscore) {
+			if (sk->sk_reuseport) {
+				phash = inet6_ehashfn(net, daddr, hnum,
+						      saddr, sport);
+				result = reuseport_select_sock(sk, phash,
+							       skb, doff);
+				if (result)
+					return result;
+			}
+			result = sk;
+			hiscore = score;
+		}
+	}
+
+	return result;
+}
+
 struct sock *inet6_lookup_listener(struct net *net,
 		struct inet_hashinfo *hashinfo,
 		struct sk_buff *skb, int doff,
@@ -134,31 +168,56 @@ struct sock *inet6_lookup_listener(struct net *net,
 {
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
-	int score, hiscore = 0, matches = 0, reuseport = 0;
 	bool exact_dif = inet6_exact_dif_match(net, skb);
+	struct inet_listen_hashbucket *ilb2;
 	struct sock *sk, *result = NULL;
+	int score, hiscore = 0;
+	unsigned int hash2;
 	u32 phash = 0;
 
+	if (ilb->count <= 10 || !hashinfo->lhash2)
+		goto port_lookup;
+
+	/* Too many sk in the ilb bucket (which is hashed by port alone).
+	 * Try lhash2 (which is hashed by port and addr) instead.
+	 */
+
+	hash2 = ipv6_portaddr_hash(net, daddr, hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	result = inet6_lhash2_lookup(net, ilb2, skb, doff,
+				     saddr, sport, daddr, hnum,
+				     dif, sdif);
+	if (result)
+		return result;
+
+	/* Lookup lhash2 with in6addr_any */
+
+	hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	return inet6_lhash2_lookup(net, ilb2, skb, doff,
+				   saddr, sport, daddr, hnum,
+				   dif, sdif);
+
+port_lookup:
 	sk_for_each(sk, &ilb->head) {
 		score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
 		if (score > hiscore) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				phash = inet6_ehashfn(net, daddr, hnum,
 						      saddr, sport);
 				result = reuseport_select_sock(sk, phash,
 							       skb, doff);
 				if (result)
 					return result;
-				matches = 1;
 			}
 			result = sk;
 			hiscore = score;
-		} else if (score == hiscore && reuseport) {
-			matches++;
-			if (reciprocal_scale(phash, matches) == 0)
-				result = sk;
-			phash = next_pseudo_random32(phash);
 		}
 	}
 	return result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f5285f4e1d08..a64d559fa513 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -893,7 +893,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 	ins = &fn->leaf;
 
 	for (iter = leaf; iter;
-	     iter = rcu_dereference_protected(iter->dst.rt6_next,
+	     iter = rcu_dereference_protected(iter->rt6_next,
 				lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
 		/*
 		 *	Search for duplicates
@@ -950,7 +950,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			break;
 
 next_iter:
-		ins = &iter->dst.rt6_next;
+		ins = &iter->rt6_next;
 	}
 
 	if (fallback_ins && !found) {
@@ -979,7 +979,7 @@ next_iter:
 					      &sibling->rt6i_siblings);
 				break;
 			}
-			sibling = rcu_dereference_protected(sibling->dst.rt6_next,
+			sibling = rcu_dereference_protected(sibling->rt6_next,
 				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
 		}
 		/* For each sibling in the list, increment the counter of
@@ -1009,7 +1009,7 @@ add:
 		if (err)
 			return err;
 
-		rcu_assign_pointer(rt->dst.rt6_next, iter);
+		rcu_assign_pointer(rt->rt6_next, iter);
 		atomic_inc(&rt->rt6i_ref);
 		rcu_assign_pointer(rt->rt6i_node, fn);
 		rcu_assign_pointer(*ins, rt);
@@ -1040,7 +1040,7 @@ add:
 
 		atomic_inc(&rt->rt6i_ref);
 		rcu_assign_pointer(rt->rt6i_node, fn);
-		rt->dst.rt6_next = iter->dst.rt6_next;
+		rt->rt6_next = iter->rt6_next;
 		rcu_assign_pointer(*ins, rt);
 		call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
 					  rt, extack);
@@ -1059,14 +1059,14 @@ add:
 
 		if (nsiblings) {
 			/* Replacing an ECMP route, remove all siblings */
-			ins = &rt->dst.rt6_next;
+			ins = &rt->rt6_next;
 			iter = rcu_dereference_protected(*ins,
 				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
 			while (iter) {
 				if (iter->rt6i_metric > rt->rt6i_metric)
 					break;
 				if (rt6_qualify_for_ecmp(iter)) {
-					*ins = iter->dst.rt6_next;
+					*ins = iter->rt6_next;
 					iter->rt6i_node = NULL;
 					fib6_purge_rt(iter, fn, info->nl_net);
 					if (rcu_access_pointer(fn->rr_ptr) == iter)
@@ -1075,7 +1075,7 @@ add:
 					nsiblings--;
 					info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
 				} else {
-					ins = &iter->dst.rt6_next;
+					ins = &iter->rt6_next;
 				}
 				iter = rcu_dereference_protected(*ins,
 					lockdep_is_held(&rt->rt6i_table->tb6_lock));
@@ -1644,7 +1644,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 	WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
 
 	/* Unlink it */
-	*rtp = rt->dst.rt6_next;
+	*rtp = rt->rt6_next;
 	rt->rt6i_node = NULL;
 	net->ipv6.rt6_stats->fib_rt_entries--;
 	net->ipv6.rt6_stats->fib_discarded_routes++;
@@ -1672,7 +1672,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 	FOR_WALKERS(net, w) {
 		if (w->state == FWS_C && w->leaf == rt) {
 			RT6_TRACE("walker %p adjusted by delroute\n", w);
-			w->leaf = rcu_dereference_protected(rt->dst.rt6_next,
+			w->leaf = rcu_dereference_protected(rt->rt6_next,
 					    lockdep_is_held(&table->tb6_lock));
 			if (!w->leaf)
 				w->state = FWS_U;
@@ -1731,7 +1731,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
 			fib6_del_route(table, fn, rtp, info);
 			return 0;
 		}
-		rtp_next = &cur->dst.rt6_next;
+		rtp_next = &cur->rt6_next;
 	}
 	return -ENOENT;
 }
@@ -2142,8 +2142,8 @@ int __init fib6_init(void)
 	if (ret)
 		goto out_kmem_cache_create;
 
-	ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
-			      0);
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL,
+				   inet6_dump_fib, 0);
 	if (ret)
 		goto out_unregister_subsys;
 
@@ -2208,7 +2208,7 @@ static int ipv6_route_yield(struct fib6_walker *w)
 
 	do {
 		iter->w.leaf = rcu_dereference_protected(
-				iter->w.leaf->dst.rt6_next,
+				iter->w.leaf->rt6_next,
 				lockdep_is_held(&iter->tbl->tb6_lock));
 		iter->skip--;
 		if (!iter->skip && iter->w.leaf)
@@ -2274,7 +2274,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (!v)
 		goto iter_table;
 
-	n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next);
+	n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next);
 	if (n) {
 		++*pos;
 		return n;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4cfd8e0696fe..5c9c65f1d5c2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -55,6 +55,8 @@
 #include <net/ip6_route.h>
 #include <net/ip6_tunnel.h>
 #include <net/gre.h>
+#include <net/erspan.h>
+#include <net/dst_metadata.h>
 
 
 static bool log_ecn_error = true;
@@ -68,11 +70,13 @@ static unsigned int ip6gre_net_id __read_mostly;
 struct ip6gre_net {
 	struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
 
+	struct ip6_tnl __rcu *collect_md_tun;
 	struct net_device *fb_tunnel_dev;
 };
 
 static struct rtnl_link_ops ip6gre_link_ops __read_mostly;
 static struct rtnl_link_ops ip6gre_tap_ops __read_mostly;
+static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly;
 static int ip6gre_tunnel_init(struct net_device *dev);
 static void ip6gre_tunnel_setup(struct net_device *dev);
 static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t);
@@ -121,7 +125,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
 	unsigned int h1 = HASH_KEY(key);
 	struct ip6_tnl *t, *cand = NULL;
 	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
-	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
+	int dev_type = (gre_proto == htons(ETH_P_TEB) ||
+			gre_proto == htons(ETH_P_ERSPAN)) ?
 		       ARPHRD_ETHER : ARPHRD_IP6GRE;
 	int score, cand_score = 4;
 
@@ -226,6 +231,10 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
 	if (cand)
 		return cand;
 
+	t = rcu_dereference(ign->collect_md_tun);
+	if (t && t->dev->flags & IFF_UP)
+		return t;
+
 	dev = ign->fb_tunnel_dev;
 	if (dev->flags & IFF_UP)
 		return netdev_priv(dev);
@@ -261,6 +270,9 @@ static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t)
 {
 	struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t);
 
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun, t);
+
 	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
 	rcu_assign_pointer(*tp, t);
 }
@@ -270,6 +282,9 @@ static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t)
 	struct ip6_tnl __rcu **tp;
 	struct ip6_tnl *iter;
 
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun, NULL);
+
 	for (tp = ip6gre_bucket(ign, t);
 	     (iter = rtnl_dereference(*tp)) != NULL;
 	     tp = &iter->next) {
@@ -460,7 +475,108 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 				      &ipv6h->saddr, &ipv6h->daddr, tpi->key,
 				      tpi->proto);
 	if (tunnel) {
-		ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+		if (tunnel->parms.collect_md) {
+			struct metadata_dst *tun_dst;
+			__be64 tun_id;
+			__be16 flags;
+
+			flags = tpi->flags;
+			tun_id = key32_to_tunnel_id(tpi->key);
+
+			tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0);
+			if (!tun_dst)
+				return PACKET_REJECT;
+
+			ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+		} else {
+			ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+		}
+
+		return PACKET_RCVD;
+	}
+
+	return PACKET_REJECT;
+}
+
+static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
+			 struct tnl_ptk_info *tpi)
+{
+	struct erspan_base_hdr *ershdr;
+	struct erspan_metadata *pkt_md;
+	const struct ipv6hdr *ipv6h;
+	struct ip6_tnl *tunnel;
+	u8 ver;
+
+	ipv6h = ipv6_hdr(skb);
+	ershdr = (struct erspan_base_hdr *)skb->data;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr))))
+		return PACKET_REJECT;
+
+	ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
+	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
+	pkt_md = (struct erspan_metadata *)(ershdr + 1);
+
+	tunnel = ip6gre_tunnel_lookup(skb->dev,
+				      &ipv6h->saddr, &ipv6h->daddr, tpi->key,
+				      tpi->proto);
+	if (tunnel) {
+		int len = erspan_hdr_len(ver);
+
+		if (unlikely(!pskb_may_pull(skb, len)))
+			return -ENOMEM;
+
+		if (__iptunnel_pull_header(skb, len,
+					   htons(ETH_P_TEB),
+					   false, false) < 0)
+			return PACKET_REJECT;
+
+		if (tunnel->parms.collect_md) {
+			struct metadata_dst *tun_dst;
+			struct ip_tunnel_info *info;
+			struct erspan_metadata *md;
+			__be64 tun_id;
+			__be16 flags;
+
+			tpi->flags |= TUNNEL_KEY;
+			flags = tpi->flags;
+			tun_id = key32_to_tunnel_id(tpi->key);
+
+			tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id,
+						  sizeof(*md));
+			if (!tun_dst)
+				return PACKET_REJECT;
+
+			info = &tun_dst->u.tun_info;
+			md = ip_tunnel_info_opts(info);
+			if (!md)
+				return PACKET_REJECT;
+
+			memcpy(md, pkt_md, sizeof(*md));
+			md->version = ver;
+			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+			info->options_len = sizeof(*md);
+
+			ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+
+		} else {
+			tunnel->parms.erspan_ver = ver;
+
+			if (ver == 1) {
+				tunnel->parms.index = ntohl(pkt_md->u.index);
+			} else {
+				u16 md2_flags;
+				u16 dir, hwid;
+
+				md2_flags = ntohs(pkt_md->u.md2.flags);
+				dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+				hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+				tunnel->parms.dir = dir;
+				tunnel->parms.hwid = hwid;
+			}
+
+			ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+		}
 
 		return PACKET_RCVD;
 	}
@@ -481,6 +597,13 @@ static int gre_rcv(struct sk_buff *skb)
 	if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false))
 		goto drop;
 
+	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
+		     tpi.proto == htons(ETH_P_ERSPAN2))) {
+		if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD)
+			return 0;
+		goto drop;
+	}
+
 	if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD)
 		return 0;
 
@@ -496,6 +619,78 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum)
 					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
 }
 
+static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb,
+				     struct net_device *dev,
+				     struct flowi6 *fl6, __u8 *dsfield,
+				     int *encap_limit)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct ip6_tnl *t = netdev_priv(dev);
+
+	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+		*encap_limit = t->parms.encap_limit;
+
+	memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6));
+
+	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+		*dsfield = ipv4_get_dsfield(iph);
+	else
+		*dsfield = ip6_tclass(t->parms.flowinfo);
+
+	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+		fl6->flowi6_mark = skb->mark;
+	else
+		fl6->flowi6_mark = t->parms.fwmark;
+
+	fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+}
+
+static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb,
+				    struct net_device *dev,
+				    struct flowi6 *fl6, __u8 *dsfield,
+				    int *encap_limit)
+{
+	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	struct ip6_tnl *t = netdev_priv(dev);
+	__u16 offset;
+
+	offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+	/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
+
+	if (offset > 0) {
+		struct ipv6_tlv_tnl_enc_lim *tel;
+
+		tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
+		if (tel->encap_limit == 0) {
+			icmpv6_send(skb, ICMPV6_PARAMPROB,
+				    ICMPV6_HDR_FIELD, offset + 2);
+			return -1;
+		}
+		*encap_limit = tel->encap_limit - 1;
+	} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
+		*encap_limit = t->parms.encap_limit;
+	}
+
+	memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6));
+
+	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+		*dsfield = ipv6_get_dsfield(ipv6h);
+	else
+		*dsfield = ip6_tclass(t->parms.flowinfo);
+
+	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+		fl6->flowlabel |= ip6_flowlabel(ipv6h);
+
+	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+		fl6->flowi6_mark = skb->mark;
+	else
+		fl6->flowi6_mark = t->parms.fwmark;
+
+	fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+	return 0;
+}
+
 static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 			       struct net_device *dev, __u8 dsfield,
 			       struct flowi6 *fl6, int encap_limit,
@@ -517,8 +712,38 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 
 	/* Push GRE header. */
 	protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
-	gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
-			 protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno));
+
+	if (tunnel->parms.collect_md) {
+		struct ip_tunnel_info *tun_info;
+		const struct ip_tunnel_key *key;
+		__be16 flags;
+
+		tun_info = skb_tunnel_info(skb);
+		if (unlikely(!tun_info ||
+			     !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+			     ip_tunnel_info_af(tun_info) != AF_INET6))
+			return -EINVAL;
+
+		key = &tun_info->key;
+		memset(fl6, 0, sizeof(*fl6));
+		fl6->flowi6_proto = IPPROTO_GRE;
+		fl6->daddr = key->u.ipv6.dst;
+		fl6->flowlabel = key->label;
+		fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+		dsfield = key->tos;
+		flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
+		tunnel->tun_hlen = gre_calc_hlen(flags);
+
+		gre_build_header(skb, tunnel->tun_hlen,
+				 flags, protocol,
+				 tunnel_id_to_key32(tun_info->key.tun_id), 0);
+
+	} else {
+		gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
+				 protocol, tunnel->parms.o_key,
+				 htonl(tunnel->o_seqno));
+	}
 
 	return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
 			    NEXTHDR_GRE);
@@ -527,30 +752,17 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
-	const struct iphdr  *iph = ip_hdr(skb);
 	int encap_limit = -1;
 	struct flowi6 fl6;
-	__u8 dsfield;
+	__u8 dsfield = 0;
 	__u32 mtu;
 	int err;
 
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 
-	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
-		encap_limit = t->parms.encap_limit;
-
-	memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
-		dsfield = ipv4_get_dsfield(iph);
-	else
-		dsfield = ip6_tclass(t->parms.flowinfo);
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
-		fl6.flowi6_mark = skb->mark;
-	else
-		fl6.flowi6_mark = t->parms.fwmark;
-
-	fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+	if (!t->parms.collect_md)
+		prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
+					 &dsfield, &encap_limit);
 
 	err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
 	if (err)
@@ -574,46 +786,17 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	int encap_limit = -1;
-	__u16 offset;
 	struct flowi6 fl6;
-	__u8 dsfield;
+	__u8 dsfield = 0;
 	__u32 mtu;
 	int err;
 
 	if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
 		return -1;
 
-	offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
-	/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
-	ipv6h = ipv6_hdr(skb);
-
-	if (offset > 0) {
-		struct ipv6_tlv_tnl_enc_lim *tel;
-		tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
-		if (tel->encap_limit == 0) {
-			icmpv6_send(skb, ICMPV6_PARAMPROB,
-				    ICMPV6_HDR_FIELD, offset + 2);
-			return -1;
-		}
-		encap_limit = tel->encap_limit - 1;
-	} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
-		encap_limit = t->parms.encap_limit;
-
-	memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
-		dsfield = ipv6_get_dsfield(ipv6h);
-	else
-		dsfield = ip6_tclass(t->parms.flowinfo);
-
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
-		fl6.flowlabel |= ip6_flowlabel(ipv6h);
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
-		fl6.flowi6_mark = skb->mark;
-	else
-		fl6.flowi6_mark = t->parms.fwmark;
-
-	fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+	if (!t->parms.collect_md &&
+	    prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit))
+		return -1;
 
 	if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
 		return -1;
@@ -660,7 +843,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev)
 	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
 		encap_limit = t->parms.encap_limit;
 
-	memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+	if (!t->parms.collect_md)
+		memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
 
 	err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
 	if (err)
@@ -705,6 +889,141 @@ tx_err:
 	return NETDEV_TX_OK;
 }
 
+static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
+					 struct net_device *dev)
+{
+	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	struct ip6_tnl *t = netdev_priv(dev);
+	struct dst_entry *dst = skb_dst(skb);
+	struct net_device_stats *stats;
+	bool truncate = false;
+	int encap_limit = -1;
+	__u8 dsfield = false;
+	struct flowi6 fl6;
+	int err = -EINVAL;
+	__u32 mtu;
+
+	if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
+		goto tx_err;
+
+	if (gre_handle_offloads(skb, false))
+		goto tx_err;
+
+	if (skb->len > dev->mtu + dev->hard_header_len) {
+		pskb_trim(skb, dev->mtu + dev->hard_header_len);
+		truncate = true;
+	}
+
+	t->parms.o_flags &= ~TUNNEL_KEY;
+	IPCB(skb)->flags = 0;
+
+	/* For collect_md mode, derive fl6 from the tunnel key,
+	 * for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}.
+	 */
+	if (t->parms.collect_md) {
+		struct ip_tunnel_info *tun_info;
+		const struct ip_tunnel_key *key;
+		struct erspan_metadata *md;
+
+		tun_info = skb_tunnel_info(skb);
+		if (unlikely(!tun_info ||
+			     !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+			     ip_tunnel_info_af(tun_info) != AF_INET6))
+			return -EINVAL;
+
+		key = &tun_info->key;
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.flowi6_proto = IPPROTO_GRE;
+		fl6.daddr = key->u.ipv6.dst;
+		fl6.flowlabel = key->label;
+		fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+		dsfield = key->tos;
+		md = ip_tunnel_info_opts(tun_info);
+		if (!md)
+			goto tx_err;
+
+		if (md->version == 1) {
+			erspan_build_header(skb,
+					    tunnel_id_to_key32(key->tun_id),
+					    ntohl(md->u.index), truncate,
+					    false);
+		} else if (md->version == 2) {
+			u16 md2_flags;
+			u16 dir, hwid;
+
+			md2_flags = ntohs(md->u.md2.flags);
+			dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+			hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+
+			erspan_build_header_v2(skb,
+					       tunnel_id_to_key32(key->tun_id),
+					       dir, hwid, truncate,
+					       false);
+		}
+	} else {
+		switch (skb->protocol) {
+		case htons(ETH_P_IP):
+			memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+			prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
+						 &dsfield, &encap_limit);
+			break;
+		case htons(ETH_P_IPV6):
+			if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
+				goto tx_err;
+			if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6,
+						     &dsfield, &encap_limit))
+				goto tx_err;
+			break;
+		default:
+			memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+			break;
+		}
+
+		if (t->parms.erspan_ver == 1)
+			erspan_build_header(skb, t->parms.o_key,
+					    t->parms.index,
+					    truncate, false);
+		else
+			erspan_build_header_v2(skb, t->parms.o_key,
+					       t->parms.dir,
+					       t->parms.hwid,
+					       truncate, false);
+		fl6.daddr = t->parms.raddr;
+	}
+
+	/* Push GRE header. */
+	gre_build_header(skb, 8, TUNNEL_SEQ,
+			 htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++));
+
+	/* TooBig packet may have updated dst->dev's mtu */
+	if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu)
+		dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu);
+
+	err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
+			   NEXTHDR_GRE);
+	if (err != 0) {
+		/* XXX: send ICMP error even if DF is not set. */
+		if (err == -EMSGSIZE) {
+			if (skb->protocol == htons(ETH_P_IP))
+				icmp_send(skb, ICMP_DEST_UNREACH,
+					  ICMP_FRAG_NEEDED, htonl(mtu));
+			else
+				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		}
+
+		goto tx_err;
+	}
+	return NETDEV_TX_OK;
+
+tx_err:
+	stats = &t->dev->stats;
+	stats->tx_errors++;
+	stats->tx_dropped++;
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
 static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 {
 	struct net_device *dev = t->dev;
@@ -1048,6 +1367,11 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 	if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
 		dev->mtu -= 8;
 
+	if (tunnel->parms.collect_md) {
+		dev->features |= NETIF_F_NETNS_LOCAL;
+		netif_keep_dst(dev);
+	}
+
 	return 0;
 }
 
@@ -1062,6 +1386,9 @@ static int ip6gre_tunnel_init(struct net_device *dev)
 
 	tunnel = netdev_priv(dev);
 
+	if (tunnel->parms.collect_md)
+		return 0;
+
 	memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr));
 	memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr));
 
@@ -1084,7 +1411,6 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
 	dev_hold(dev);
 }
 
-
 static struct inet6_protocol ip6gre_protocol __read_mostly = {
 	.handler     = gre_rcv,
 	.err_handler = ip6gre_err,
@@ -1099,7 +1425,8 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
 
 	for_each_netdev_safe(net, dev, aux)
 		if (dev->rtnl_link_ops == &ip6gre_link_ops ||
-		    dev->rtnl_link_ops == &ip6gre_tap_ops)
+		    dev->rtnl_link_ops == &ip6gre_tap_ops ||
+		    dev->rtnl_link_ops == &ip6erspan_tap_ops)
 			unregister_netdevice_queue(dev, head);
 
 	for (prio = 0; prio < 4; prio++) {
@@ -1221,6 +1548,70 @@ out:
 	return ip6gre_tunnel_validate(tb, data, extack);
 }
 
+static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[],
+				  struct netlink_ext_ack *extack)
+{
+	__be16 flags = 0;
+	int ret, ver = 0;
+
+	if (!data)
+		return 0;
+
+	ret = ip6gre_tap_validate(tb, data, extack);
+	if (ret)
+		return ret;
+
+	/* ERSPAN should only have GRE sequence and key flag */
+	if (data[IFLA_GRE_OFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+	if (data[IFLA_GRE_IFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+	if (!data[IFLA_GRE_COLLECT_METADATA] &&
+	    flags != (GRE_SEQ | GRE_KEY))
+		return -EINVAL;
+
+	/* ERSPAN Session ID only has 10-bit. Since we reuse
+	 * 32-bit key field as ID, check it's range.
+	 */
+	if (data[IFLA_GRE_IKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	if (data[IFLA_GRE_OKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	if (data[IFLA_GRE_ERSPAN_VER]) {
+		ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
+		if (ver != 1 && ver != 2)
+			return -EINVAL;
+	}
+
+	if (ver == 1) {
+		if (data[IFLA_GRE_ERSPAN_INDEX]) {
+			u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+
+			if (index & ~INDEX_MASK)
+				return -EINVAL;
+		}
+	} else if (ver == 2) {
+		if (data[IFLA_GRE_ERSPAN_DIR]) {
+			u16 dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+
+			if (dir & ~(DIR_MASK >> DIR_OFFSET))
+				return -EINVAL;
+		}
+
+		if (data[IFLA_GRE_ERSPAN_HWID]) {
+			u16 hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+
+			if (hwid & ~(HWID_MASK >> HWID_OFFSET))
+				return -EINVAL;
+		}
+	}
+
+	return 0;
+}
 
 static void ip6gre_netlink_parms(struct nlattr *data[],
 				struct __ip6_tnl_parm *parms)
@@ -1267,6 +1658,22 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
 
 	if (data[IFLA_GRE_FWMARK])
 		parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
+
+	if (data[IFLA_GRE_COLLECT_METADATA])
+		parms->collect_md = true;
+
+	if (data[IFLA_GRE_ERSPAN_VER])
+		parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
+
+	if (parms->erspan_ver == 1) {
+		if (data[IFLA_GRE_ERSPAN_INDEX])
+			parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+	} else if (parms->erspan_ver == 2) {
+		if (data[IFLA_GRE_ERSPAN_DIR])
+			parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+		if (data[IFLA_GRE_ERSPAN_HWID])
+			parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+	}
 }
 
 static int ip6gre_tap_init(struct net_device *dev)
@@ -1303,6 +1710,59 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
 		       NETIF_F_HIGHDMA |		\
 		       NETIF_F_HW_CSUM)
 
+static int ip6erspan_tap_init(struct net_device *dev)
+{
+	struct ip6_tnl *tunnel;
+	int t_hlen;
+	int ret;
+
+	tunnel = netdev_priv(dev);
+
+	tunnel->dev = dev;
+	tunnel->net = dev_net(dev);
+	strcpy(tunnel->parms.name, dev->name);
+
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+	if (ret) {
+		free_percpu(dev->tstats);
+		dev->tstats = NULL;
+		return ret;
+	}
+
+	tunnel->tun_hlen = 8;
+	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
+		       erspan_hdr_len(tunnel->parms.erspan_ver);
+	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
+
+	dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+	dev->mtu = ETH_DATA_LEN - t_hlen;
+	if (dev->type == ARPHRD_ETHER)
+		dev->mtu -= ETH_HLEN;
+	if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+		dev->mtu -= 8;
+
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	tunnel = netdev_priv(dev);
+	ip6gre_tnl_link_config(tunnel, 1);
+
+	return 0;
+}
+
+static const struct net_device_ops ip6erspan_netdev_ops = {
+	.ndo_init =		ip6erspan_tap_init,
+	.ndo_uninit =		ip6gre_tunnel_uninit,
+	.ndo_start_xmit =	ip6erspan_tunnel_xmit,
+	.ndo_set_mac_address =	eth_mac_addr,
+	.ndo_validate_addr =	eth_validate_addr,
+	.ndo_change_mtu =	ip6_tnl_change_mtu,
+	.ndo_get_stats64 =	ip_tunnel_get_stats64,
+	.ndo_get_iflink =	ip6_tnl_get_iflink,
+};
+
 static void ip6gre_tap_setup(struct net_device *dev)
 {
 
@@ -1372,8 +1832,13 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 
 	ip6gre_netlink_parms(data, &nt->parms);
 
-	if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
-		return -EEXIST;
+	if (nt->parms.collect_md) {
+		if (rtnl_dereference(ign->collect_md_tun))
+			return -EEXIST;
+	} else {
+		if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
+			return -EEXIST;
+	}
 
 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
 		eth_hw_addr_random(dev);
@@ -1492,8 +1957,12 @@ static size_t ip6gre_get_size(const struct net_device *dev)
 		nla_total_size(2) +
 		/* IFLA_GRE_ENCAP_DPORT */
 		nla_total_size(2) +
+		/* IFLA_GRE_COLLECT_METADATA */
+		nla_total_size(0) +
 		/* IFLA_GRE_FWMARK */
 		nla_total_size(4) +
+		/* IFLA_GRE_ERSPAN_INDEX */
+		nla_total_size(4) +
 		0;
 }
 
@@ -1515,7 +1984,8 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) ||
 	    nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) ||
 	    nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) ||
-	    nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark))
+	    nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) ||
+	    nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index))
 		goto nla_put_failure;
 
 	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
@@ -1528,6 +1998,24 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			t->encap.flags))
 		goto nla_put_failure;
 
+	if (p->collect_md) {
+		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
+			goto nla_put_failure;
+	}
+
+	if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver))
+		goto nla_put_failure;
+
+	if (p->erspan_ver == 1) {
+		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index))
+			goto nla_put_failure;
+	} else if (p->erspan_ver == 2) {
+		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir))
+			goto nla_put_failure;
+		if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid))
+			goto nla_put_failure;
+	}
+
 	return 0;
 
 nla_put_failure:
@@ -1550,9 +2038,28 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_ENCAP_FLAGS]  = { .type = NLA_U16 },
 	[IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
 	[IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
+	[IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
 	[IFLA_GRE_FWMARK]       = { .type = NLA_U32 },
+	[IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
+	[IFLA_GRE_ERSPAN_VER]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_DIR]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_HWID]	= { .type = NLA_U16 },
 };
 
+static void ip6erspan_tap_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &ip6erspan_netdev_ops;
+	dev->needs_free_netdev = true;
+	dev->priv_destructor = ip6gre_dev_free;
+
+	dev->features |= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	netif_keep_dst(dev);
+}
+
 static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
 	.kind		= "ip6gre",
 	.maxtype	= IFLA_GRE_MAX,
@@ -1582,6 +2089,20 @@ static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = {
 	.get_link_net	= ip6_tnl_get_link_net,
 };
 
+static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = {
+	.kind		= "ip6erspan",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ip6gre_policy,
+	.priv_size	= sizeof(struct ip6_tnl),
+	.setup		= ip6erspan_tap_setup,
+	.validate	= ip6erspan_tap_validate,
+	.newlink	= ip6gre_newlink,
+	.changelink	= ip6gre_changelink,
+	.get_size	= ip6gre_get_size,
+	.fill_info	= ip6gre_fill_info,
+	.get_link_net	= ip6_tnl_get_link_net,
+};
+
 /*
  *	And now the modules code and kernel interface.
  */
@@ -1610,9 +2131,15 @@ static int __init ip6gre_init(void)
 	if (err < 0)
 		goto tap_ops_failed;
 
+	err = rtnl_link_register(&ip6erspan_tap_ops);
+	if (err < 0)
+		goto erspan_link_failed;
+
 out:
 	return err;
 
+erspan_link_failed:
+	rtnl_link_unregister(&ip6gre_tap_ops);
 tap_ops_failed:
 	rtnl_link_unregister(&ip6gre_link_ops);
 rtnl_link_failed:
@@ -1626,6 +2153,7 @@ static void __exit ip6gre_fini(void)
 {
 	rtnl_link_unregister(&ip6gre_tap_ops);
 	rtnl_link_unregister(&ip6gre_link_ops);
+	rtnl_link_unregister(&ip6erspan_tap_ops);
 	inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE);
 	unregister_pernet_device(&ip6gre_net_ops);
 }
@@ -1637,4 +2165,5 @@ MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
 MODULE_DESCRIPTION("GRE over IPv6 tunneling device");
 MODULE_ALIAS_RTNL_LINK("ip6gre");
 MODULE_ALIAS_RTNL_LINK("ip6gretap");
+MODULE_ALIAS_RTNL_LINK("ip6erspan");
 MODULE_ALIAS_NETDEV("ip6gre0");
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5110a418cc4d..176d74fb3b4d 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1201,13 +1201,13 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
 	else
 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
-		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+		      rt->dst.dev->mtu : dst_mtu(xfrm_dst_path(&rt->dst));
 	if (np->frag_size < mtu) {
 		if (np->frag_size)
 			mtu = np->frag_size;
 	}
 	cork->base.fragsize = mtu;
-	if (dst_allfrag(rt->dst.path))
+	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
 		cork->base.flags |= IPCORK_ALLFRAG;
 	cork->base.length = 0;
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index db84f523656d..6ff2f21ae3fc 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -861,7 +861,7 @@ int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb,
 		struct metadata_dst *tun_dst,
 		bool log_ecn_err)
 {
-	return __ip6_tnl_rcv(t, skb, tpi, NULL, ip6ip6_dscp_ecn_decapsulate,
+	return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate,
 			     log_ecn_err);
 }
 EXPORT_SYMBOL(ip6_tnl_rcv);
@@ -979,6 +979,9 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
 	int ret = 0;
 	struct net *net = t->net;
 
+	if (t->parms.collect_md)
+		return 1;
+
 	if ((p->flags & IP6_TNL_F_CAP_XMIT) ||
 	    ((p->flags & IP6_TNL_F_CAP_PER_PACKET) &&
 	     (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index a2e1a864eb46..890f9bda06a4 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1425,10 +1425,13 @@ int __init ip6_mr_init(void)
 		goto add_proto_fail;
 	}
 #endif
-	rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL,
-		      ip6mr_rtm_dumproute, 0);
-	return 0;
+	err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE,
+				   NULL, ip6mr_rtm_dumproute, 0);
+	if (err == 0)
+		return 0;
+
 #ifdef CONFIG_IPV6_PIMSM_V2
+	inet6_del_protocol(&pim6_protocol, IPPROTO_PIM);
 add_proto_fail:
 	unregister_netdevice_notifier(&ip6_mr_notifier);
 #endif
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7a8d1500d374..b3f4d19b3ca5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -186,7 +186,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
 
 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
 {
-	return dst_metrics_write_ptr(rt->dst.from);
+	return dst_metrics_write_ptr(&rt->from->dst);
 }
 
 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
@@ -391,7 +391,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
 	struct rt6_exception_bucket *bucket;
-	struct dst_entry *from = dst->from;
+	struct rt6_info *from = rt->from;
 	struct inet6_dev *idev;
 
 	dst_destroy_metrics_generic(dst);
@@ -409,8 +409,8 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 		kfree(bucket);
 	}
 
-	dst->from = NULL;
-	dst_release(from);
+	rt->from = NULL;
+	dst_release(&from->dst);
 }
 
 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -443,9 +443,9 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	if (rt->rt6i_flags & RTF_EXPIRES) {
 		if (time_after(jiffies, rt->dst.expires))
 			return true;
-	} else if (rt->dst.from) {
+	} else if (rt->from) {
 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
-		       rt6_check_expired((struct rt6_info *)rt->dst.from);
+			rt6_check_expired(rt->from);
 	}
 	return false;
 }
@@ -502,7 +502,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
 	if (!oif && ipv6_addr_any(saddr))
 		goto out;
 
-	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
+	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
 		struct net_device *dev = sprt->dst.dev;
 
 		if (oif) {
@@ -721,7 +721,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 
 	match = NULL;
 	cont = NULL;
-	for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
+	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
 		if (rt->rt6i_metric != metric) {
 			cont = rt;
 			break;
@@ -731,7 +731,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 	}
 
 	for (rt = leaf; rt && rt != rr_head;
-	     rt = rcu_dereference(rt->dst.rt6_next)) {
+	     rt = rcu_dereference(rt->rt6_next)) {
 		if (rt->rt6i_metric != metric) {
 			cont = rt;
 			break;
@@ -743,7 +743,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 	if (match || !cont)
 		return match;
 
-	for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
+	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
 
 	return match;
@@ -781,7 +781,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
 			     &do_rr);
 
 	if (do_rr) {
-		struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
+		struct rt6_info *next = rcu_dereference(rt0->rt6_next);
 
 		/* no entries matched; do round-robin */
 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
@@ -1054,7 +1054,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 	 */
 
 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
-		ort = (struct rt6_info *)ort->dst.from;
+		ort = ort->from;
 
 	rcu_read_lock();
 	dev = ip6_rt_get_dev_rcu(ort);
@@ -1274,7 +1274,7 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 
 	/* ort can't be a cache or pcpu route */
 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
-		ort = (struct rt6_info *)ort->dst.from;
+		ort = ort->from;
 	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
 
 	spin_lock_bh(&rt6_exception_lock);
@@ -1415,8 +1415,8 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
 /* Remove the passed in cached rt from the hash table that contains it */
 int rt6_remove_exception_rt(struct rt6_info *rt)
 {
-	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
 	struct rt6_exception_bucket *bucket;
+	struct rt6_info *from = rt->from;
 	struct in6_addr *src_key = NULL;
 	struct rt6_exception *rt6_ex;
 	int err;
@@ -1460,8 +1460,8 @@ int rt6_remove_exception_rt(struct rt6_info *rt)
  */
 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
 {
-	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
 	struct rt6_exception_bucket *bucket;
+	struct rt6_info *from = rt->from;
 	struct in6_addr *src_key = NULL;
 	struct rt6_exception *rt6_ex;
 
@@ -1929,9 +1929,9 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
 
 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
 {
-	if (rt->dst.from &&
-	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
-		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
+	if (rt->from &&
+	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
+		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
 }
 
 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
@@ -1951,7 +1951,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
 {
 	if (!__rt6_check_expired(rt) &&
 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
-	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
+	    rt6_check(rt->from, cookie))
 		return &rt->dst;
 	else
 		return NULL;
@@ -1971,7 +1971,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 	rt6_dst_from_metrics_check(rt);
 
 	if (rt->rt6i_flags & RTF_PCPU ||
-	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
+	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
 		return rt6_dst_from_check(rt, cookie);
 	else
 		return rt6_check(rt, cookie);
@@ -3055,11 +3055,11 @@ out:
 
 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 {
-	BUG_ON(from->dst.from);
+	BUG_ON(from->from);
 
 	rt->rt6i_flags &= ~RTF_EXPIRES;
 	dst_hold(&from->dst);
-	rt->dst.from = &from->dst;
+	rt->from = from;
 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
 }
 
@@ -4596,8 +4596,6 @@ static int __net_init ip6_route_net_init(struct net *net)
 					   GFP_KERNEL);
 	if (!net->ipv6.ip6_null_entry)
 		goto out_ip6_dst_entries;
-	net->ipv6.ip6_null_entry->dst.path =
-		(struct dst_entry *)net->ipv6.ip6_null_entry;
 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
 			 ip6_template_metrics, true);
@@ -4609,8 +4607,6 @@ static int __net_init ip6_route_net_init(struct net *net)
 					       GFP_KERNEL);
 	if (!net->ipv6.ip6_prohibit_entry)
 		goto out_ip6_null_entry;
-	net->ipv6.ip6_prohibit_entry->dst.path =
-		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
 			 ip6_template_metrics, true);
@@ -4620,8 +4616,6 @@ static int __net_init ip6_route_net_init(struct net *net)
 					       GFP_KERNEL);
 	if (!net->ipv6.ip6_blk_hole_entry)
 		goto out_ip6_prohibit_entry;
-	net->ipv6.ip6_blk_hole_entry->dst.path =
-		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
 			 ip6_template_metrics, true);
@@ -4778,11 +4772,20 @@ int __init ip6_route_init(void)
 	if (ret)
 		goto fib6_rules_init;
 
-	ret = -ENOBUFS;
-	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
-	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
-	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
-			    RTNL_FLAG_DOIT_UNLOCKED))
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
+				   inet6_rtm_newroute, NULL, 0);
+	if (ret < 0)
+		goto out_register_late_subsys;
+
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
+				   inet6_rtm_delroute, NULL, 0);
+	if (ret < 0)
+		goto out_register_late_subsys;
+
+	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
+				   inet6_rtm_getroute, NULL,
+				   RTNL_FLAG_DOIT_UNLOCKED);
+	if (ret < 0)
 		goto out_register_late_subsys;
 
 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
@@ -4800,6 +4803,7 @@ out:
 	return ret;
 
 out_register_late_subsys:
+	rtnl_unregister_all(PF_INET6);
 	unregister_pernet_subsys(&ip6_route_net_late_ops);
 fib6_rules_init:
 	fib6_rules_cleanup();
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index c81407770956..7f5621d09571 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -306,9 +306,7 @@ static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
 	struct seg6_hmac_info *hinfo;
 	int ret;
 
-	ret = rhashtable_walk_start(iter);
-	if (ret && ret != -EAGAIN)
-		goto done;
+	rhashtable_walk_start(iter);
 
 	for (;;) {
 		hinfo = rhashtable_walk_next(iter);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3f30fa313bf2..eecf9f0faf29 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -89,28 +89,12 @@ static u32 udp6_ehashfn(const struct net *net,
 			       udp_ipv6_hash_secret + net_hash_mix(net));
 }
 
-static u32 udp6_portaddr_hash(const struct net *net,
-			      const struct in6_addr *addr6,
-			      unsigned int port)
-{
-	unsigned int hash, mix = net_hash_mix(net);
-
-	if (ipv6_addr_any(addr6))
-		hash = jhash_1word(0, mix);
-	else if (ipv6_addr_v4mapped(addr6))
-		hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
-	else
-		hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);
-
-	return hash ^ port;
-}
-
 int udp_v6_get_port(struct sock *sk, unsigned short snum)
 {
 	unsigned int hash2_nulladdr =
-		udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
+		ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
 	unsigned int hash2_partial =
-		udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
+		ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
 
 	/* precompute partial secondary hash */
 	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -119,7 +103,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
 
 static void udp_v6_rehash(struct sock *sk)
 {
-	u16 new_hash = udp6_portaddr_hash(sock_net(sk),
+	u16 new_hash = ipv6_portaddr_hash(sock_net(sk),
 					  &sk->sk_v6_rcv_saddr,
 					  inet_sk(sk)->inet_num);
 
@@ -184,7 +168,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 		struct udp_hslot *hslot2, struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	int score, badness, matches = 0, reuseport = 0;
+	int score, badness;
 	u32 hash = 0;
 
 	result = NULL;
@@ -193,8 +177,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 		score = compute_score(sk, net, saddr, sport,
 				      daddr, hnum, dif, sdif, exact_dif);
 		if (score > badness) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				hash = udp6_ehashfn(net, daddr, hnum,
 						    saddr, sport);
 
@@ -202,15 +185,9 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 							sizeof(struct udphdr));
 				if (result)
 					return result;
-				matches = 1;
 			}
 			result = sk;
 			badness = score;
-		} else if (score == badness && reuseport) {
-			matches++;
-			if (reciprocal_scale(hash, matches) == 0)
-				result = sk;
-			hash = next_pseudo_random32(hash);
 		}
 	}
 	return result;
@@ -228,11 +205,11 @@ struct sock *__udp6_lib_lookup(struct net *net,
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
 	bool exact_dif = udp6_lib_exact_dif_match(net, skb);
-	int score, badness, matches = 0, reuseport = 0;
+	int score, badness;
 	u32 hash = 0;
 
 	if (hslot->count > 10) {
-		hash2 = udp6_portaddr_hash(net, daddr, hnum);
+		hash2 = ipv6_portaddr_hash(net, daddr, hnum);
 		slot2 = hash2 & udptable->mask;
 		hslot2 = &udptable->hash2[slot2];
 		if (hslot->count < hslot2->count)
@@ -243,7 +220,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 					  hslot2, skb);
 		if (!result) {
 			unsigned int old_slot2 = slot2;
-			hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
+			hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
 			slot2 = hash2 & udptable->mask;
 			/* avoid searching the same slot again. */
 			if (unlikely(slot2 == old_slot2))
@@ -267,23 +244,16 @@ begin:
 		score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
 				      sdif, exact_dif);
 		if (score > badness) {
-			reuseport = sk->sk_reuseport;
-			if (reuseport) {
+			if (sk->sk_reuseport) {
 				hash = udp6_ehashfn(net, daddr, hnum,
 						    saddr, sport);
 				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
 				if (result)
 					return result;
-				matches = 1;
 			}
 			result = sk;
 			badness = score;
-		} else if (score == badness && reuseport) {
-			matches++;
-			if (reciprocal_scale(hash, matches) == 0)
-				result = sk;
-			hash = next_pseudo_random32(hash);
 		}
 	}
 	return result;
@@ -719,9 +689,9 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	struct sk_buff *nskb;
 
 	if (use_hash2) {
-		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+		hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) &
 			    udptable->mask;
-		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask;
+		hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask;
 start_lookup:
 		hslot = &udptable->hash2[hash2];
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
@@ -909,7 +879,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
 			int dif, int sdif)
 {
 	unsigned short hnum = ntohs(loc_port);
-	unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum);
+	unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
 	unsigned int slot2 = hash2 & udp_table.mask;
 	struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
 	const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
index 02556e356f87..e66b94f46532 100644
--- a/net/ipv6/xfrm6_mode_tunnel.c
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -59,7 +59,7 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 	if (x->props.flags & XFRM_STATE_NOECN)
 		dsfield &= ~INET_ECN_MASK;
 	ipv6_change_dsfield(top_iph, 0, dsfield);
-	top_iph->hop_limit = ip6_dst_hoplimit(dst->child);
+	top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst));
 	top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
 	top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
 	return 0;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 885ade234a49..09fb44ee3b45 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -265,7 +265,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 			in6_dev_put(xdst->u.rt6.rt6i_idev);
 			xdst->u.rt6.rt6i_idev = loopback_idev;
 			in6_dev_hold(loopback_idev);
-			xdst = (struct xfrm_dst *)xdst->u.dst.child;
+			xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst);
 		} while (xdst->u.dst.xfrm);
 
 		__in6_dev_put(loopback_idev);
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 86c8dfef56a4..a5125624a76d 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -257,9 +257,7 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx)
 	if (ret)
 		return NULL;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto err;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -269,7 +267,6 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx)
 		if (i++ == idx)
 			break;
 	}
-err:
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 
@@ -513,9 +510,7 @@ void mesh_plink_broken(struct sta_info *sta)
 	if (ret)
 		return;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -535,7 +530,6 @@ void mesh_plink_broken(struct sta_info *sta)
 				WLAN_REASON_MESH_PATH_DEST_UNREACHABLE, bcast);
 		}
 	}
-out:
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 }
@@ -584,9 +578,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta)
 	if (ret)
 		return;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -597,7 +589,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta)
 		if (rcu_access_pointer(mpath->next_hop) == sta)
 			__mesh_path_del(tbl, mpath);
 	}
-out:
+
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 }
@@ -614,9 +606,7 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata,
 	if (ret)
 		return;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -627,7 +617,7 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata,
 		if (ether_addr_equal(mpath->mpp, proxy))
 			__mesh_path_del(tbl, mpath);
 	}
-out:
+
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 }
@@ -642,9 +632,7 @@ static void table_flush_by_iface(struct mesh_table *tbl)
 	if (ret)
 		return;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -653,7 +641,7 @@ static void table_flush_by_iface(struct mesh_table *tbl)
 			break;
 		__mesh_path_del(tbl, mpath);
 	}
-out:
+
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 }
@@ -873,9 +861,7 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata,
 	if (ret)
 		return;
 
-	ret = rhashtable_walk_start(&iter);
-	if (ret && ret != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&iter);
 
 	while ((mpath = rhashtable_walk_next(&iter))) {
 		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
@@ -887,7 +873,7 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata,
 		     time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE))
 			__mesh_path_del(tbl, mpath);
 	}
-out:
+
 	rhashtable_walk_stop(&iter);
 	rhashtable_walk_exit(&iter);
 }
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 8ca9915befc8..5dce8336d33f 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2510,12 +2510,15 @@ static int __init mpls_init(void)
 
 	rtnl_af_register(&mpls_af_ops);
 
-	rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 0);
-	rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, 0);
-	rtnl_register(PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes,
-		      0);
-	rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
-		      mpls_netconf_dump_devconf, 0);
+	rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_NEWROUTE,
+			     mpls_rtm_newroute, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_DELROUTE,
+			     mpls_rtm_delroute, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETROUTE,
+			     mpls_getroute, mpls_dump_routes, 0);
+	rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETNETCONF,
+			     mpls_netconf_get_devconf,
+			     mpls_netconf_dump_devconf, 0);
 	err = ipgre_tunnel_encap_add_mpls_ops();
 	if (err)
 		pr_err("Can't add mpls over gre tunnel ops\n");
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index f8166c1d5430..3f1624ee056f 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -251,11 +251,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
 	if (err)
 		return;
 
-	err = rhashtable_walk_start(&hti);
-	if (err && err != -EAGAIN) {
-		iter->err = err;
-		goto out;
-	}
+	rhashtable_walk_start(&hti);
 
 	while ((he = rhashtable_walk_next(&hti))) {
 		if (IS_ERR(he)) {
@@ -306,9 +302,7 @@ static void nft_rhash_gc(struct work_struct *work)
 	if (err)
 		goto schedule;
 
-	err = rhashtable_walk_start(&hti);
-	if (err && err != -EAGAIN)
-		goto out;
+	rhashtable_walk_start(&hti);
 
 	while ((he = rhashtable_walk_next(&hti))) {
 		if (IS_ERR(he)) {
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index 2b4ab189bba7..5639fb03bdd9 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -93,7 +93,8 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info,
 	if (dst->xfrm == NULL)
 		return -1;
 
-	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+	for (i = 0; dst && dst->xfrm;
+	     dst = ((struct xfrm_dst *)dst)->child, i++) {
 		pos = strict ? i : 0;
 		if (pos >= info->len)
 			return 0;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 79cc1bf36e4a..972bfe113043 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -65,6 +65,7 @@
 #include <linux/net_namespace.h>
 
 #include <net/net_namespace.h>
+#include <net/netns/generic.h>
 #include <net/sock.h>
 #include <net/scm.h>
 #include <net/netlink.h>
@@ -145,8 +146,6 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
 
 static BLOCKING_NOTIFIER_HEAD(netlink_chain);
 
-static DEFINE_SPINLOCK(netlink_tap_lock);
-static struct list_head netlink_tap_all __read_mostly;
 
 static const struct rhashtable_params netlink_rhashtable_params;
 
@@ -173,14 +172,24 @@ static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb,
 	return new;
 }
 
+static unsigned int netlink_tap_net_id;
+
+struct netlink_tap_net {
+	struct list_head netlink_tap_all;
+	struct mutex netlink_tap_lock;
+};
+
 int netlink_add_tap(struct netlink_tap *nt)
 {
+	struct net *net = dev_net(nt->dev);
+	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
+
 	if (unlikely(nt->dev->type != ARPHRD_NETLINK))
 		return -EINVAL;
 
-	spin_lock(&netlink_tap_lock);
-	list_add_rcu(&nt->list, &netlink_tap_all);
-	spin_unlock(&netlink_tap_lock);
+	mutex_lock(&nn->netlink_tap_lock);
+	list_add_rcu(&nt->list, &nn->netlink_tap_all);
+	mutex_unlock(&nn->netlink_tap_lock);
 
 	__module_get(nt->module);
 
@@ -190,12 +199,14 @@ EXPORT_SYMBOL_GPL(netlink_add_tap);
 
 static int __netlink_remove_tap(struct netlink_tap *nt)
 {
+	struct net *net = dev_net(nt->dev);
+	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
 	bool found = false;
 	struct netlink_tap *tmp;
 
-	spin_lock(&netlink_tap_lock);
+	mutex_lock(&nn->netlink_tap_lock);
 
-	list_for_each_entry(tmp, &netlink_tap_all, list) {
+	list_for_each_entry(tmp, &nn->netlink_tap_all, list) {
 		if (nt == tmp) {
 			list_del_rcu(&nt->list);
 			found = true;
@@ -205,7 +216,7 @@ static int __netlink_remove_tap(struct netlink_tap *nt)
 
 	pr_warn("__netlink_remove_tap: %p not found\n", nt);
 out:
-	spin_unlock(&netlink_tap_lock);
+	mutex_unlock(&nn->netlink_tap_lock);
 
 	if (found)
 		module_put(nt->module);
@@ -224,6 +235,26 @@ int netlink_remove_tap(struct netlink_tap *nt)
 }
 EXPORT_SYMBOL_GPL(netlink_remove_tap);
 
+static __net_init int netlink_tap_init_net(struct net *net)
+{
+	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
+
+	INIT_LIST_HEAD(&nn->netlink_tap_all);
+	mutex_init(&nn->netlink_tap_lock);
+	return 0;
+}
+
+static void __net_exit netlink_tap_exit_net(struct net *net)
+{
+}
+
+static struct pernet_operations netlink_tap_net_ops = {
+	.init = netlink_tap_init_net,
+	.exit = netlink_tap_exit_net,
+	.id   = &netlink_tap_net_id,
+	.size = sizeof(struct netlink_tap_net),
+};
+
 static bool netlink_filter_tap(const struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
@@ -277,7 +308,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 	return ret;
 }
 
-static void __netlink_deliver_tap(struct sk_buff *skb)
+static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn)
 {
 	int ret;
 	struct netlink_tap *tmp;
@@ -285,19 +316,21 @@ static void __netlink_deliver_tap(struct sk_buff *skb)
 	if (!netlink_filter_tap(skb))
 		return;
 
-	list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
+	list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) {
 		ret = __netlink_deliver_tap_skb(skb, tmp->dev);
 		if (unlikely(ret))
 			break;
 	}
 }
 
-static void netlink_deliver_tap(struct sk_buff *skb)
+static void netlink_deliver_tap(struct net *net, struct sk_buff *skb)
 {
+	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
+
 	rcu_read_lock();
 
-	if (unlikely(!list_empty(&netlink_tap_all)))
-		__netlink_deliver_tap(skb);
+	if (unlikely(!list_empty(&nn->netlink_tap_all)))
+		__netlink_deliver_tap(skb, nn);
 
 	rcu_read_unlock();
 }
@@ -306,7 +339,7 @@ static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
 				       struct sk_buff *skb)
 {
 	if (!(netlink_is_kernel(dst) && netlink_is_kernel(src)))
-		netlink_deliver_tap(skb);
+		netlink_deliver_tap(sock_net(dst), skb);
 }
 
 static void netlink_overrun(struct sock *sk)
@@ -1216,7 +1249,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
 {
 	int len = skb->len;
 
-	netlink_deliver_tap(skb);
+	netlink_deliver_tap(sock_net(sk), skb);
 
 	skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk);
@@ -2481,8 +2514,9 @@ static int netlink_walk_start(struct nl_seq_iter *iter)
 		return err;
 	}
 
-	err = rhashtable_walk_start(&iter->hti);
-	return err == -EAGAIN ? 0 : err;
+	rhashtable_walk_start(&iter->hti);
+
+	return 0;
 }
 
 static void netlink_walk_stop(struct nl_seq_iter *iter)
@@ -2733,12 +2767,11 @@ static int __init netlink_proto_init(void)
 		}
 	}
 
-	INIT_LIST_HEAD(&netlink_tap_all);
-
 	netlink_add_usersock_entry();
 
 	sock_register(&netlink_family_ops);
 	register_pernet_subsys(&netlink_net_ops);
+	register_pernet_subsys(&netlink_tap_net_ops);
 	/* The netlink device handler may be needed early. */
 	rtnetlink_init();
 out:
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 8faa20b4d457..7dda33b9b784 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -115,11 +115,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 	if (!s_num)
 		rhashtable_walk_enter(&tbl->hash, hti);
 
-	ret = rhashtable_walk_start(hti);
-	if (ret == -EAGAIN)
-		ret = 0;
-	if (ret)
-		goto stop;
+	rhashtable_walk_start(hti);
 
 	while ((nlsk = rhashtable_walk_next(hti))) {
 		if (IS_ERR(nlsk)) {
@@ -146,8 +142,8 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 		}
 	}
 
-stop:
 	rhashtable_walk_stop(hti);
+
 	if (ret)
 		goto done;
 
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index dbe2379329c5..76d050aba7a4 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -56,12 +56,12 @@
 
 u64 ovs_flow_used_time(unsigned long flow_jiffies)
 {
-	struct timespec cur_ts;
+	struct timespec64 cur_ts;
 	u64 cur_ms, idle_ms;
 
-	ktime_get_ts(&cur_ts);
+	ktime_get_ts64(&cur_ts);
 	idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
-	cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
+	cur_ms = (u64)(u32)cur_ts.tv_sec * MSEC_PER_SEC +
 		 cur_ts.tv_nsec / NSEC_PER_MSEC;
 
 	return cur_ms - idle_ms;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 624ea74353dd..bce1f78b0de5 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -644,12 +644,12 @@ static int erspan_tun_opt_from_nlattr(const struct nlattr *attr,
 	BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
 
 	memset(&opts, 0, sizeof(opts));
-	opts.index = nla_get_be32(attr);
+	opts.u.index = nla_get_be32(attr);
 
 	/* Index has only 20-bit */
-	if (ntohl(opts.index) & ~INDEX_MASK) {
+	if (ntohl(opts.u.index) & ~INDEX_MASK) {
 		OVS_NLERR(log, "ERSPAN index number %x too large.",
-			  ntohl(opts.index));
+			  ntohl(opts.u.index));
 		return -EINVAL;
 	}
 
@@ -907,7 +907,7 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
 			return -EMSGSIZE;
 		else if (output->tun_flags & TUNNEL_ERSPAN_OPT &&
 			 nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
-				      ((struct erspan_metadata *)tun_opts)->index))
+				      ((struct erspan_metadata *)tun_opts)->u.index))
 			return -EMSGSIZE;
 	}
 
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 04a3128adcf0..3e7747549f90 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -126,18 +126,12 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	}
 }
 
-static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
-{
-	dev->needed_headroom = new_hr < 0 ? 0 : new_hr;
-}
-
 static const struct net_device_ops internal_dev_netdev_ops = {
 	.ndo_open = internal_dev_open,
 	.ndo_stop = internal_dev_stop,
 	.ndo_start_xmit = internal_dev_xmit,
 	.ndo_set_mac_address = eth_mac_addr,
 	.ndo_get_stats64 = internal_get_stats,
-	.ndo_set_rx_headroom = internal_set_rx_headroom,
 };
 
 static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -154,7 +148,7 @@ static void do_setup(struct net_device *netdev)
 
 	netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
 	netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
-			      IFF_PHONY_HEADROOM | IFF_NO_QUEUE;
+			      IFF_NO_QUEUE;
 	netdev->needs_free_netdev = true;
 	netdev->priv_destructor = internal_dev_destructor;
 	netdev->ethtool_ops = &internal_dev_ethtool_ops;
@@ -195,7 +189,6 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
 		err = -ENOMEM;
 		goto error_free_netdev;
 	}
-	vport->dev->needed_headroom = vport->dp->max_headroom;
 
 	dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
 	internal_dev = internal_dev_priv(vport->dev);
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index da754fc926e7..871eaf2cb85e 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -299,16 +299,21 @@ out:
 
 int __init phonet_netlink_register(void)
 {
-	int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit,
-				  NULL, 0);
+	int err = rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWADDR,
+				       addr_doit, NULL, 0);
 	if (err)
 		return err;
 
-	/* Further __rtnl_register() cannot fail */
-	__rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL, 0);
-	__rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, 0);
-	__rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL, 0);
-	__rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0);
-	__rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, 0);
+	/* Further rtnl_register_module() cannot fail */
+	rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELADDR,
+			     addr_doit, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETADDR,
+			     NULL, getaddr_dumpit, 0);
+	rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWROUTE,
+			     route_doit, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELROUTE,
+			     route_doit, NULL, 0);
+	rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETROUTE,
+			     NULL, route_dumpit, 0);
 	return 0;
 }
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 77ab05e23001..5fb3929e3d7d 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1116,9 +1116,13 @@ static int __init qrtr_proto_init(void)
 		return rc;
 	}
 
-	rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0);
+	rc = rtnl_register_module(THIS_MODULE, PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0);
+	if (rc) {
+		sock_unregister(qrtr_family.family);
+		proto_unregister(&qrtr_proto);
+	}
 
-	return 0;
+	return rc;
 }
 postcore_initcall(qrtr_proto_init);
 
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7ee2d5d68b78..6492c0b608a4 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -230,8 +230,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 
 	rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
 	  conn, &laddr, &faddr,
-	  trans->t_name ? trans->t_name : "[unknown]",
-	  is_outgoing ? "(outgoing)" : "");
+	  strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name :
+	  "[unknown]", is_outgoing ? "(outgoing)" : "");
 
 	/*
 	 * Since we ran without holding the conn lock, someone could
@@ -366,6 +366,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp)
 	 * to the conn hash, so we never trigger a reconnect on this
 	 * conn - the reconnect is always triggered by the active peer. */
 	cancel_delayed_work_sync(&cp->cp_conn_w);
+	if (conn->c_destroy_in_prog)
+		return;
 	rcu_read_lock();
 	if (!hlist_unhashed(&conn->c_hash_node)) {
 		rcu_read_unlock();
@@ -445,7 +447,6 @@ void rds_conn_destroy(struct rds_connection *conn)
 	 */
 	rds_cong_remove_conn(conn);
 
-	put_net(conn->c_net);
 	kfree(conn->c_path);
 	kmem_cache_free(rds_conn_slab, conn);
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c349c71babff..d09f6c1facb4 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -150,7 +150,7 @@ struct rds_connection {
 
 	/* Protocol version */
 	unsigned int		c_version;
-	struct net		*c_net;
+	possible_net_t		c_net;
 
 	struct list_head	c_map_item;
 	unsigned long		c_map_queued;
@@ -165,13 +165,13 @@ struct rds_connection {
 static inline
 struct net *rds_conn_net(struct rds_connection *conn)
 {
-	return conn->c_net;
+	return read_pnet(&conn->c_net);
 }
 
 static inline
 void rds_conn_net_set(struct rds_connection *conn, struct net *net)
 {
-	conn->c_net = get_net(net);
+	write_pnet(&conn->c_net, net);
 }
 
 #define RDS_FLAG_CONG_BITMAP	0x01
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 6b7ee71f40c6..39f502d47969 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -306,7 +306,8 @@ static void rds_tcp_conn_free(void *arg)
 	rdsdebug("freeing tc %p\n", tc);
 
 	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
-	list_del(&tc->t_tcp_node);
+	if (!tc->t_tcp_node_detached)
+		list_del(&tc->t_tcp_node);
 	spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
 
 	kmem_cache_free(rds_tcp_conn_slab, tc);
@@ -495,27 +496,6 @@ static struct pernet_operations rds_tcp_net_ops = {
 	.size = sizeof(struct rds_tcp_net),
 };
 
-/* explicitly send a RST on each socket, thereby releasing any socket refcnts
- * that may otherwise hold up netns deletion.
- */
-static void rds_tcp_conn_paths_destroy(struct rds_connection *conn)
-{
-	struct rds_conn_path *cp;
-	struct rds_tcp_connection *tc;
-	int i;
-	struct sock *sk;
-
-	for (i = 0; i < RDS_MPATH_WORKERS; i++) {
-		cp = &conn->c_path[i];
-		tc = cp->cp_transport_data;
-		if (!tc->t_sock)
-			continue;
-		sk = tc->t_sock->sk;
-		sk->sk_prot->disconnect(sk, 0);
-		tcp_done(sk);
-	}
-}
-
 static void rds_tcp_kill_sock(struct net *net)
 {
 	struct rds_tcp_connection *tc, *_tc;
@@ -527,18 +507,20 @@ static void rds_tcp_kill_sock(struct net *net)
 	rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
 	spin_lock_irq(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
-		struct net *c_net = tc->t_cpath->cp_conn->c_net;
+		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
 
 		if (net != c_net || !tc->t_sock)
 			continue;
-		if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn))
+		if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) {
 			list_move_tail(&tc->t_tcp_node, &tmp_list);
+		} else {
+			list_del(&tc->t_tcp_node);
+			tc->t_tcp_node_detached = true;
+		}
 	}
 	spin_unlock_irq(&rds_tcp_conn_lock);
-	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
-		rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn);
+	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
 		rds_conn_destroy(tc->t_cpath->cp_conn);
-	}
 }
 
 void *rds_tcp_listen_sock_def_readable(struct net *net)
@@ -586,7 +568,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
 
 	spin_lock_irq(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
-		struct net *c_net = tc->t_cpath->cp_conn->c_net;
+		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
 
 		if (net != c_net || !tc->t_sock)
 			continue;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 1aafbf7c3011..e7858ee8ed8b 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -12,6 +12,7 @@ struct rds_tcp_incoming {
 struct rds_tcp_connection {
 
 	struct list_head	t_tcp_node;
+	bool			t_tcp_node_detached;
 	struct rds_conn_path	*t_cpath;
 	/* t_conn_path_lock synchronizes the connection establishment between
 	 * rds_tcp_accept_one and rds_tcp_conn_path_connect
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4d33a50a8a6d..52622a3d2517 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -99,7 +99,7 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
 		p->tcfa_refcnt--;
 		if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
 			if (p->ops->cleanup)
-				p->ops->cleanup(p, bind);
+				p->ops->cleanup(p);
 			tcf_idr_remove(p->idrinfo, p);
 			ret = ACT_P_DELETED;
 		}
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 5ef8ce8c83d4..b3f2c15affa7 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -357,7 +357,7 @@ out:
 	return ret;
 }
 
-static void tcf_bpf_cleanup(struct tc_action *act, int bind)
+static void tcf_bpf_cleanup(struct tc_action *act)
 {
 	struct tcf_bpf_cfg tmp;
 
@@ -401,16 +401,14 @@ static __net_init int bpf_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_bpf_ops);
 }
 
-static void __net_exit bpf_exit_net(struct net *net)
+static void __net_exit bpf_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, bpf_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, bpf_net_id);
 }
 
 static struct pernet_operations bpf_net_ops = {
 	.init = bpf_init_net,
-	.exit = bpf_exit_net,
+	.exit_batch = bpf_exit_net,
 	.id   = &bpf_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 10b7a8855a6c..2b15ba84e0c8 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -209,16 +209,14 @@ static __net_init int connmark_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_connmark_ops);
 }
 
-static void __net_exit connmark_exit_net(struct net *net)
+static void __net_exit connmark_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, connmark_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, connmark_net_id);
 }
 
 static struct pernet_operations connmark_net_ops = {
 	.init = connmark_init_net,
-	.exit = connmark_exit_net,
+	.exit_batch = connmark_exit_net,
 	.id   = &connmark_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index d836f998117b..af4b8ec60d9a 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -635,16 +635,14 @@ static __net_init int csum_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_csum_ops);
 }
 
-static void __net_exit csum_exit_net(struct net *net)
+static void __net_exit csum_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, csum_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, csum_net_id);
 }
 
 static struct pernet_operations csum_net_ops = {
 	.init = csum_init_net,
-	.exit = csum_exit_net,
+	.exit_batch = csum_exit_net,
 	.id   = &csum_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e29a48ef7fc3..9d632e92cad0 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -235,16 +235,14 @@ static __net_init int gact_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_gact_ops);
 }
 
-static void __net_exit gact_exit_net(struct net *net)
+static void __net_exit gact_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, gact_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, gact_net_id);
 }
 
 static struct pernet_operations gact_net_ops = {
 	.init = gact_init_net,
-	.exit = gact_exit_net,
+	.exit_batch = gact_exit_net,
 	.id   = &gact_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 3007cb1310ea..5954e992685a 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -387,7 +387,7 @@ out_nlmsg_trim:
 }
 
 /* under ife->tcf_lock */
-static void _tcf_ife_cleanup(struct tc_action *a, int bind)
+static void _tcf_ife_cleanup(struct tc_action *a)
 {
 	struct tcf_ife_info *ife = to_ife(a);
 	struct tcf_meta_info *e, *n;
@@ -405,13 +405,13 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind)
 	}
 }
 
-static void tcf_ife_cleanup(struct tc_action *a, int bind)
+static void tcf_ife_cleanup(struct tc_action *a)
 {
 	struct tcf_ife_info *ife = to_ife(a);
 	struct tcf_ife_params *p;
 
 	spin_lock_bh(&ife->tcf_lock);
-	_tcf_ife_cleanup(a, bind);
+	_tcf_ife_cleanup(a);
 	spin_unlock_bh(&ife->tcf_lock);
 
 	p = rcu_dereference_protected(ife->params, 1);
@@ -546,7 +546,7 @@ metadata_parse_err:
 			if (exists)
 				tcf_idr_release(*a, bind);
 			if (ret == ACT_P_CREATED)
-				_tcf_ife_cleanup(*a, bind);
+				_tcf_ife_cleanup(*a);
 
 			if (exists)
 				spin_unlock_bh(&ife->tcf_lock);
@@ -567,7 +567,7 @@ metadata_parse_err:
 		err = use_all_metadata(ife);
 		if (err) {
 			if (ret == ACT_P_CREATED)
-				_tcf_ife_cleanup(*a, bind);
+				_tcf_ife_cleanup(*a);
 
 			if (exists)
 				spin_unlock_bh(&ife->tcf_lock);
@@ -858,16 +858,14 @@ static __net_init int ife_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_ife_ops);
 }
 
-static void __net_exit ife_exit_net(struct net *net)
+static void __net_exit ife_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, ife_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, ife_net_id);
 }
 
 static struct pernet_operations ife_net_ops = {
 	.init = ife_init_net,
-	.exit = ife_exit_net,
+	.exit_batch = ife_exit_net,
 	.id   = &ife_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d9e399a7e3d5..06e380ae0928 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -77,7 +77,7 @@ static void ipt_destroy_target(struct xt_entry_target *t)
 	module_put(par.target->me);
 }
 
-static void tcf_ipt_release(struct tc_action *a, int bind)
+static void tcf_ipt_release(struct tc_action *a)
 {
 	struct tcf_ipt *ipt = to_ipt(a);
 	ipt_destroy_target(ipt->tcfi_t);
@@ -337,16 +337,14 @@ static __net_init int ipt_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_ipt_ops);
 }
 
-static void __net_exit ipt_exit_net(struct net *net)
+static void __net_exit ipt_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, ipt_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, ipt_net_id);
 }
 
 static struct pernet_operations ipt_net_ops = {
 	.init = ipt_init_net,
-	.exit = ipt_exit_net,
+	.exit_batch = ipt_exit_net,
 	.id   = &ipt_net_id,
 	.size = sizeof(struct tc_action_net),
 };
@@ -387,16 +385,14 @@ static __net_init int xt_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_xt_ops);
 }
 
-static void __net_exit xt_exit_net(struct net *net)
+static void __net_exit xt_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, xt_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, xt_net_id);
 }
 
 static struct pernet_operations xt_net_ops = {
 	.init = xt_init_net,
-	.exit = xt_exit_net,
+	.exit_batch = xt_exit_net,
 	.id   = &xt_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8b3e59388480..37e5e4decbd6 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -29,7 +29,6 @@
 #include <net/tc_act/tc_mirred.h>
 
 static LIST_HEAD(mirred_list);
-static DEFINE_SPINLOCK(mirred_list_lock);
 
 static bool tcf_mirred_is_act_redirect(int action)
 {
@@ -50,18 +49,15 @@ static bool tcf_mirred_act_wants_ingress(int action)
 	}
 }
 
-static void tcf_mirred_release(struct tc_action *a, int bind)
+static void tcf_mirred_release(struct tc_action *a)
 {
 	struct tcf_mirred *m = to_mirred(a);
 	struct net_device *dev;
 
-	/* We could be called either in a RCU callback or with RTNL lock held. */
-	spin_lock_bh(&mirred_list_lock);
 	list_del(&m->tcfm_list);
-	dev = rcu_dereference_protected(m->tcfm_dev, 1);
+	dev = rtnl_dereference(m->tcfm_dev);
 	if (dev)
 		dev_put(dev);
-	spin_unlock_bh(&mirred_list_lock);
 }
 
 static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
@@ -139,8 +135,6 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	m->tcf_action = parm->action;
 	m->tcfm_eaction = parm->eaction;
 	if (dev != NULL) {
-		m->tcfm_ifindex = parm->ifindex;
-		m->net = net;
 		if (ret != ACT_P_CREATED)
 			dev_put(rcu_dereference_protected(m->tcfm_dev, 1));
 		dev_hold(dev);
@@ -149,9 +143,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (ret == ACT_P_CREATED) {
-		spin_lock_bh(&mirred_list_lock);
 		list_add(&m->tcfm_list, &mirred_list);
-		spin_unlock_bh(&mirred_list_lock);
 		tcf_idr_insert(tn, *a);
 	}
 
@@ -247,13 +239,14 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_mirred *m = to_mirred(a);
+	struct net_device *dev = rtnl_dereference(m->tcfm_dev);
 	struct tc_mirred opt = {
 		.index   = m->tcf_index,
 		.action  = m->tcf_action,
 		.refcnt  = m->tcf_refcnt - ref,
 		.bindcnt = m->tcf_bindcnt - bind,
 		.eaction = m->tcfm_eaction,
-		.ifindex = m->tcfm_ifindex,
+		.ifindex = dev ? dev->ifindex : 0,
 	};
 	struct tcf_t t;
 
@@ -294,7 +287,6 @@ static int mirred_device_event(struct notifier_block *unused,
 
 	ASSERT_RTNL();
 	if (event == NETDEV_UNREGISTER) {
-		spin_lock_bh(&mirred_list_lock);
 		list_for_each_entry(m, &mirred_list, tcfm_list) {
 			if (rcu_access_pointer(m->tcfm_dev) == dev) {
 				dev_put(dev);
@@ -304,7 +296,6 @@ static int mirred_device_event(struct notifier_block *unused,
 				RCU_INIT_POINTER(m->tcfm_dev, NULL);
 			}
 		}
-		spin_unlock_bh(&mirred_list_lock);
 	}
 
 	return NOTIFY_DONE;
@@ -318,7 +309,7 @@ static struct net_device *tcf_mirred_get_dev(const struct tc_action *a)
 {
 	struct tcf_mirred *m = to_mirred(a);
 
-	return __dev_get_by_index(m->net, m->tcfm_ifindex);
+	return rtnl_dereference(m->tcfm_dev);
 }
 
 static struct tc_action_ops act_mirred_ops = {
@@ -343,16 +334,14 @@ static __net_init int mirred_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_mirred_ops);
 }
 
-static void __net_exit mirred_exit_net(struct net *net)
+static void __net_exit mirred_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, mirred_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, mirred_net_id);
 }
 
 static struct pernet_operations mirred_net_ops = {
 	.init = mirred_init_net,
-	.exit = mirred_exit_net,
+	.exit_batch = mirred_exit_net,
 	.id   = &mirred_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index c365d01b99c8..98c6a4b2f523 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -310,16 +310,14 @@ static __net_init int nat_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_nat_ops);
 }
 
-static void __net_exit nat_exit_net(struct net *net)
+static void __net_exit nat_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, nat_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, nat_net_id);
 }
 
 static struct pernet_operations nat_net_ops = {
 	.init = nat_init_net,
-	.exit = nat_exit_net,
+	.exit_batch = nat_exit_net,
 	.id   = &nat_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 491fe5deb09e..349beaffb29e 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -216,7 +216,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	return ret;
 }
 
-static void tcf_pedit_cleanup(struct tc_action *a, int bind)
+static void tcf_pedit_cleanup(struct tc_action *a)
 {
 	struct tcf_pedit *p = to_pedit(a);
 	struct tc_pedit_key *keys = p->tcfp_keys;
@@ -453,16 +453,14 @@ static __net_init int pedit_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_pedit_ops);
 }
 
-static void __net_exit pedit_exit_net(struct net *net)
+static void __net_exit pedit_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, pedit_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, pedit_net_id);
 }
 
 static struct pernet_operations pedit_net_ops = {
 	.init = pedit_init_net,
-	.exit = pedit_exit_net,
+	.exit_batch = pedit_exit_net,
 	.id   = &pedit_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 3bb2ebf9e9ae..bf483db993a1 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -334,16 +334,14 @@ static __net_init int police_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_police_ops);
 }
 
-static void __net_exit police_exit_net(struct net *net)
+static void __net_exit police_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, police_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, police_net_id);
 }
 
 static struct pernet_operations police_net_ops = {
 	.init = police_init_net,
-	.exit = police_exit_net,
+	.exit_batch = police_exit_net,
 	.id   = &police_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 9438969290a6..1ba0df238756 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -96,7 +96,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 	return ret;
 }
 
-static void tcf_sample_cleanup(struct tc_action *a, int bind)
+static void tcf_sample_cleanup(struct tc_action *a)
 {
 	struct tcf_sample *s = to_sample(a);
 	struct psample_group *psample_group;
@@ -236,16 +236,14 @@ static __net_init int sample_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_sample_ops);
 }
 
-static void __net_exit sample_exit_net(struct net *net)
+static void __net_exit sample_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, sample_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, sample_net_id);
 }
 
 static struct pernet_operations sample_net_ops = {
 	.init = sample_init_net,
-	.exit = sample_exit_net,
+	.exit_batch = sample_exit_net,
 	.id   = &sample_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index e7b57e5071a3..425eac11f6da 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -47,7 +47,7 @@ static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
 	return d->tcf_action;
 }
 
-static void tcf_simp_release(struct tc_action *a, int bind)
+static void tcf_simp_release(struct tc_action *a)
 {
 	struct tcf_defact *d = to_defact(a);
 	kfree(d->tcfd_defdata);
@@ -204,16 +204,14 @@ static __net_init int simp_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_simp_ops);
 }
 
-static void __net_exit simp_exit_net(struct net *net)
+static void __net_exit simp_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, simp_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, simp_net_id);
 }
 
 static struct pernet_operations simp_net_ops = {
 	.init = simp_init_net,
-	.exit = simp_exit_net,
+	.exit_batch = simp_exit_net,
 	.id   = &simp_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 59949d61f20d..5a3f691bb545 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -241,16 +241,14 @@ static __net_init int skbedit_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_skbedit_ops);
 }
 
-static void __net_exit skbedit_exit_net(struct net *net)
+static void __net_exit skbedit_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, skbedit_net_id);
 }
 
 static struct pernet_operations skbedit_net_ops = {
 	.init = skbedit_init_net,
-	.exit = skbedit_exit_net,
+	.exit_batch = skbedit_exit_net,
 	.id   = &skbedit_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index b642ad3d39dd..fa975262dbac 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -184,7 +184,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 	return ret;
 }
 
-static void tcf_skbmod_cleanup(struct tc_action *a, int bind)
+static void tcf_skbmod_cleanup(struct tc_action *a)
 {
 	struct tcf_skbmod *d = to_skbmod(a);
 	struct tcf_skbmod_params  *p;
@@ -266,16 +266,14 @@ static __net_init int skbmod_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_skbmod_ops);
 }
 
-static void __net_exit skbmod_exit_net(struct net *net)
+static void __net_exit skbmod_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, skbmod_net_id);
 }
 
 static struct pernet_operations skbmod_net_ops = {
 	.init = skbmod_init_net,
-	.exit = skbmod_exit_net,
+	.exit_batch = skbmod_exit_net,
 	.id   = &skbmod_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 30c96274c638..0e23aac09ad6 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -201,7 +201,7 @@ err_out:
 	return ret;
 }
 
-static void tunnel_key_release(struct tc_action *a, int bind)
+static void tunnel_key_release(struct tc_action *a)
 {
 	struct tcf_tunnel_key *t = to_tunnel_key(a);
 	struct tcf_tunnel_key_params *params;
@@ -325,16 +325,14 @@ static __net_init int tunnel_key_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_tunnel_key_ops);
 }
 
-static void __net_exit tunnel_key_exit_net(struct net *net)
+static void __net_exit tunnel_key_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, tunnel_key_net_id);
 }
 
 static struct pernet_operations tunnel_key_net_ops = {
 	.init = tunnel_key_init_net,
-	.exit = tunnel_key_exit_net,
+	.exit_batch = tunnel_key_exit_net,
 	.id   = &tunnel_key_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 97f717a13ad5..e1a1b3f3983a 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -219,7 +219,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	return ret;
 }
 
-static void tcf_vlan_cleanup(struct tc_action *a, int bind)
+static void tcf_vlan_cleanup(struct tc_action *a)
 {
 	struct tcf_vlan *v = to_vlan(a);
 	struct tcf_vlan_params *p;
@@ -301,16 +301,14 @@ static __net_init int vlan_init_net(struct net *net)
 	return tc_action_net_init(tn, &act_vlan_ops);
 }
 
-static void __net_exit vlan_exit_net(struct net *net)
+static void __net_exit vlan_exit_net(struct list_head *net_list)
 {
-	struct tc_action_net *tn = net_generic(net, vlan_net_id);
-
-	tc_action_net_exit(tn);
+	tc_action_net_exit(net_list, vlan_net_id);
 }
 
 static struct pernet_operations vlan_net_ops = {
 	.init = vlan_init_net,
-	.exit = vlan_exit_net,
+	.exit_batch = vlan_exit_net,
 	.id   = &vlan_net_id,
 	.size = sizeof(struct tc_action_net),
 };
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b91ea03e3afa..32b1ea7cf863 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -217,8 +217,12 @@ static void tcf_chain_flush(struct tcf_chain *chain)
 
 static void tcf_chain_destroy(struct tcf_chain *chain)
 {
+	struct tcf_block *block = chain->block;
+
 	list_del(&chain->list);
 	kfree(chain);
+	if (list_empty(&block->chain_list))
+		kfree(block);
 }
 
 static void tcf_chain_hold(struct tcf_chain *chain)
@@ -329,49 +333,34 @@ int tcf_block_get(struct tcf_block **p_block,
 }
 EXPORT_SYMBOL(tcf_block_get);
 
-static void tcf_block_put_final(struct work_struct *work)
-{
-	struct tcf_block *block = container_of(work, struct tcf_block, work);
-	struct tcf_chain *chain, *tmp;
-
-	rtnl_lock();
-
-	/* At this point, all the chains should have refcnt == 1. */
-	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
-		tcf_chain_put(chain);
-	rtnl_unlock();
-	kfree(block);
-}
-
 /* XXX: Standalone actions are not allowed to jump to any chain, and bound
  * actions should be all removed after flushing.
  */
 void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 		       struct tcf_block_ext_info *ei)
 {
-	struct tcf_chain *chain;
+	struct tcf_chain *chain, *tmp;
 
-	if (!block)
-		return;
-	/* Hold a refcnt for all chains, except 0, so that they don't disappear
+	/* Hold a refcnt for all chains, so that they don't disappear
 	 * while we are iterating.
 	 */
+	if (!block)
+		return;
 	list_for_each_entry(chain, &block->chain_list, list)
-		if (chain->index)
-			tcf_chain_hold(chain);
+		tcf_chain_hold(chain);
 
 	list_for_each_entry(chain, &block->chain_list, list)
 		tcf_chain_flush(chain);
 
 	tcf_block_offload_unbind(block, q, ei);
 
-	INIT_WORK(&block->work, tcf_block_put_final);
-	/* Wait for existing RCU callbacks to cool down, make sure their works
-	 * have been queued before this. We can not flush pending works here
-	 * because we are holding the RTNL lock.
-	 */
-	rcu_barrier();
-	tcf_queue_work(&block->work);
+	/* At this point, all the chains should have refcnt >= 1. */
+	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+		tcf_chain_put(chain);
+
+	/* Finally, put chain 0 and allow block to be freed. */
+	chain = list_first_entry(&block->chain_list, struct tcf_chain, list);
+	tcf_chain_put(chain);
 }
 EXPORT_SYMBOL(tcf_block_put_ext);
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 543a3e875d05..6132a7317efa 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -166,6 +166,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	 * so do it rather here.
 	 */
 	skb_key.basic.n_proto = skb->protocol;
+	skb_flow_dissect_tunnel_info(skb, &head->dissector, &skb_key);
 	skb_flow_dissect(skb, &head->dissector, &skb_key, 0);
 
 	fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 0f1eab99ff4e..74c22b4e365e 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -799,7 +799,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 		goto nla_put_failure;
 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
 		goto nla_put_failure;
-	qlen = q->q.qlen;
+
+	qlen = qdisc_qlen_sum(q);
 
 	stab = rtnl_dereference(q->stab);
 	if (stab && qdisc_dump_stab(skb, stab) < 0)
@@ -956,6 +957,11 @@ skip:
 	} else {
 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 
+		/* Only support running class lockless if parent is lockless */
+		if (new && (new->flags & TCQ_F_NOLOCK) &&
+		    parent && !(parent->flags & TCQ_F_NOLOCK))
+			new->flags &= ~TCQ_F_NOLOCK;
+
 		err = -EOPNOTSUPP;
 		if (cops && cops->graft) {
 			unsigned long cl = cops->find(parent, classid);
@@ -1022,7 +1028,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 #endif
 
 	err = -ENOENT;
-	if (ops == NULL)
+	if (!ops)
 		goto err_out;
 
 	sch = qdisc_alloc(dev_queue, ops);
@@ -1062,54 +1068,60 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
 	}
 
-	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
-		if (qdisc_is_percpu_stats(sch)) {
-			sch->cpu_bstats =
-				netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
-			if (!sch->cpu_bstats)
-				goto err_out4;
+	if (ops->init) {
+		err = ops->init(sch, tca[TCA_OPTIONS]);
+		if (err != 0)
+			goto err_out5;
+	}
 
-			sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
-			if (!sch->cpu_qstats)
-				goto err_out4;
-		}
+	if (qdisc_is_percpu_stats(sch)) {
+		sch->cpu_bstats =
+			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+		if (!sch->cpu_bstats)
+			goto err_out4;
 
-		if (tca[TCA_STAB]) {
-			stab = qdisc_get_stab(tca[TCA_STAB]);
-			if (IS_ERR(stab)) {
-				err = PTR_ERR(stab);
-				goto err_out4;
-			}
-			rcu_assign_pointer(sch->stab, stab);
-		}
-		if (tca[TCA_RATE]) {
-			seqcount_t *running;
-
-			err = -EOPNOTSUPP;
-			if (sch->flags & TCQ_F_MQROOT)
-				goto err_out4;
+		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+		if (!sch->cpu_qstats)
+			goto err_out4;
+	}
 
-			if ((sch->parent != TC_H_ROOT) &&
-			    !(sch->flags & TCQ_F_INGRESS) &&
-			    (!p || !(p->flags & TCQ_F_MQROOT)))
-				running = qdisc_root_sleeping_running(sch);
-			else
-				running = &sch->running;
-
-			err = gen_new_estimator(&sch->bstats,
-						sch->cpu_bstats,
-						&sch->rate_est,
-						NULL,
-						running,
-						tca[TCA_RATE]);
-			if (err)
-				goto err_out4;
+	if (tca[TCA_STAB]) {
+		stab = qdisc_get_stab(tca[TCA_STAB]);
+		if (IS_ERR(stab)) {
+			err = PTR_ERR(stab);
+			goto err_out4;
 		}
+		rcu_assign_pointer(sch->stab, stab);
+	}
+	if (tca[TCA_RATE]) {
+		seqcount_t *running;
 
-		qdisc_hash_add(sch, false);
+		err = -EOPNOTSUPP;
+		if (sch->flags & TCQ_F_MQROOT)
+			goto err_out4;
 
-		return sch;
+		if (sch->parent != TC_H_ROOT &&
+		    !(sch->flags & TCQ_F_INGRESS) &&
+		    (!p || !(p->flags & TCQ_F_MQROOT)))
+			running = qdisc_root_sleeping_running(sch);
+		else
+			running = &sch->running;
+
+		err = gen_new_estimator(&sch->bstats,
+					sch->cpu_bstats,
+					&sch->rate_est,
+					NULL,
+					running,
+					tca[TCA_RATE]);
+		if (err)
+			goto err_out4;
 	}
+
+	qdisc_hash_add(sch, false);
+
+	return sch;
+
+err_out5:
 	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
 	if (ops->destroy)
 		ops->destroy(sch);
@@ -1141,7 +1153,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 	int err = 0;
 
 	if (tca[TCA_OPTIONS]) {
-		if (sch->ops->change == NULL)
+		if (!sch->ops->change)
 			return -EINVAL;
 		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 		if (err)
@@ -1346,7 +1358,8 @@ replay:
 					goto create_n_graft;
 				if (n->nlmsg_flags & NLM_F_EXCL)
 					return -EEXIST;
-				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
+				if (tca[TCA_KIND] &&
+				    nla_strcmp(tca[TCA_KIND], q->ops->id))
 					return -EINVAL;
 				if (q == p ||
 				    (p && check_loop(q, p, 0)))
@@ -1391,7 +1404,7 @@ replay:
 	}
 
 	/* Change qdisc parameters */
-	if (q == NULL)
+	if (!q)
 		return -ENOENT;
 	if (n->nlmsg_flags & NLM_F_EXCL)
 		return -EEXIST;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cd1b200acae7..981c08fe810b 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -26,6 +26,7 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/if_vlan.h>
+#include <linux/skb_array.h>
 #include <linux/if_macvlan.h>
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
@@ -47,9 +48,70 @@ EXPORT_SYMBOL(default_qdisc_ops);
  * - updates to tree and tree walking are only done under the rtnl mutex.
  */
 
-static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
+{
+	const struct netdev_queue *txq = q->dev_queue;
+	spinlock_t *lock = NULL;
+	struct sk_buff *skb;
+
+	if (q->flags & TCQ_F_NOLOCK) {
+		lock = qdisc_lock(q);
+		spin_lock(lock);
+	}
+
+	skb = skb_peek(&q->skb_bad_txq);
+	if (skb) {
+		/* check the reason of requeuing without tx lock first */
+		txq = skb_get_tx_queue(txq->dev, skb);
+		if (!netif_xmit_frozen_or_stopped(txq)) {
+			skb = __skb_dequeue(&q->skb_bad_txq);
+			if (qdisc_is_percpu_stats(q)) {
+				qdisc_qstats_cpu_backlog_dec(q, skb);
+				qdisc_qstats_cpu_qlen_dec(q);
+			} else {
+				qdisc_qstats_backlog_dec(q, skb);
+				q->q.qlen--;
+			}
+		} else {
+			skb = NULL;
+		}
+	}
+
+	if (lock)
+		spin_unlock(lock);
+
+	return skb;
+}
+
+static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
+{
+	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);
+
+	if (unlikely(skb))
+		skb = __skb_dequeue_bad_txq(q);
+
+	return skb;
+}
+
+static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
+					     struct sk_buff *skb)
 {
-	q->gso_skb = skb;
+	spinlock_t *lock = NULL;
+
+	if (q->flags & TCQ_F_NOLOCK) {
+		lock = qdisc_lock(q);
+		spin_lock(lock);
+	}
+
+	__skb_queue_tail(&q->skb_bad_txq, skb);
+
+	if (lock)
+		spin_unlock(lock);
+}
+
+static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+{
+	__skb_queue_head(&q->gso_skb, skb);
 	q->qstats.requeues++;
 	qdisc_qstats_backlog_inc(q, skb);
 	q->q.qlen++;	/* it's still part of the queue */
@@ -58,6 +120,30 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 	return 0;
 }
 
+static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q)
+{
+	spinlock_t *lock = qdisc_lock(q);
+
+	spin_lock(lock);
+	__skb_queue_tail(&q->gso_skb, skb);
+	spin_unlock(lock);
+
+	qdisc_qstats_cpu_requeues_inc(q);
+	qdisc_qstats_cpu_backlog_inc(q, skb);
+	qdisc_qstats_cpu_qlen_inc(q);
+	__netif_schedule(q);
+
+	return 0;
+}
+
+static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+{
+	if (q->flags & TCQ_F_NOLOCK)
+		return dev_requeue_skb_locked(skb, q);
+	else
+		return __dev_requeue_skb(skb, q);
+}
+
 static void try_bulk_dequeue_skb(struct Qdisc *q,
 				 struct sk_buff *skb,
 				 const struct netdev_queue *txq,
@@ -95,9 +181,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
 		if (!nskb)
 			break;
 		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
-			q->skb_bad_txq = nskb;
-			qdisc_qstats_backlog_inc(q, nskb);
-			q->q.qlen++;
+			qdisc_enqueue_skb_bad_txq(q, nskb);
+
+			if (qdisc_is_percpu_stats(q)) {
+				qdisc_qstats_cpu_backlog_inc(q, nskb);
+				qdisc_qstats_cpu_qlen_inc(q);
+			} else {
+				qdisc_qstats_backlog_inc(q, nskb);
+				q->q.qlen++;
+			}
 			break;
 		}
 		skb->next = nskb;
@@ -113,40 +205,60 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
 static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 				   int *packets)
 {
-	struct sk_buff *skb = q->gso_skb;
 	const struct netdev_queue *txq = q->dev_queue;
+	struct sk_buff *skb = NULL;
 
 	*packets = 1;
-	if (unlikely(skb)) {
+	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
+		spinlock_t *lock = NULL;
+
+		if (q->flags & TCQ_F_NOLOCK) {
+			lock = qdisc_lock(q);
+			spin_lock(lock);
+		}
+
+		skb = skb_peek(&q->gso_skb);
+
+		/* skb may be null if another cpu pulls gso_skb off in between
+		 * empty check and lock.
+		 */
+		if (!skb) {
+			if (lock)
+				spin_unlock(lock);
+			goto validate;
+		}
+
 		/* skb in gso_skb were already validated */
 		*validate = false;
 		/* check the reason of requeuing without tx lock first */
 		txq = skb_get_tx_queue(txq->dev, skb);
 		if (!netif_xmit_frozen_or_stopped(txq)) {
-			q->gso_skb = NULL;
-			qdisc_qstats_backlog_dec(q, skb);
-			q->q.qlen--;
-		} else
+			skb = __skb_dequeue(&q->gso_skb);
+			if (qdisc_is_percpu_stats(q)) {
+				qdisc_qstats_cpu_backlog_dec(q, skb);
+				qdisc_qstats_cpu_qlen_dec(q);
+			} else {
+				qdisc_qstats_backlog_dec(q, skb);
+				q->q.qlen--;
+			}
+		} else {
 			skb = NULL;
-		goto trace;
-	}
-	*validate = true;
-	skb = q->skb_bad_txq;
-	if (unlikely(skb)) {
-		/* check the reason of requeuing without tx lock first */
-		txq = skb_get_tx_queue(txq->dev, skb);
-		if (!netif_xmit_frozen_or_stopped(txq)) {
-			q->skb_bad_txq = NULL;
-			qdisc_qstats_backlog_dec(q, skb);
-			q->q.qlen--;
-			goto bulk;
 		}
-		skb = NULL;
+		if (lock)
+			spin_unlock(lock);
 		goto trace;
 	}
-	if (!(q->flags & TCQ_F_ONETXQUEUE) ||
-	    !netif_xmit_frozen_or_stopped(txq))
-		skb = q->dequeue(q);
+validate:
+	*validate = true;
+
+	if ((q->flags & TCQ_F_ONETXQUEUE) &&
+	    netif_xmit_frozen_or_stopped(txq))
+		return skb;
+
+	skb = qdisc_dequeue_skb_bad_txq(q);
+	if (unlikely(skb))
+		goto bulk;
+	skb = q->dequeue(q);
 	if (skb) {
 bulk:
 		if (qdisc_may_bulk(q))
@@ -165,17 +277,18 @@ trace:
  * only one CPU can execute this function.
  *
  * Returns to the caller:
- *				0  - queue is empty or throttled.
- *				>0 - queue is not empty.
+ *				false  - hardware queue frozen backoff
+ *				true   - feel free to send more pkts
  */
-int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
-		    struct net_device *dev, struct netdev_queue *txq,
-		    spinlock_t *root_lock, bool validate)
+bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+		     struct net_device *dev, struct netdev_queue *txq,
+		     spinlock_t *root_lock, bool validate)
 {
 	int ret = NETDEV_TX_BUSY;
 
 	/* And release qdisc */
-	spin_unlock(root_lock);
+	if (root_lock)
+		spin_unlock(root_lock);
 
 	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
 	if (validate)
@@ -188,27 +301,28 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 		HARD_TX_UNLOCK(dev, txq);
 	} else {
-		spin_lock(root_lock);
-		return qdisc_qlen(q);
+		if (root_lock)
+			spin_lock(root_lock);
+		return true;
 	}
-	spin_lock(root_lock);
 
-	if (dev_xmit_complete(ret)) {
-		/* Driver sent out skb successfully or skb was consumed */
-		ret = qdisc_qlen(q);
-	} else {
+	if (root_lock)
+		spin_lock(root_lock);
+
+	if (!dev_xmit_complete(ret)) {
 		/* Driver returned NETDEV_TX_BUSY - requeue skb */
 		if (unlikely(ret != NETDEV_TX_BUSY))
 			net_warn_ratelimited("BUG %s code %d qlen %d\n",
 					     dev->name, ret, q->q.qlen);
 
-		ret = dev_requeue_skb(skb, q);
+		dev_requeue_skb(skb, q);
+		return false;
 	}
 
 	if (ret && netif_xmit_frozen_or_stopped(txq))
-		ret = 0;
+		return false;
 
-	return ret;
+	return true;
 }
 
 /*
@@ -230,20 +344,22 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
  *				>0 - queue is not empty.
  *
  */
-static inline int qdisc_restart(struct Qdisc *q, int *packets)
+static inline bool qdisc_restart(struct Qdisc *q, int *packets)
 {
+	spinlock_t *root_lock = NULL;
 	struct netdev_queue *txq;
 	struct net_device *dev;
-	spinlock_t *root_lock;
 	struct sk_buff *skb;
 	bool validate;
 
 	/* Dequeue packet */
 	skb = dequeue_skb(q, &validate, packets);
 	if (unlikely(!skb))
-		return 0;
+		return false;
+
+	if (!(q->flags & TCQ_F_NOLOCK))
+		root_lock = qdisc_lock(q);
 
-	root_lock = qdisc_lock(q);
 	dev = qdisc_dev(q);
 	txq = skb_get_tx_queue(dev, skb);
 
@@ -267,8 +383,6 @@ void __qdisc_run(struct Qdisc *q)
 			break;
 		}
 	}
-
-	qdisc_run_end(q);
 }
 
 unsigned long dev_trans_start(struct net_device *dev)
@@ -468,93 +582,93 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
 
 /*
  * Private data for a pfifo_fast scheduler containing:
- * 	- queues for the three band
- * 	- bitmap indicating which of the bands contain skbs
+ *	- rings for priority bands
  */
 struct pfifo_fast_priv {
-	u32 bitmap;
-	struct qdisc_skb_head q[PFIFO_FAST_BANDS];
+	struct skb_array q[PFIFO_FAST_BANDS];
 };
 
-/*
- * Convert a bitmap to the first band number where an skb is queued, where:
- * 	bitmap=0 means there are no skbs on any band.
- * 	bitmap=1 means there is an skb on band 0.
- *	bitmap=7 means there are skbs on all 3 bands, etc.
- */
-static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
-
-static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
-					     int band)
+static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
+					  int band)
 {
-	return priv->q + band;
+	return &priv->q[band];
 }
 
 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
 			      struct sk_buff **to_free)
 {
-	if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
-		int band = prio2band[skb->priority & TC_PRIO_MAX];
-		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-		struct qdisc_skb_head *list = band2list(priv, band);
-
-		priv->bitmap |= (1 << band);
-		qdisc->q.qlen++;
-		return __qdisc_enqueue_tail(skb, qdisc, list);
-	}
+	int band = prio2band[skb->priority & TC_PRIO_MAX];
+	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+	struct skb_array *q = band2list(priv, band);
+	int err;
+
+	err = skb_array_produce(q, skb);
 
-	return qdisc_drop(skb, qdisc, to_free);
+	if (unlikely(err))
+		return qdisc_drop_cpu(skb, qdisc, to_free);
+
+	qdisc_qstats_cpu_qlen_inc(qdisc);
+	qdisc_qstats_cpu_backlog_inc(qdisc, skb);
+	return NET_XMIT_SUCCESS;
 }
 
 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 {
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-	int band = bitmap2band[priv->bitmap];
-
-	if (likely(band >= 0)) {
-		struct qdisc_skb_head *qh = band2list(priv, band);
-		struct sk_buff *skb = __qdisc_dequeue_head(qh);
+	struct sk_buff *skb = NULL;
+	int band;
 
-		if (likely(skb != NULL)) {
-			qdisc_qstats_backlog_dec(qdisc, skb);
-			qdisc_bstats_update(qdisc, skb);
-		}
+	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
+		struct skb_array *q = band2list(priv, band);
 
-		qdisc->q.qlen--;
-		if (qh->qlen == 0)
-			priv->bitmap &= ~(1 << band);
+		if (__skb_array_empty(q))
+			continue;
 
-		return skb;
+		skb = skb_array_consume_bh(q);
+	}
+	if (likely(skb)) {
+		qdisc_qstats_cpu_backlog_dec(qdisc, skb);
+		qdisc_bstats_cpu_update(qdisc, skb);
+		qdisc_qstats_cpu_qlen_dec(qdisc);
 	}
 
-	return NULL;
+	return skb;
 }
 
 static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
 {
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-	int band = bitmap2band[priv->bitmap];
+	struct sk_buff *skb = NULL;
+	int band;
 
-	if (band >= 0) {
-		struct qdisc_skb_head *qh = band2list(priv, band);
+	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
+		struct skb_array *q = band2list(priv, band);
 
-		return qh->head;
+		skb = __skb_array_peek(q);
 	}
 
-	return NULL;
+	return skb;
 }
 
 static void pfifo_fast_reset(struct Qdisc *qdisc)
 {
-	int prio;
+	int i, band;
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
 
-	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
-		__qdisc_reset_queue(band2list(priv, prio));
+	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
+		struct skb_array *q = band2list(priv, band);
+		struct sk_buff *skb;
 
-	priv->bitmap = 0;
-	qdisc->qstats.backlog = 0;
-	qdisc->q.qlen = 0;
+		while ((skb = skb_array_consume_bh(q)) != NULL)
+			kfree_skb(skb);
+	}
+
+	for_each_possible_cpu(i) {
+		struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);
+
+		q->backlog = 0;
+		q->qlen = 0;
+	}
 }
 
 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
@@ -572,17 +686,48 @@ nla_put_failure:
 
 static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
 {
-	int prio;
+	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+	int prio;
 
-	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
-		qdisc_skb_head_init(band2list(priv, prio));
+	/* guard against zero length rings */
+	if (!qlen)
+		return -EINVAL;
+
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+		struct skb_array *q = band2list(priv, prio);
+		int err;
+
+		err = skb_array_init(q, qlen, GFP_KERNEL);
+		if (err)
+			return -ENOMEM;
+	}
 
 	/* Can by-pass the queue discipline */
 	qdisc->flags |= TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
+static void pfifo_fast_destroy(struct Qdisc *sch)
+{
+	struct pfifo_fast_priv *priv = qdisc_priv(sch);
+	int prio;
+
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+		struct skb_array *q = band2list(priv, prio);
+
+		/* NULL ring is possible if destroy path is due to a failed
+		 * skb_array_init() in pfifo_fast_init() case.
+		 */
+		if (!&q->ring.queue)
+			continue;
+		/* Destroy ring but no need to kfree_skb because a call to
+		 * pfifo_fast_reset() has already done that work.
+		 */
+		ptr_ring_cleanup(&q->ring, NULL);
+	}
+}
+
 struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.id		=	"pfifo_fast",
 	.priv_size	=	sizeof(struct pfifo_fast_priv),
@@ -590,9 +735,11 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.dequeue	=	pfifo_fast_dequeue,
 	.peek		=	pfifo_fast_peek,
 	.init		=	pfifo_fast_init,
+	.destroy	=	pfifo_fast_destroy,
 	.reset		=	pfifo_fast_reset,
 	.dump		=	pfifo_fast_dump,
 	.owner		=	THIS_MODULE,
+	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
 };
 EXPORT_SYMBOL(pfifo_fast_ops);
 
@@ -630,9 +777,24 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
 		sch->padded = (char *) sch - (char *) p;
 	}
+	__skb_queue_head_init(&sch->gso_skb);
+	__skb_queue_head_init(&sch->skb_bad_txq);
 	qdisc_skb_head_init(&sch->q);
 	spin_lock_init(&sch->q.lock);
 
+	if (ops->static_flags & TCQ_F_CPUSTATS) {
+		sch->cpu_bstats =
+			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+		if (!sch->cpu_bstats)
+			goto errout1;
+
+		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+		if (!sch->cpu_qstats) {
+			free_percpu(sch->cpu_bstats);
+			goto errout1;
+		}
+	}
+
 	spin_lock_init(&sch->busylock);
 	lockdep_set_class(&sch->busylock,
 			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
@@ -642,6 +804,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  dev->qdisc_running_key ?: &qdisc_running_key);
 
 	sch->ops = ops;
+	sch->flags = ops->static_flags;
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;
 	sch->dev_queue = dev_queue;
@@ -649,6 +812,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	refcount_set(&sch->refcnt, 1);
 
 	return sch;
+errout1:
+	kfree(p);
 errout:
 	return ERR_PTR(err);
 }
@@ -682,17 +847,21 @@ EXPORT_SYMBOL(qdisc_create_dflt);
 void qdisc_reset(struct Qdisc *qdisc)
 {
 	const struct Qdisc_ops *ops = qdisc->ops;
+	struct sk_buff *skb, *tmp;
 
 	if (ops->reset)
 		ops->reset(qdisc);
 
-	kfree_skb(qdisc->skb_bad_txq);
-	qdisc->skb_bad_txq = NULL;
+	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
+		__skb_unlink(skb, &qdisc->gso_skb);
+		kfree_skb_list(skb);
+	}
 
-	if (qdisc->gso_skb) {
-		kfree_skb_list(qdisc->gso_skb);
-		qdisc->gso_skb = NULL;
+	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
+		__skb_unlink(skb, &qdisc->skb_bad_txq);
+		kfree_skb_list(skb);
 	}
+
 	qdisc->q.qlen = 0;
 	qdisc->qstats.backlog = 0;
 }
@@ -711,6 +880,7 @@ static void qdisc_free(struct Qdisc *qdisc)
 void qdisc_destroy(struct Qdisc *qdisc)
 {
 	const struct Qdisc_ops  *ops = qdisc->ops;
+	struct sk_buff *skb, *tmp;
 
 	if (qdisc->flags & TCQ_F_BUILTIN ||
 	    !refcount_dec_and_test(&qdisc->refcnt))
@@ -730,8 +900,16 @@ void qdisc_destroy(struct Qdisc *qdisc)
 	module_put(ops->owner);
 	dev_put(qdisc_dev(qdisc));
 
-	kfree_skb_list(qdisc->gso_skb);
-	kfree_skb(qdisc->skb_bad_txq);
+	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
+		__skb_unlink(skb, &qdisc->gso_skb);
+		kfree_skb_list(skb);
+	}
+
+	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
+		__skb_unlink(skb, &qdisc->skb_bad_txq);
+		kfree_skb_list(skb);
+	}
+
 	qdisc_free(qdisc);
 }
 EXPORT_SYMBOL(qdisc_destroy);
@@ -746,10 +924,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 	root_lock = qdisc_lock(oqdisc);
 	spin_lock_bh(root_lock);
 
-	/* Prune old scheduler */
-	if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1)
-		qdisc_reset(oqdisc);
-
 	/* ... and graft new one */
 	if (qdisc == NULL)
 		qdisc = &noop_qdisc;
@@ -885,14 +1059,18 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 
 		dev_queue = netdev_get_tx_queue(dev, i);
 		q = dev_queue->qdisc_sleeping;
-		root_lock = qdisc_lock(q);
 
-		spin_lock_bh(root_lock);
+		if (q->flags & TCQ_F_NOLOCK) {
+			val = test_bit(__QDISC_STATE_SCHED, &q->state);
+		} else {
+			root_lock = qdisc_lock(q);
+			spin_lock_bh(root_lock);
 
-		val = (qdisc_is_running(q) ||
-		       test_bit(__QDISC_STATE_SCHED, &q->state));
+			val = (qdisc_is_running(q) ||
+			       test_bit(__QDISC_STATE_SCHED, &q->state));
 
-		spin_unlock_bh(root_lock);
+			spin_unlock_bh(root_lock);
+		}
 
 		if (val)
 			return true;
@@ -900,6 +1078,16 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 	return false;
 }
 
+static void dev_qdisc_reset(struct net_device *dev,
+			    struct netdev_queue *dev_queue,
+			    void *none)
+{
+	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+
+	if (qdisc)
+		qdisc_reset(qdisc);
+}
+
 /**
  * 	dev_deactivate_many - deactivate transmissions on several devices
  * 	@head: list of devices to deactivate
@@ -910,7 +1098,6 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 void dev_deactivate_many(struct list_head *head)
 {
 	struct net_device *dev;
-	bool sync_needed = false;
 
 	list_for_each_entry(dev, head, close_list) {
 		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
@@ -920,20 +1107,25 @@ void dev_deactivate_many(struct list_head *head)
 					     &noop_qdisc);
 
 		dev_watchdog_down(dev);
-		sync_needed |= !dev->dismantle;
 	}
 
 	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
 	 * This is avoided if all devices are in dismantle phase :
 	 * Caller will call synchronize_net() for us
 	 */
-	if (sync_needed)
-		synchronize_net();
+	synchronize_net();
 
 	/* Wait for outstanding qdisc_run calls. */
-	list_for_each_entry(dev, head, close_list)
+	list_for_each_entry(dev, head, close_list) {
 		while (some_qdisc_is_busy(dev))
 			yield();
+		/* The new qdisc is assigned at this point so we can safely
+		 * unwind stale skb lists and qdisc statistics
+		 */
+		netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
+		if (dev_ingress_queue(dev))
+			dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
+	}
 }
 
 void dev_deactivate(struct net_device *dev)
@@ -954,6 +1146,8 @@ static void dev_init_scheduler_queue(struct net_device *dev,
 
 	rcu_assign_pointer(dev_queue->qdisc, qdisc);
 	dev_queue->qdisc_sleeping = qdisc;
+	__skb_queue_head_init(&qdisc->gso_skb);
+	__skb_queue_head_init(&qdisc->skb_bad_txq);
 }
 
 void dev_init_scheduler(struct net_device *dev)
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 213b586a06a0..8cbb5c829d59 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -17,6 +17,7 @@
 #include <linux/skbuff.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/sch_generic.h>
 
 struct mq_sched {
 	struct Qdisc		**qdiscs;
@@ -97,23 +98,42 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct net_device *dev = qdisc_dev(sch);
 	struct Qdisc *qdisc;
 	unsigned int ntx;
+	__u32 qlen = 0;
 
 	sch->q.qlen = 0;
 	memset(&sch->bstats, 0, sizeof(sch->bstats));
 	memset(&sch->qstats, 0, sizeof(sch->qstats));
 
+	/* MQ supports lockless qdiscs. However, statistics accounting needs
+	 * to account for all, none, or a mix of locked and unlocked child
+	 * qdiscs. Percpu stats are added to counters in-band and locking
+	 * qdisc totals are added at end.
+	 */
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
 		qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
 		spin_lock_bh(qdisc_lock(qdisc));
-		sch->q.qlen		+= qdisc->q.qlen;
-		sch->bstats.bytes	+= qdisc->bstats.bytes;
-		sch->bstats.packets	+= qdisc->bstats.packets;
-		sch->qstats.backlog	+= qdisc->qstats.backlog;
-		sch->qstats.drops	+= qdisc->qstats.drops;
-		sch->qstats.requeues	+= qdisc->qstats.requeues;
-		sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+
+		if (qdisc_is_percpu_stats(qdisc)) {
+			qlen = qdisc_qlen_sum(qdisc);
+			__gnet_stats_copy_basic(NULL, &sch->bstats,
+						qdisc->cpu_bstats,
+						&qdisc->bstats);
+			__gnet_stats_copy_queue(&sch->qstats,
+						qdisc->cpu_qstats,
+						&qdisc->qstats, qlen);
+		} else {
+			sch->q.qlen		+= qdisc->q.qlen;
+			sch->bstats.bytes	+= qdisc->bstats.bytes;
+			sch->bstats.packets	+= qdisc->bstats.packets;
+			sch->qstats.backlog	+= qdisc->qstats.backlog;
+			sch->qstats.drops	+= qdisc->qstats.drops;
+			sch->qstats.requeues	+= qdisc->qstats.requeues;
+			sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+		}
+
 		spin_unlock_bh(qdisc_lock(qdisc));
 	}
+
 	return 0;
 }
 
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index b85885a9d8a1..8622745f3cd9 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -388,22 +388,40 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb);
 	struct tc_mqprio_qopt opt = { 0 };
 	struct Qdisc *qdisc;
-	unsigned int i;
+	unsigned int ntx, tc;
 
 	sch->q.qlen = 0;
 	memset(&sch->bstats, 0, sizeof(sch->bstats));
 	memset(&sch->qstats, 0, sizeof(sch->qstats));
 
-	for (i = 0; i < dev->num_tx_queues; i++) {
-		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
+	/* MQ supports lockless qdiscs. However, statistics accounting needs
+	 * to account for all, none, or a mix of locked and unlocked child
+	 * qdiscs. Percpu stats are added to counters in-band and locking
+	 * qdisc totals are added at end.
+	 */
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
 		spin_lock_bh(qdisc_lock(qdisc));
-		sch->q.qlen		+= qdisc->q.qlen;
-		sch->bstats.bytes	+= qdisc->bstats.bytes;
-		sch->bstats.packets	+= qdisc->bstats.packets;
-		sch->qstats.backlog	+= qdisc->qstats.backlog;
-		sch->qstats.drops	+= qdisc->qstats.drops;
-		sch->qstats.requeues	+= qdisc->qstats.requeues;
-		sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+
+		if (qdisc_is_percpu_stats(qdisc)) {
+			__u32 qlen = qdisc_qlen_sum(qdisc);
+
+			__gnet_stats_copy_basic(NULL, &sch->bstats,
+						qdisc->cpu_bstats,
+						&qdisc->bstats);
+			__gnet_stats_copy_queue(&sch->qstats,
+						qdisc->cpu_qstats,
+						&qdisc->qstats, qlen);
+		} else {
+			sch->q.qlen		+= qdisc->q.qlen;
+			sch->bstats.bytes	+= qdisc->bstats.bytes;
+			sch->bstats.packets	+= qdisc->bstats.packets;
+			sch->qstats.backlog	+= qdisc->qstats.backlog;
+			sch->qstats.drops	+= qdisc->qstats.drops;
+			sch->qstats.requeues	+= qdisc->qstats.requeues;
+			sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+		}
+
 		spin_unlock_bh(qdisc_lock(qdisc));
 	}
 
@@ -411,9 +429,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
 	opt.hw = priv->hw_offload;
 
-	for (i = 0; i < netdev_get_num_tc(dev); i++) {
-		opt.count[i] = dev->tc_to_txq[i].count;
-		opt.offset[i] = dev->tc_to_txq[i].offset;
+	for (tc = 0; tc < netdev_get_num_tc(dev); tc++) {
+		opt.count[tc] = dev->tc_to_txq[tc].count;
+		opt.offset[tc] = dev->tc_to_txq[tc].offset;
 	}
 
 	if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt))
@@ -495,7 +513,6 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 	if (cl >= TC_H_MIN_PRIORITY) {
 		int i;
 		__u32 qlen = 0;
-		struct Qdisc *qdisc;
 		struct gnet_stats_queue qstats = {0};
 		struct gnet_stats_basic_packed bstats = {0};
 		struct net_device *dev = qdisc_dev(sch);
@@ -511,18 +528,26 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 
 		for (i = tc.offset; i < tc.offset + tc.count; i++) {
 			struct netdev_queue *q = netdev_get_tx_queue(dev, i);
+			struct Qdisc *qdisc = rtnl_dereference(q->qdisc);
+			struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+			struct gnet_stats_queue __percpu *cpu_qstats = NULL;
 
-			qdisc = rtnl_dereference(q->qdisc);
 			spin_lock_bh(qdisc_lock(qdisc));
-			qlen		  += qdisc->q.qlen;
-			bstats.bytes      += qdisc->bstats.bytes;
-			bstats.packets    += qdisc->bstats.packets;
-			qstats.backlog    += qdisc->qstats.backlog;
-			qstats.drops      += qdisc->qstats.drops;
-			qstats.requeues   += qdisc->qstats.requeues;
-			qstats.overlimits += qdisc->qstats.overlimits;
+			if (qdisc_is_percpu_stats(qdisc)) {
+				cpu_bstats = qdisc->cpu_bstats;
+				cpu_qstats = qdisc->cpu_qstats;
+			}
+
+			qlen = qdisc_qlen_sum(qdisc);
+			__gnet_stats_copy_basic(NULL, &sch->bstats,
+						cpu_bstats, &qdisc->bstats);
+			__gnet_stats_copy_queue(&sch->qstats,
+						cpu_qstats,
+						&qdisc->qstats,
+						qlen);
 			spin_unlock_bh(qdisc_lock(qdisc));
 		}
+
 		/* Reclaim root sleeping lock before completing stats */
 		if (d->lock)
 			spin_lock_bh(d->lock);
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 1ca84a288443..54bd9c1a8aa1 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -14,7 +14,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
 	  output.o input.o debug.o stream.o auth.o \
 	  offload.o stream_sched.o stream_sched_prio.o \
-	  stream_sched_rr.o
+	  stream_sched_rr.o stream_interleave.o
 
 sctp_probe-y := probe.o
 
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 69394f4d6091..837806dd5799 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -861,7 +861,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
 					0, spc_state, error, GFP_ATOMIC);
 		if (event)
-			sctp_ulpq_tail_event(&asoc->ulpq, event);
+			asoc->stream.si->enqueue_event(&asoc->ulpq, event);
 	}
 
 	/* Select new active and retran paths. */
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 7f8baa48e7c2..991a530c6b31 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -124,7 +124,7 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
 			ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent,
 							    error, GFP_ATOMIC);
 			if (ev)
-				sctp_ulpq_tail_event(&asoc->ulpq, ev);
+				asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
 		}
 
 		sctp_chunk_put(chunk);
@@ -191,7 +191,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 	 */
 	max_data = asoc->pathmtu -
 		   sctp_sk(asoc->base.sk)->pf->af->net_header_len -
-		   sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
+		   sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream);
 	max_data = SCTP_TRUNC4(max_data);
 
 	/* If the the peer requested that we authenticate DATA chunks
@@ -264,8 +264,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 				frag |= SCTP_DATA_SACK_IMM;
 		}
 
-		chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
-						 0, GFP_KERNEL);
+		chunk = asoc->stream.si->make_datafrag(asoc, sinfo, len, frag,
+						       GFP_KERNEL);
 		if (!chunk) {
 			err = -ENOMEM;
 			goto errout;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 4a865cd06d76..01a26ee051e3 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -313,6 +313,7 @@ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet,
 	/* We believe that this chunk is OK to add to the packet */
 	switch (chunk->chunk_hdr->type) {
 	case SCTP_CID_DATA:
+	case SCTP_CID_I_DATA:
 		/* Account for the data being in the packet */
 		sctp_packet_append_data(packet, chunk);
 		/* Disallow SACK bundling after DATA. */
@@ -724,7 +725,7 @@ static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet,
 	 * or delay in hopes of bundling a full sized packet.
 	 */
 	if (chunk->skb->len + q->out_qlen > transport->pathmtu -
-		packet->overhead - sizeof(struct sctp_data_chunk) - 4)
+	    packet->overhead - sctp_datachk_len(&chunk->asoc->stream) - 4)
 		/* Enough data queued to fill a packet */
 		return SCTP_XMIT_OK;
 
@@ -759,7 +760,7 @@ static void sctp_packet_append_data(struct sctp_packet *packet,
 
 	asoc->peer.rwnd = rwnd;
 	sctp_chunk_assign_tsn(chunk);
-	sctp_chunk_assign_ssn(chunk);
+	asoc->stream.si->assign_number(chunk);
 }
 
 static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet,
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 7d67feeeffc1..af9b5ebcae50 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -67,8 +67,6 @@ static void sctp_mark_missing(struct sctp_outq *q,
 			      __u32 highest_new_tsn,
 			      int count_of_newacks);
 
-static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
-
 static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
 
 /* Add data to the front of the queue. */
@@ -591,7 +589,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
 	 * following the procedures outlined in C1 - C5.
 	 */
 	if (reason == SCTP_RTXR_T3_RTX)
-		sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point);
+		q->asoc->stream.si->generate_ftsn(q, q->asoc->ctsn_ack_point);
 
 	/* Flush the queues only on timeout, since fast_rtx is only
 	 * triggered during sack processing and the queue
@@ -942,6 +940,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		case SCTP_CID_ECN_ECNE:
 		case SCTP_CID_ASCONF:
 		case SCTP_CID_FWD_TSN:
+		case SCTP_CID_I_FWD_TSN:
 		case SCTP_CID_RECONF:
 			status = sctp_packet_transmit_chunk(packet, chunk,
 							    one_packet, gfp);
@@ -956,7 +955,8 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 			 * sender MUST assure that at least one T3-rtx
 			 * timer is running.
 			 */
-			if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
+			if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN ||
+			    chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) {
 				sctp_transport_reset_t3_rtx(transport);
 				transport->last_time_sent = jiffies;
 			}
@@ -1372,7 +1372,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
 
 	asoc->peer.rwnd = sack_a_rwnd;
 
-	sctp_generate_fwdtsn(q, sack_ctsn);
+	asoc->stream.si->generate_ftsn(q, sack_ctsn);
 
 	pr_debug("%s: sack cumulative tsn ack:0x%x\n", __func__, sack_ctsn);
 	pr_debug("%s: cumulative tsn ack of assoc:%p is 0x%x, "
@@ -1795,7 +1795,7 @@ static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist,
 }
 
 /* Create and add a fwdtsn chunk to the outq's control queue if needed. */
-static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
+void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
 {
 	struct sctp_association *asoc = q->asoc;
 	struct sctp_chunk *ftsn_chunk = NULL;
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 26b4be6b4172..4545bc2aff84 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -288,12 +288,8 @@ struct sctp_ht_iter {
 static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct sctp_ht_iter *iter = seq->private;
-	int err = sctp_transport_walk_start(&iter->hti);
 
-	if (err) {
-		iter->start_fail = 1;
-		return ERR_PTR(err);
-	}
+	sctp_transport_walk_start(&iter->hti);
 
 	iter->start_fail = 0;
 	return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9bf575f2e8ed..b9b269cf615e 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -228,7 +228,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 	struct sctp_inithdr init;
 	union sctp_params addrs;
 	struct sctp_sock *sp;
-	__u8 extensions[4];
+	__u8 extensions[5];
 	size_t chunksize;
 	__be16 types[2];
 	int num_ext = 0;
@@ -278,6 +278,11 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 	if (sp->adaptation_ind)
 		chunksize += sizeof(aiparam);
 
+	if (sp->strm_interleave) {
+		extensions[num_ext] = SCTP_CID_I_DATA;
+		num_ext += 1;
+	}
+
 	chunksize += vparam_len;
 
 	/* Account for AUTH related parameters */
@@ -392,7 +397,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 	struct sctp_inithdr initack;
 	union sctp_params addrs;
 	struct sctp_sock *sp;
-	__u8 extensions[4];
+	__u8 extensions[5];
 	size_t chunksize;
 	int num_ext = 0;
 	int cookie_len;
@@ -442,6 +447,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 	if (sp->adaptation_ind)
 		chunksize += sizeof(aiparam);
 
+	if (asoc->intl_enable) {
+		extensions[num_ext] = SCTP_CID_I_DATA;
+		num_ext += 1;
+	}
+
 	if (asoc->peer.auth_capable) {
 		auth_random = (struct sctp_paramhdr *)asoc->c.auth_random;
 		chunksize += ntohs(auth_random->length);
@@ -711,38 +721,31 @@ nodata:
 /* Make a DATA chunk for the given association from the provided
  * parameters.  However, do not populate the data payload.
  */
-struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
+struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
 					    const struct sctp_sndrcvinfo *sinfo,
-					    int data_len, __u8 flags, __u16 ssn,
-					    gfp_t gfp)
+					    int len, __u8 flags, gfp_t gfp)
 {
 	struct sctp_chunk *retval;
 	struct sctp_datahdr dp;
-	int chunk_len;
 
 	/* We assign the TSN as LATE as possible, not here when
 	 * creating the chunk.
 	 */
-	dp.tsn = 0;
+	memset(&dp, 0, sizeof(dp));
+	dp.ppid = sinfo->sinfo_ppid;
 	dp.stream = htons(sinfo->sinfo_stream);
-	dp.ppid   = sinfo->sinfo_ppid;
 
 	/* Set the flags for an unordered send.  */
-	if (sinfo->sinfo_flags & SCTP_UNORDERED) {
+	if (sinfo->sinfo_flags & SCTP_UNORDERED)
 		flags |= SCTP_DATA_UNORDERED;
-		dp.ssn = 0;
-	} else
-		dp.ssn = htons(ssn);
 
-	chunk_len = sizeof(dp) + data_len;
-	retval = sctp_make_data(asoc, flags, chunk_len, gfp);
+	retval = sctp_make_data(asoc, flags, sizeof(dp) + len, gfp);
 	if (!retval)
-		goto nodata;
+		return NULL;
 
 	retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp);
 	memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
 
-nodata:
 	return retval;
 }
 
@@ -1415,6 +1418,12 @@ static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
 	return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp);
 }
 
+struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc,
+				   __u8 flags, int paylen, gfp_t gfp)
+{
+	return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp);
+}
+
 static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
 					    __u8 type, __u8 flags, int paylen,
 					    gfp_t gfp)
@@ -2032,6 +2041,10 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
 			if (net->sctp.addip_enable)
 				asoc->peer.asconf_capable = 1;
 			break;
+		case SCTP_CID_I_DATA:
+			if (sctp_sk(asoc->base.sk)->strm_interleave)
+				asoc->intl_enable = 1;
+			break;
 		default:
 			break;
 		}
@@ -3523,6 +3536,30 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 	return retval;
 }
 
+struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc,
+				     __u32 new_cum_tsn, size_t nstreams,
+				     struct sctp_ifwdtsn_skip *skiplist)
+{
+	struct sctp_chunk *retval = NULL;
+	struct sctp_ifwdtsn_hdr ftsn_hdr;
+	size_t hint;
+
+	hint = (nstreams + 1) * sizeof(__u32);
+
+	retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint,
+				   GFP_ATOMIC);
+	if (!retval)
+		return NULL;
+
+	ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn);
+	retval->subh.ifwdtsn_hdr =
+		sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr);
+
+	sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist);
+
+	return retval;
+}
+
 /* RE-CONFIG 3.1 (RE-CONFIG chunk)
  *   0                   1                   2                   3
  *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index df94d77401e7..16ddf2ca1438 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -632,7 +632,7 @@ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands,
 	struct sctp_chunk *abort;
 
 	/* Cancel any partial delivery in progress. */
-	sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+	asoc->stream.si->abort_pd(&asoc->ulpq, GFP_ATOMIC);
 
 	if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT)
 		event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST,
@@ -972,7 +972,7 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds,
 		if (!ev)
 			return;
 
-		sctp_ulpq_tail_event(&asoc->ulpq, ev);
+		asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
 
 		switch (err_hdr->cause) {
 		case SCTP_ERROR_UNKNOWN_CHUNK:
@@ -1007,18 +1007,6 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds,
 	}
 }
 
-/* Process variable FWDTSN chunk information. */
-static void sctp_cmd_process_fwdtsn(struct sctp_ulpq *ulpq,
-				    struct sctp_chunk *chunk)
-{
-	struct sctp_fwdtsn_skip *skip;
-
-	/* Walk through all the skipped SSNs */
-	sctp_walk_fwdtsn(skip, chunk) {
-		sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn));
-	}
-}
-
 /* Helper function to remove the association non-primary peer
  * transports.
  */
@@ -1058,7 +1046,7 @@ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands,
 					    asoc->c.sinit_max_instreams,
 					    NULL, GFP_ATOMIC);
 	if (ev)
-		sctp_ulpq_tail_event(&asoc->ulpq, ev);
+		asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
 }
 
 /* Helper function to generate an adaptation indication event */
@@ -1070,7 +1058,7 @@ static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands,
 	ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC);
 
 	if (ev)
-		sctp_ulpq_tail_event(&asoc->ulpq, ev);
+		asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
 }
 
 
@@ -1368,18 +1356,12 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 			break;
 
 		case SCTP_CMD_REPORT_FWDTSN:
-			/* Move the Cumulattive TSN Ack ahead. */
-			sctp_tsnmap_skip(&asoc->peer.tsn_map, cmd->obj.u32);
-
-			/* purge the fragmentation queue */
-			sctp_ulpq_reasm_flushtsn(&asoc->ulpq, cmd->obj.u32);
-
-			/* Abort any in progress partial delivery. */
-			sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+			asoc->stream.si->report_ftsn(&asoc->ulpq, cmd->obj.u32);
 			break;
 
 		case SCTP_CMD_PROCESS_FWDTSN:
-			sctp_cmd_process_fwdtsn(&asoc->ulpq, cmd->obj.chunk);
+			asoc->stream.si->handle_ftsn(&asoc->ulpq,
+						     cmd->obj.chunk);
 			break;
 
 		case SCTP_CMD_GEN_SACK:
@@ -1483,8 +1465,9 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 			pr_debug("%s: sm_sideff: chunk_up:%p, ulpq:%p\n",
 				 __func__, cmd->obj.chunk, &asoc->ulpq);
 
-			sctp_ulpq_tail_data(&asoc->ulpq, cmd->obj.chunk,
-					    GFP_ATOMIC);
+			asoc->stream.si->ulpevent_data(&asoc->ulpq,
+						       cmd->obj.chunk,
+						       GFP_ATOMIC);
 			break;
 
 		case SCTP_CMD_EVENT_ULP:
@@ -1492,7 +1475,8 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 			pr_debug("%s: sm_sideff: event_up:%p, ulpq:%p\n",
 				 __func__, cmd->obj.ulpevent, &asoc->ulpq);
 
-			sctp_ulpq_tail_event(&asoc->ulpq, cmd->obj.ulpevent);
+			asoc->stream.si->enqueue_event(&asoc->ulpq,
+						       cmd->obj.ulpevent);
 			break;
 
 		case SCTP_CMD_REPLY:
@@ -1729,12 +1713,13 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 			break;
 
 		case SCTP_CMD_PART_DELIVER:
-			sctp_ulpq_partial_delivery(&asoc->ulpq, GFP_ATOMIC);
+			asoc->stream.si->start_pd(&asoc->ulpq, GFP_ATOMIC);
 			break;
 
 		case SCTP_CMD_RENEGE:
-			sctp_ulpq_renege(&asoc->ulpq, cmd->obj.chunk,
-					 GFP_ATOMIC);
+			asoc->stream.si->renege_events(&asoc->ulpq,
+						       cmd->obj.chunk,
+						       GFP_ATOMIC);
 			break;
 
 		case SCTP_CMD_SETUP_T4:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8f8ccded13e4..541f34735346 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3013,7 +3013,7 @@ enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net,
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 	}
 
-	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_data_chunk)))
+	if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
 						  commands);
 
@@ -3034,7 +3034,7 @@ enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net,
 	case SCTP_IERROR_PROTO_VIOLATION:
 		return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
 					       (u8 *)chunk->subh.data_hdr,
-					       sizeof(struct sctp_datahdr));
+					       sctp_datahdr_len(&asoc->stream));
 	default:
 		BUG();
 	}
@@ -3133,7 +3133,7 @@ enum sctp_disposition sctp_sf_eat_data_fast_4_4(
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 	}
 
-	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_data_chunk)))
+	if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
 						  commands);
 
@@ -3150,7 +3150,7 @@ enum sctp_disposition sctp_sf_eat_data_fast_4_4(
 	case SCTP_IERROR_PROTO_VIOLATION:
 		return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
 					       (u8 *)chunk->subh.data_hdr,
-					       sizeof(struct sctp_datahdr));
+					       sctp_datahdr_len(&asoc->stream));
 	default:
 		BUG();
 	}
@@ -3957,7 +3957,6 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net,
 {
 	struct sctp_fwdtsn_hdr *fwdtsn_hdr;
 	struct sctp_chunk *chunk = arg;
-	struct sctp_fwdtsn_skip *skip;
 	__u16 len;
 	__u32 tsn;
 
@@ -3971,7 +3970,7 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net,
 		return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
 
 	/* Make sure that the FORWARD_TSN chunk has valid length.  */
-	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
+	if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
 						  commands);
 
@@ -3990,14 +3989,11 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net,
 	if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0)
 		goto discard_noforce;
 
-	/* Silently discard the chunk if stream-id is not valid */
-	sctp_walk_fwdtsn(skip, chunk) {
-		if (ntohs(skip->stream) >= asoc->stream.incnt)
-			goto discard_noforce;
-	}
+	if (!asoc->stream.si->validate_ftsn(chunk))
+		goto discard_noforce;
 
 	sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn));
-	if (len > sizeof(struct sctp_fwdtsn_hdr))
+	if (len > sctp_ftsnhdr_len(&asoc->stream))
 		sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
 				SCTP_CHUNK(chunk));
 
@@ -4028,7 +4024,6 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast(
 {
 	struct sctp_fwdtsn_hdr *fwdtsn_hdr;
 	struct sctp_chunk *chunk = arg;
-	struct sctp_fwdtsn_skip *skip;
 	__u16 len;
 	__u32 tsn;
 
@@ -4042,7 +4037,7 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast(
 		return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
 
 	/* Make sure that the FORWARD_TSN chunk has a valid length.  */
-	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
+	if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream)))
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
 						  commands);
 
@@ -4061,14 +4056,11 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast(
 	if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0)
 		goto gen_shutdown;
 
-	/* Silently discard the chunk if stream-id is not valid */
-	sctp_walk_fwdtsn(skip, chunk) {
-		if (ntohs(skip->stream) >= asoc->stream.incnt)
-			goto gen_shutdown;
-	}
+	if (!asoc->stream.si->validate_ftsn(chunk))
+		goto gen_shutdown;
 
 	sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn));
-	if (len > sizeof(struct sctp_fwdtsn_hdr))
+	if (len > sctp_ftsnhdr_len(&asoc->stream))
 		sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
 				SCTP_CHUNK(chunk));
 
@@ -6244,14 +6236,12 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 	struct sctp_chunk *err;
 	enum sctp_verb deliver;
 	size_t datalen;
-	u8 ordered = 0;
-	u16 ssn, sid;
 	__u32 tsn;
 	int tmp;
 
 	data_hdr = (struct sctp_datahdr *)chunk->skb->data;
 	chunk->subh.data_hdr = data_hdr;
-	skb_pull(chunk->skb, sizeof(*data_hdr));
+	skb_pull(chunk->skb, sctp_datahdr_len(&asoc->stream));
 
 	tsn = ntohl(data_hdr->tsn);
 	pr_debug("%s: TSN 0x%x\n", __func__, tsn);
@@ -6299,7 +6289,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 	 * Actually, allow a little bit of overflow (up to a MTU).
 	 */
 	datalen = ntohs(chunk->chunk_hdr->length);
-	datalen -= sizeof(struct sctp_data_chunk);
+	datalen -= sctp_datachk_len(&asoc->stream);
 
 	deliver = SCTP_CMD_CHUNK_ULP;
 
@@ -6394,7 +6384,6 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 		SCTP_INC_STATS(net, SCTP_MIB_INORDERCHUNKS);
 		if (chunk->asoc)
 			chunk->asoc->stats.iodchunks++;
-		ordered = 1;
 	}
 
 	/* RFC 2960 6.5 Stream Identifier and Stream Sequence Number
@@ -6405,8 +6394,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 	 * with cause set to "Invalid Stream Identifier" (See Section 3.3.10)
 	 * and discard the DATA chunk.
 	 */
-	sid = ntohs(data_hdr->stream);
-	if (sid >= asoc->stream.incnt) {
+	if (ntohs(data_hdr->stream) >= asoc->stream.incnt) {
 		/* Mark tsn as received even though we drop it */
 		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));
 
@@ -6427,8 +6415,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
 	 * SSN is smaller then the next expected one.  If it is, it wrapped
 	 * and is invalid.
 	 */
-	ssn = ntohs(data_hdr->ssn);
-	if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->stream, in, sid)))
+	if (!asoc->stream.si->validate_data(chunk))
 		return SCTP_IERROR_PROTO_VIOLATION;
 
 	/* Send the data up to the user.  Note:  Schedule  the
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 79b6bee5b768..691d9dc620e3 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -985,11 +985,14 @@ static const struct sctp_sm_table_entry *sctp_chunk_event_lookup(
 	if (state > SCTP_STATE_MAX)
 		return &bug;
 
+	if (cid == SCTP_CID_I_DATA)
+		cid = SCTP_CID_DATA;
+
 	if (cid <= SCTP_CID_BASE_MAX)
 		return &chunk_event_table[cid][state];
 
 	if (net->sctp.prsctp_enable) {
-		if (cid == SCTP_CID_FWD_TSN)
+		if (cid == SCTP_CID_FWD_TSN || cid == SCTP_CID_I_FWD_TSN)
 			return &prsctp_chunk_event_table[0][state];
 	}
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3253f724a995..5e4100df7bae 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -201,6 +201,22 @@ static void sctp_for_each_tx_datachunk(struct sctp_association *asoc,
 		cb(chunk);
 }
 
+static void sctp_for_each_rx_skb(struct sctp_association *asoc, struct sock *sk,
+				 void (*cb)(struct sk_buff *, struct sock *))
+
+{
+	struct sk_buff *skb, *tmp;
+
+	sctp_skb_for_each(skb, &asoc->ulpq.lobby, tmp)
+		cb(skb, sk);
+
+	sctp_skb_for_each(skb, &asoc->ulpq.reasm, tmp)
+		cb(skb, sk);
+
+	sctp_skb_for_each(skb, &asoc->ulpq.reasm_uo, tmp)
+		cb(skb, sk);
+}
+
 /* Verify that this is a valid address. */
 static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
 				   int len)
@@ -1554,6 +1570,7 @@ static void sctp_close(struct sock *sk, long timeout)
 
 		if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) ||
 		    !skb_queue_empty(&asoc->ulpq.reasm) ||
+		    !skb_queue_empty(&asoc->ulpq.reasm_uo) ||
 		    (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) {
 			struct sctp_chunk *chunk;
 
@@ -2002,7 +2019,20 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		if (err < 0)
 			goto out_free;
 
-		wait_connect = true;
+		/* If stream interleave is enabled, wait_connect has to be
+		 * done earlier than data enqueue, as it needs to make data
+		 * or idata according to asoc->intl_enable which is set
+		 * after connection is done.
+		 */
+		if (sctp_sk(asoc->base.sk)->strm_interleave) {
+			timeo = sock_sndtimeo(sk, 0);
+			err = sctp_wait_for_connect(asoc, &timeo);
+			if (err)
+				goto out_unlock;
+		} else {
+			wait_connect = true;
+		}
+
 		pr_debug("%s: we associated primitively\n", __func__);
 	}
 
@@ -2281,7 +2311,7 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval,
 			if (!event)
 				return -ENOMEM;
 
-			sctp_ulpq_tail_event(&asoc->ulpq, event);
+			asoc->stream.si->enqueue_event(&asoc->ulpq, event);
 		}
 	}
 
@@ -3180,7 +3210,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
 		if (val == 0) {
 			val = asoc->pathmtu - sp->pf->af->net_header_len;
 			val -= sizeof(struct sctphdr) +
-			       sizeof(struct sctp_data_chunk);
+			       sctp_datachk_len(&asoc->stream);
 		}
 		asoc->user_frag = val;
 		asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
@@ -3350,7 +3380,10 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk,
 	if (get_user(val, (int __user *)optval))
 		return -EFAULT;
 
-	sctp_sk(sk)->frag_interleave = (val == 0) ? 0 : 1;
+	sctp_sk(sk)->frag_interleave = !!val;
+
+	if (!sctp_sk(sk)->frag_interleave)
+		sctp_sk(sk)->strm_interleave = 0;
 
 	return 0;
 }
@@ -4023,6 +4056,40 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_interleaving_supported(struct sock *sk,
+						  char __user *optval,
+						  unsigned int optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct net *net = sock_net(sk);
+	struct sctp_assoc_value params;
+	int retval = -EINVAL;
+
+	if (optlen < sizeof(params))
+		goto out;
+
+	optlen = sizeof(params);
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (params.assoc_id)
+		goto out;
+
+	if (!net->sctp.intl_enable || !sp->frag_interleave) {
+		retval = -EPERM;
+		goto out;
+	}
+
+	sp->strm_interleave = !!params.assoc_value;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4210,6 +4277,10 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_STREAM_SCHEDULER_VALUE:
 		retval = sctp_setsockopt_scheduler_value(sk, optval, optlen);
 		break;
+	case SCTP_INTERLEAVING_SUPPORTED:
+		retval = sctp_setsockopt_interleaving_supported(sk, optval,
+								optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -4680,20 +4751,11 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
 EXPORT_SYMBOL_GPL(sctp_get_sctp_info);
 
 /* use callback to avoid exporting the core structure */
-int sctp_transport_walk_start(struct rhashtable_iter *iter)
+void sctp_transport_walk_start(struct rhashtable_iter *iter)
 {
-	int err;
-
 	rhltable_walk_enter(&sctp_transport_hashtable, iter);
 
-	err = rhashtable_walk_start(iter);
-	if (err && err != -EAGAIN) {
-		rhashtable_walk_stop(iter);
-		rhashtable_walk_exit(iter);
-		return err;
-	}
-
-	return 0;
+	rhashtable_walk_start(iter);
 }
 
 void sctp_transport_walk_stop(struct rhashtable_iter *iter)
@@ -4784,12 +4846,10 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
 			    struct net *net, int *pos, void *p) {
 	struct rhashtable_iter hti;
 	struct sctp_transport *tsp;
-	int ret;
+	int ret = 0;
 
 again:
-	ret = sctp_transport_walk_start(&hti);
-	if (ret)
-		return ret;
+	sctp_transport_walk_start(&hti);
 
 	tsp = sctp_transport_get_idx(net, &hti, *pos + 1);
 	for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) {
@@ -6984,6 +7044,47 @@ out:
 	return retval;
 }
 
+static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len,
+						  char __user *optval,
+						  int __user *optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EFAULT;
+
+	if (len < sizeof(params)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	len = sizeof(params);
+	if (copy_from_user(&params, optval, len))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (asoc) {
+		params.assoc_value = asoc->intl_enable;
+	} else if (!params.assoc_id) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		params.assoc_value = sp->strm_interleave;
+	} else {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	if (put_user(len, optlen))
+		goto out;
+
+	if (copy_to_user(optval, &params, len))
+		goto out;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -7174,6 +7275,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_scheduler_value(sk, len, optval,
 							 optlen);
 		break;
+	case SCTP_INTERLEAVING_SUPPORTED:
+		retval = sctp_getsockopt_interleaving_supported(sk, len, optval,
+								optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -8411,11 +8516,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 
 	}
 
-	sctp_skb_for_each(skb, &assoc->ulpq.reasm, tmp)
-		sctp_skb_set_owner_r_frag(skb, newsk);
-
-	sctp_skb_for_each(skb, &assoc->ulpq.lobby, tmp)
-		sctp_skb_set_owner_r_frag(skb, newsk);
+	sctp_for_each_rx_skb(assoc, newsk, sctp_skb_set_owner_r_frag);
 
 	/* Set the type of socket to indicate that it is peeled off from the
 	 * original UDP-style socket or created with the accept() call on a
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 76ea66be0bbe..06b644dd858c 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -167,6 +167,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
 	sched->init(stream);
 
 in:
+	sctp_stream_interleave_init(stream);
 	if (!incnt)
 		goto out;
 
@@ -215,11 +216,13 @@ void sctp_stream_clear(struct sctp_stream *stream)
 {
 	int i;
 
-	for (i = 0; i < stream->outcnt; i++)
-		stream->out[i].ssn = 0;
+	for (i = 0; i < stream->outcnt; i++) {
+		stream->out[i].mid = 0;
+		stream->out[i].mid_uo = 0;
+	}
 
 	for (i = 0; i < stream->incnt; i++)
-		stream->in[i].ssn = 0;
+		stream->in[i].mid = 0;
 }
 
 void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
@@ -606,10 +609,10 @@ struct sctp_chunk *sctp_process_strreset_outreq(
 		}
 
 		for (i = 0; i < nums; i++)
-			stream->in[ntohs(str_p[i])].ssn = 0;
+			stream->in[ntohs(str_p[i])].mid = 0;
 	} else {
 		for (i = 0; i < stream->incnt; i++)
-			stream->in[i].ssn = 0;
+			stream->in[i].mid = 0;
 	}
 
 	result = SCTP_STRRESET_PERFORMED;
@@ -753,8 +756,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
 	 *     performed.
 	 */
 	max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map);
-	sctp_ulpq_reasm_flushtsn(&asoc->ulpq, max_tsn_seen);
-	sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+	asoc->stream.si->report_ftsn(&asoc->ulpq, max_tsn_seen);
 
 	/* G1: Compute an appropriate value for the Receiver's Next TSN -- the
 	 *     TSN that the peer should use to send the next DATA chunk.  The
@@ -783,10 +785,12 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
 	/* G5:  The next expected and outgoing SSNs MUST be reset to 0 for all
 	 *      incoming and outgoing streams.
 	 */
-	for (i = 0; i < stream->outcnt; i++)
-		stream->out[i].ssn = 0;
+	for (i = 0; i < stream->outcnt; i++) {
+		stream->out[i].mid = 0;
+		stream->out[i].mid_uo = 0;
+	}
 	for (i = 0; i < stream->incnt; i++)
-		stream->in[i].ssn = 0;
+		stream->in[i].mid = 0;
 
 	result = SCTP_STRRESET_PERFORMED;
 
@@ -976,11 +980,15 @@ struct sctp_chunk *sctp_process_strreset_resp(
 
 		if (result == SCTP_STRRESET_PERFORMED) {
 			if (nums) {
-				for (i = 0; i < nums; i++)
-					stream->out[ntohs(str_p[i])].ssn = 0;
+				for (i = 0; i < nums; i++) {
+					stream->out[ntohs(str_p[i])].mid = 0;
+					stream->out[ntohs(str_p[i])].mid_uo = 0;
+				}
 			} else {
-				for (i = 0; i < stream->outcnt; i++)
-					stream->out[i].ssn = 0;
+				for (i = 0; i < stream->outcnt; i++) {
+					stream->out[i].mid = 0;
+					stream->out[i].mid_uo = 0;
+				}
 			}
 
 			flags = SCTP_STREAM_RESET_OUTGOING_SSN;
@@ -1023,8 +1031,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
 						&asoc->peer.tsn_map);
 			LIST_HEAD(temp);
 
-			sctp_ulpq_reasm_flushtsn(&asoc->ulpq, mtsn);
-			sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+			asoc->stream.si->report_ftsn(&asoc->ulpq, mtsn);
 
 			sctp_tsnmap_init(&asoc->peer.tsn_map,
 					 SCTP_TSN_MAP_INITIAL,
@@ -1042,10 +1049,12 @@ struct sctp_chunk *sctp_process_strreset_resp(
 			asoc->ctsn_ack_point = asoc->next_tsn - 1;
 			asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
 
-			for (i = 0; i < stream->outcnt; i++)
-				stream->out[i].ssn = 0;
+			for (i = 0; i < stream->outcnt; i++) {
+				stream->out[i].mid = 0;
+				stream->out[i].mid_uo = 0;
+			}
 			for (i = 0; i < stream->incnt; i++)
-				stream->in[i].ssn = 0;
+				stream->in[i].mid = 0;
 		}
 
 		for (i = 0; i < stream->outcnt; i++)
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
new file mode 100644
index 000000000000..8c7cf8f08711
--- /dev/null
+++ b/net/sctp/stream_interleave.c
@@ -0,0 +1,1334 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp stream queue/scheduling.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email addresched(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Xin Long <lucien.xin@gmail.com>
+ */
+
+#include <net/busy_poll.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/ulpevent.h>
+#include <linux/sctp.h>
+
+static struct sctp_chunk *sctp_make_idatafrag_empty(
+					const struct sctp_association *asoc,
+					const struct sctp_sndrcvinfo *sinfo,
+					int len, __u8 flags, gfp_t gfp)
+{
+	struct sctp_chunk *retval;
+	struct sctp_idatahdr dp;
+
+	memset(&dp, 0, sizeof(dp));
+	dp.stream = htons(sinfo->sinfo_stream);
+
+	if (sinfo->sinfo_flags & SCTP_UNORDERED)
+		flags |= SCTP_DATA_UNORDERED;
+
+	retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp);
+	if (!retval)
+		return NULL;
+
+	retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp);
+	memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
+
+	return retval;
+}
+
+static void sctp_chunk_assign_mid(struct sctp_chunk *chunk)
+{
+	struct sctp_stream *stream;
+	struct sctp_chunk *lchunk;
+	__u32 cfsn = 0;
+	__u16 sid;
+
+	if (chunk->has_mid)
+		return;
+
+	sid = sctp_chunk_stream_no(chunk);
+	stream = &chunk->asoc->stream;
+
+	list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) {
+		struct sctp_idatahdr *hdr;
+		__u32 mid;
+
+		lchunk->has_mid = 1;
+
+		hdr = lchunk->subh.idata_hdr;
+
+		if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)
+			hdr->ppid = lchunk->sinfo.sinfo_ppid;
+		else
+			hdr->fsn = htonl(cfsn++);
+
+		if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
+			mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ?
+				sctp_mid_uo_next(stream, out, sid) :
+				sctp_mid_uo_peek(stream, out, sid);
+		} else {
+			mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ?
+				sctp_mid_next(stream, out, sid) :
+				sctp_mid_peek(stream, out, sid);
+		}
+		hdr->mid = htonl(mid);
+	}
+}
+
+static bool sctp_validate_data(struct sctp_chunk *chunk)
+{
+	const struct sctp_stream *stream;
+	__u16 sid, ssn;
+
+	if (chunk->chunk_hdr->type != SCTP_CID_DATA)
+		return false;
+
+	if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+		return true;
+
+	stream = &chunk->asoc->stream;
+	sid = sctp_chunk_stream_no(chunk);
+	ssn = ntohs(chunk->subh.data_hdr->ssn);
+
+	return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid));
+}
+
+static bool sctp_validate_idata(struct sctp_chunk *chunk)
+{
+	struct sctp_stream *stream;
+	__u32 mid;
+	__u16 sid;
+
+	if (chunk->chunk_hdr->type != SCTP_CID_I_DATA)
+		return false;
+
+	if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+		return true;
+
+	stream = &chunk->asoc->stream;
+	sid = sctp_chunk_stream_no(chunk);
+	mid = ntohl(chunk->subh.idata_hdr->mid);
+
+	return !MID_lt(mid, sctp_mid_peek(stream, in, sid));
+}
+
+static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq,
+				  struct sctp_ulpevent *event)
+{
+	struct sctp_ulpevent *cevent;
+	struct sk_buff *pos;
+
+	pos = skb_peek_tail(&ulpq->reasm);
+	if (!pos) {
+		__skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+		return;
+	}
+
+	cevent = sctp_skb2event(pos);
+
+	if (event->stream == cevent->stream &&
+	    event->mid == cevent->mid &&
+	    (cevent->msg_flags & SCTP_DATA_FIRST_FRAG ||
+	     (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+	      event->fsn > cevent->fsn))) {
+		__skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+		return;
+	}
+
+	if ((event->stream == cevent->stream &&
+	     MID_lt(cevent->mid, event->mid)) ||
+	    event->stream > cevent->stream) {
+		__skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+		return;
+	}
+
+	skb_queue_walk(&ulpq->reasm, pos) {
+		cevent = sctp_skb2event(pos);
+
+		if (event->stream < cevent->stream ||
+		    (event->stream == cevent->stream &&
+		     MID_lt(event->mid, cevent->mid)))
+			break;
+
+		if (event->stream == cevent->stream &&
+		    event->mid == cevent->mid &&
+		    !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+		    (event->msg_flags & SCTP_DATA_FIRST_FRAG ||
+		     event->fsn < cevent->fsn))
+			break;
+	}
+
+	__skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event));
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_partial(
+						struct sctp_ulpq *ulpq,
+						struct sctp_ulpevent *event)
+{
+	struct sk_buff *first_frag = NULL;
+	struct sk_buff *last_frag = NULL;
+	struct sctp_ulpevent *retval;
+	struct sctp_stream_in *sin;
+	struct sk_buff *pos;
+	__u32 next_fsn = 0;
+	int is_last = 0;
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+	skb_queue_walk(&ulpq->reasm, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		if (cevent->stream < event->stream)
+			continue;
+
+		if (cevent->stream > event->stream ||
+		    cevent->mid != sin->mid)
+			break;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			goto out;
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (!first_frag) {
+				if (cevent->fsn == sin->fsn) {
+					first_frag = pos;
+					last_frag = pos;
+					next_fsn = cevent->fsn + 1;
+				}
+			} else if (cevent->fsn == next_fsn) {
+				last_frag = pos;
+				next_fsn++;
+			} else {
+				goto out;
+			}
+			break;
+		case SCTP_DATA_LAST_FRAG:
+			if (!first_frag) {
+				if (cevent->fsn == sin->fsn) {
+					first_frag = pos;
+					last_frag = pos;
+					next_fsn = 0;
+					is_last = 1;
+				}
+			} else if (cevent->fsn == next_fsn) {
+				last_frag = pos;
+				next_fsn = 0;
+				is_last = 1;
+			}
+			goto out;
+		default:
+			goto out;
+		}
+	}
+
+out:
+	if (!first_frag)
+		return NULL;
+
+	retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+					     &ulpq->reasm, first_frag,
+					     last_frag);
+	if (retval) {
+		sin->fsn = next_fsn;
+		if (is_last) {
+			retval->msg_flags |= MSG_EOR;
+			sin->pd_mode = 0;
+		}
+	}
+
+	return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_reassembled(
+						struct sctp_ulpq *ulpq,
+						struct sctp_ulpevent *event)
+{
+	struct sctp_association *asoc = ulpq->asoc;
+	struct sk_buff *pos, *first_frag = NULL;
+	struct sctp_ulpevent *retval = NULL;
+	struct sk_buff *pd_first = NULL;
+	struct sk_buff *pd_last = NULL;
+	struct sctp_stream_in *sin;
+	__u32 next_fsn = 0;
+	__u32 pd_point = 0;
+	__u32 pd_len = 0;
+	__u32 mid = 0;
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+	skb_queue_walk(&ulpq->reasm, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		if (cevent->stream < event->stream)
+			continue;
+		if (cevent->stream > event->stream)
+			break;
+
+		if (MID_lt(cevent->mid, event->mid))
+			continue;
+		if (MID_lt(event->mid, cevent->mid))
+			break;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			if (cevent->mid == sin->mid) {
+				pd_first = pos;
+				pd_last = pos;
+				pd_len = pos->len;
+			}
+
+			first_frag = pos;
+			next_fsn = 0;
+			mid = cevent->mid;
+			break;
+
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (first_frag && cevent->mid == mid &&
+			    cevent->fsn == next_fsn) {
+				next_fsn++;
+				if (pd_first) {
+					pd_last = pos;
+					pd_len += pos->len;
+				}
+			} else {
+				first_frag = NULL;
+			}
+			break;
+
+		case SCTP_DATA_LAST_FRAG:
+			if (first_frag && cevent->mid == mid &&
+			    cevent->fsn == next_fsn)
+				goto found;
+			else
+				first_frag = NULL;
+			break;
+		}
+	}
+
+	if (!pd_first)
+		goto out;
+
+	pd_point = sctp_sk(asoc->base.sk)->pd_point;
+	if (pd_point && pd_point <= pd_len) {
+		retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+						     &ulpq->reasm,
+						     pd_first, pd_last);
+		if (retval) {
+			sin->fsn = next_fsn;
+			sin->pd_mode = 1;
+		}
+	}
+	goto out;
+
+found:
+	retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+					     &ulpq->reasm,
+					     first_frag, pos);
+	if (retval)
+		retval->msg_flags |= MSG_EOR;
+
+out:
+	return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq,
+					     struct sctp_ulpevent *event)
+{
+	struct sctp_ulpevent *retval = NULL;
+	struct sctp_stream_in *sin;
+
+	if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) {
+		event->msg_flags |= MSG_EOR;
+		return event;
+	}
+
+	sctp_intl_store_reasm(ulpq, event);
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+	if (sin->pd_mode && event->mid == sin->mid &&
+	    event->fsn == sin->fsn)
+		retval = sctp_intl_retrieve_partial(ulpq, event);
+
+	if (!retval)
+		retval = sctp_intl_retrieve_reassembled(ulpq, event);
+
+	return retval;
+}
+
+static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq,
+				    struct sctp_ulpevent *event)
+{
+	struct sctp_ulpevent *cevent;
+	struct sk_buff *pos;
+
+	pos = skb_peek_tail(&ulpq->lobby);
+	if (!pos) {
+		__skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+		return;
+	}
+
+	cevent = (struct sctp_ulpevent *)pos->cb;
+	if (event->stream == cevent->stream &&
+	    MID_lt(cevent->mid, event->mid)) {
+		__skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+		return;
+	}
+
+	if (event->stream > cevent->stream) {
+		__skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+		return;
+	}
+
+	skb_queue_walk(&ulpq->lobby, pos) {
+		cevent = (struct sctp_ulpevent *)pos->cb;
+
+		if (cevent->stream > event->stream)
+			break;
+
+		if (cevent->stream == event->stream &&
+		    MID_lt(event->mid, cevent->mid))
+			break;
+	}
+
+	__skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event));
+}
+
+static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq,
+				       struct sctp_ulpevent *event)
+{
+	struct sk_buff_head *event_list;
+	struct sctp_stream *stream;
+	struct sk_buff *pos, *tmp;
+	__u16 sid = event->stream;
+
+	stream  = &ulpq->asoc->stream;
+	event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev;
+
+	sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
+		struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb;
+
+		if (cevent->stream > sid)
+			break;
+
+		if (cevent->stream < sid)
+			continue;
+
+		if (cevent->mid != sctp_mid_peek(stream, in, sid))
+			break;
+
+		sctp_mid_next(stream, in, sid);
+
+		__skb_unlink(pos, &ulpq->lobby);
+
+		__skb_queue_tail(event_list, pos);
+	}
+}
+
+static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq,
+					     struct sctp_ulpevent *event)
+{
+	struct sctp_stream *stream;
+	__u16 sid;
+
+	stream  = &ulpq->asoc->stream;
+	sid = event->stream;
+
+	if (event->mid != sctp_mid_peek(stream, in, sid)) {
+		sctp_intl_store_ordered(ulpq, event);
+		return NULL;
+	}
+
+	sctp_mid_next(stream, in, sid);
+
+	sctp_intl_retrieve_ordered(ulpq, event);
+
+	return event;
+}
+
+static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
+			      struct sctp_ulpevent *event)
+{
+	struct sk_buff *skb = sctp_event2skb(event);
+	struct sock *sk = ulpq->asoc->base.sk;
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sk_buff_head *skb_list;
+
+	skb_list = (struct sk_buff_head *)skb->prev;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN &&
+	    (sk->sk_shutdown & SEND_SHUTDOWN ||
+	     !sctp_ulpevent_is_notification(event)))
+		goto out_free;
+
+	if (!sctp_ulpevent_is_notification(event)) {
+		sk_mark_napi_id(sk, skb);
+		sk_incoming_cpu_update(sk);
+	}
+
+	if (!sctp_ulpevent_is_enabled(event, &sp->subscribe))
+		goto out_free;
+
+	if (skb_list)
+		skb_queue_splice_tail_init(skb_list,
+					   &sk->sk_receive_queue);
+	else
+		__skb_queue_tail(&sk->sk_receive_queue, skb);
+
+	if (!sp->data_ready_signalled) {
+		sp->data_ready_signalled = 1;
+		sk->sk_data_ready(sk);
+	}
+
+	return 1;
+
+out_free:
+	if (skb_list)
+		sctp_queue_purge_ulpevents(skb_list);
+	else
+		sctp_ulpevent_free(event);
+
+	return 0;
+}
+
+static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq,
+				     struct sctp_ulpevent *event)
+{
+	struct sctp_ulpevent *cevent;
+	struct sk_buff *pos;
+
+	pos = skb_peek_tail(&ulpq->reasm_uo);
+	if (!pos) {
+		__skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
+		return;
+	}
+
+	cevent = sctp_skb2event(pos);
+
+	if (event->stream == cevent->stream &&
+	    event->mid == cevent->mid &&
+	    (cevent->msg_flags & SCTP_DATA_FIRST_FRAG ||
+	     (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+	      event->fsn > cevent->fsn))) {
+		__skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
+		return;
+	}
+
+	if ((event->stream == cevent->stream &&
+	     MID_lt(cevent->mid, event->mid)) ||
+	    event->stream > cevent->stream) {
+		__skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
+		return;
+	}
+
+	skb_queue_walk(&ulpq->reasm_uo, pos) {
+		cevent = sctp_skb2event(pos);
+
+		if (event->stream < cevent->stream ||
+		    (event->stream == cevent->stream &&
+		     MID_lt(event->mid, cevent->mid)))
+			break;
+
+		if (event->stream == cevent->stream &&
+		    event->mid == cevent->mid &&
+		    !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) &&
+		    (event->msg_flags & SCTP_DATA_FIRST_FRAG ||
+		     event->fsn < cevent->fsn))
+			break;
+	}
+
+	__skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event));
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo(
+						struct sctp_ulpq *ulpq,
+						struct sctp_ulpevent *event)
+{
+	struct sk_buff *first_frag = NULL;
+	struct sk_buff *last_frag = NULL;
+	struct sctp_ulpevent *retval;
+	struct sctp_stream_in *sin;
+	struct sk_buff *pos;
+	__u32 next_fsn = 0;
+	int is_last = 0;
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+	skb_queue_walk(&ulpq->reasm_uo, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		if (cevent->stream < event->stream)
+			continue;
+		if (cevent->stream > event->stream)
+			break;
+
+		if (MID_lt(cevent->mid, sin->mid_uo))
+			continue;
+		if (MID_lt(sin->mid_uo, cevent->mid))
+			break;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			goto out;
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (!first_frag) {
+				if (cevent->fsn == sin->fsn_uo) {
+					first_frag = pos;
+					last_frag = pos;
+					next_fsn = cevent->fsn + 1;
+				}
+			} else if (cevent->fsn == next_fsn) {
+				last_frag = pos;
+				next_fsn++;
+			} else {
+				goto out;
+			}
+			break;
+		case SCTP_DATA_LAST_FRAG:
+			if (!first_frag) {
+				if (cevent->fsn == sin->fsn_uo) {
+					first_frag = pos;
+					last_frag = pos;
+					next_fsn = 0;
+					is_last = 1;
+				}
+			} else if (cevent->fsn == next_fsn) {
+				last_frag = pos;
+				next_fsn = 0;
+				is_last = 1;
+			}
+			goto out;
+		default:
+			goto out;
+		}
+	}
+
+out:
+	if (!first_frag)
+		return NULL;
+
+	retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+					     &ulpq->reasm_uo, first_frag,
+					     last_frag);
+	if (retval) {
+		sin->fsn_uo = next_fsn;
+		if (is_last) {
+			retval->msg_flags |= MSG_EOR;
+			sin->pd_mode_uo = 0;
+		}
+	}
+
+	return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo(
+						struct sctp_ulpq *ulpq,
+						struct sctp_ulpevent *event)
+{
+	struct sctp_association *asoc = ulpq->asoc;
+	struct sk_buff *pos, *first_frag = NULL;
+	struct sctp_ulpevent *retval = NULL;
+	struct sk_buff *pd_first = NULL;
+	struct sk_buff *pd_last = NULL;
+	struct sctp_stream_in *sin;
+	__u32 next_fsn = 0;
+	__u32 pd_point = 0;
+	__u32 pd_len = 0;
+	__u32 mid = 0;
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+
+	skb_queue_walk(&ulpq->reasm_uo, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		if (cevent->stream < event->stream)
+			continue;
+		if (cevent->stream > event->stream)
+			break;
+
+		if (MID_lt(cevent->mid, event->mid))
+			continue;
+		if (MID_lt(event->mid, cevent->mid))
+			break;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			if (!sin->pd_mode_uo) {
+				sin->mid_uo = cevent->mid;
+				pd_first = pos;
+				pd_last = pos;
+				pd_len = pos->len;
+			}
+
+			first_frag = pos;
+			next_fsn = 0;
+			mid = cevent->mid;
+			break;
+
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (first_frag && cevent->mid == mid &&
+			    cevent->fsn == next_fsn) {
+				next_fsn++;
+				if (pd_first) {
+					pd_last = pos;
+					pd_len += pos->len;
+				}
+			} else {
+				first_frag = NULL;
+			}
+			break;
+
+		case SCTP_DATA_LAST_FRAG:
+			if (first_frag && cevent->mid == mid &&
+			    cevent->fsn == next_fsn)
+				goto found;
+			else
+				first_frag = NULL;
+			break;
+		}
+	}
+
+	if (!pd_first)
+		goto out;
+
+	pd_point = sctp_sk(asoc->base.sk)->pd_point;
+	if (pd_point && pd_point <= pd_len) {
+		retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+						     &ulpq->reasm_uo,
+						     pd_first, pd_last);
+		if (retval) {
+			sin->fsn_uo = next_fsn;
+			sin->pd_mode_uo = 1;
+		}
+	}
+	goto out;
+
+found:
+	retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+					     &ulpq->reasm_uo,
+					     first_frag, pos);
+	if (retval)
+		retval->msg_flags |= MSG_EOR;
+
+out:
+	return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq,
+						struct sctp_ulpevent *event)
+{
+	struct sctp_ulpevent *retval = NULL;
+	struct sctp_stream_in *sin;
+
+	if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) {
+		event->msg_flags |= MSG_EOR;
+		return event;
+	}
+
+	sctp_intl_store_reasm_uo(ulpq, event);
+
+	sin = sctp_stream_in(ulpq->asoc, event->stream);
+	if (sin->pd_mode_uo && event->mid == sin->mid_uo &&
+	    event->fsn == sin->fsn_uo)
+		retval = sctp_intl_retrieve_partial_uo(ulpq, event);
+
+	if (!retval)
+		retval = sctp_intl_retrieve_reassembled_uo(ulpq, event);
+
+	return retval;
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq)
+{
+	struct sctp_stream_in *csin, *sin = NULL;
+	struct sk_buff *first_frag = NULL;
+	struct sk_buff *last_frag = NULL;
+	struct sctp_ulpevent *retval;
+	struct sk_buff *pos;
+	__u32 next_fsn = 0;
+	__u16 sid = 0;
+
+	skb_queue_walk(&ulpq->reasm_uo, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		csin = sctp_stream_in(ulpq->asoc, cevent->stream);
+		if (csin->pd_mode_uo)
+			continue;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			if (first_frag)
+				goto out;
+			first_frag = pos;
+			last_frag = pos;
+			next_fsn = 0;
+			sin = csin;
+			sid = cevent->stream;
+			sin->mid_uo = cevent->mid;
+			break;
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (!first_frag)
+				break;
+			if (cevent->stream == sid &&
+			    cevent->mid == sin->mid_uo &&
+			    cevent->fsn == next_fsn) {
+				next_fsn++;
+				last_frag = pos;
+			} else {
+				goto out;
+			}
+			break;
+		case SCTP_DATA_LAST_FRAG:
+			if (first_frag)
+				goto out;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (!first_frag)
+		return NULL;
+
+out:
+	retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+					     &ulpq->reasm_uo, first_frag,
+					     last_frag);
+	if (retval) {
+		sin->fsn_uo = next_fsn;
+		sin->pd_mode_uo = 1;
+	}
+
+	return retval;
+}
+
+static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq,
+			       struct sctp_chunk *chunk, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sk_buff_head temp;
+	int event_eor = 0;
+
+	event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp);
+	if (!event)
+		return -ENOMEM;
+
+	event->mid = ntohl(chunk->subh.idata_hdr->mid);
+	if (event->msg_flags & SCTP_DATA_FIRST_FRAG)
+		event->ppid = chunk->subh.idata_hdr->ppid;
+	else
+		event->fsn = ntohl(chunk->subh.idata_hdr->fsn);
+
+	if (!(event->msg_flags & SCTP_DATA_UNORDERED)) {
+		event = sctp_intl_reasm(ulpq, event);
+		if (event && event->msg_flags & MSG_EOR) {
+			skb_queue_head_init(&temp);
+			__skb_queue_tail(&temp, sctp_event2skb(event));
+
+			event = sctp_intl_order(ulpq, event);
+		}
+	} else {
+		event = sctp_intl_reasm_uo(ulpq, event);
+	}
+
+	if (event) {
+		event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0;
+		sctp_enqueue_event(ulpq, event);
+	}
+
+	return event_eor;
+}
+
+static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq)
+{
+	struct sctp_stream_in *csin, *sin = NULL;
+	struct sk_buff *first_frag = NULL;
+	struct sk_buff *last_frag = NULL;
+	struct sctp_ulpevent *retval;
+	struct sk_buff *pos;
+	__u32 next_fsn = 0;
+	__u16 sid = 0;
+
+	skb_queue_walk(&ulpq->reasm, pos) {
+		struct sctp_ulpevent *cevent = sctp_skb2event(pos);
+
+		csin = sctp_stream_in(ulpq->asoc, cevent->stream);
+		if (csin->pd_mode)
+			continue;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			if (first_frag)
+				goto out;
+			if (cevent->mid == csin->mid) {
+				first_frag = pos;
+				last_frag = pos;
+				next_fsn = 0;
+				sin = csin;
+				sid = cevent->stream;
+			}
+			break;
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (!first_frag)
+				break;
+			if (cevent->stream == sid &&
+			    cevent->mid == sin->mid &&
+			    cevent->fsn == next_fsn) {
+				next_fsn++;
+				last_frag = pos;
+			} else {
+				goto out;
+			}
+			break;
+		case SCTP_DATA_LAST_FRAG:
+			if (first_frag)
+				goto out;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (!first_frag)
+		return NULL;
+
+out:
+	retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+					     &ulpq->reasm, first_frag,
+					     last_frag);
+	if (retval) {
+		sin->fsn = next_fsn;
+		sin->pd_mode = 1;
+	}
+
+	return retval;
+}
+
+static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+
+	if (!skb_queue_empty(&ulpq->reasm)) {
+		do {
+			event = sctp_intl_retrieve_first(ulpq);
+			if (event)
+				sctp_enqueue_event(ulpq, event);
+		} while (event);
+	}
+
+	if (!skb_queue_empty(&ulpq->reasm_uo)) {
+		do {
+			event = sctp_intl_retrieve_first_uo(ulpq);
+			if (event)
+				sctp_enqueue_event(ulpq, event);
+		} while (event);
+	}
+}
+
+static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
+			       gfp_t gfp)
+{
+	struct sctp_association *asoc = ulpq->asoc;
+	__u32 freed = 0;
+	__u16 needed;
+
+	if (chunk) {
+		needed = ntohs(chunk->chunk_hdr->length);
+		needed -= sizeof(struct sctp_idata_chunk);
+	} else {
+		needed = SCTP_DEFAULT_MAXWINDOW;
+	}
+
+	if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) {
+		freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed);
+		if (freed < needed)
+			freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm,
+						       needed);
+		if (freed < needed)
+			freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo,
+						       needed);
+	}
+
+	if (chunk && freed >= needed)
+		if (sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0)
+			sctp_intl_start_pd(ulpq, gfp);
+
+	sk_mem_reclaim(asoc->base.sk);
+}
+
+static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid,
+				      __u32 mid, __u16 flags, gfp_t gfp)
+{
+	struct sock *sk = ulpq->asoc->base.sk;
+	struct sctp_ulpevent *ev = NULL;
+
+	if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT,
+					&sctp_sk(sk)->subscribe))
+		return;
+
+	ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED,
+				      sid, mid, flags, gfp);
+	if (ev) {
+		__skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
+
+		if (!sctp_sk(sk)->data_ready_signalled) {
+			sctp_sk(sk)->data_ready_signalled = 1;
+			sk->sk_data_ready(sk);
+		}
+	}
+}
+
+static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
+{
+	struct sctp_stream *stream = &ulpq->asoc->stream;
+	struct sctp_ulpevent *cevent, *event = NULL;
+	struct sk_buff_head *lobby = &ulpq->lobby;
+	struct sk_buff *pos, *tmp;
+	struct sk_buff_head temp;
+	__u16 csid;
+	__u32 cmid;
+
+	skb_queue_head_init(&temp);
+	sctp_skb_for_each(pos, lobby, tmp) {
+		cevent = (struct sctp_ulpevent *)pos->cb;
+		csid = cevent->stream;
+		cmid = cevent->mid;
+
+		if (csid > sid)
+			break;
+
+		if (csid < sid)
+			continue;
+
+		if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid)))
+			break;
+
+		__skb_unlink(pos, lobby);
+		if (!event)
+			event = sctp_skb2event(pos);
+
+		__skb_queue_tail(&temp, pos);
+	}
+
+	if (!event && pos != (struct sk_buff *)lobby) {
+		cevent = (struct sctp_ulpevent *)pos->cb;
+		csid = cevent->stream;
+		cmid = cevent->mid;
+
+		if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) {
+			sctp_mid_next(stream, in, csid);
+			__skb_unlink(pos, lobby);
+			__skb_queue_tail(&temp, pos);
+			event = sctp_skb2event(pos);
+		}
+	}
+
+	if (event) {
+		sctp_intl_retrieve_ordered(ulpq, event);
+		sctp_enqueue_event(ulpq, event);
+	}
+}
+
+static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
+{
+	struct sctp_stream *stream = &ulpq->asoc->stream;
+	__u16 sid;
+
+	for (sid = 0; sid < stream->incnt; sid++) {
+		struct sctp_stream_in *sin = &stream->in[sid];
+		__u32 mid;
+
+		if (sin->pd_mode_uo) {
+			sin->pd_mode_uo = 0;
+
+			mid = sin->mid_uo;
+			sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp);
+		}
+
+		if (sin->pd_mode) {
+			sin->pd_mode = 0;
+
+			mid = sin->mid;
+			sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp);
+			sctp_mid_skip(stream, in, sid, mid);
+
+			sctp_intl_reap_ordered(ulpq, sid);
+		}
+	}
+
+	/* intl abort pd happens only when all data needs to be cleaned */
+	sctp_ulpq_flush(ulpq);
+}
+
+static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist,
+				    int nskips, __be16 stream, __u8 flags)
+{
+	int i;
+
+	for (i = 0; i < nskips; i++)
+		if (skiplist[i].stream == stream &&
+		    skiplist[i].flags == flags)
+			return i;
+
+	return i;
+}
+
+#define SCTP_FTSN_U_BIT	0x1
+static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn)
+{
+	struct sctp_ifwdtsn_skip ftsn_skip_arr[10];
+	struct sctp_association *asoc = q->asoc;
+	struct sctp_chunk *ftsn_chunk = NULL;
+	struct list_head *lchunk, *temp;
+	int nskips = 0, skip_pos;
+	struct sctp_chunk *chunk;
+	__u32 tsn;
+
+	if (!asoc->peer.prsctp_capable)
+		return;
+
+	if (TSN_lt(asoc->adv_peer_ack_point, ctsn))
+		asoc->adv_peer_ack_point = ctsn;
+
+	list_for_each_safe(lchunk, temp, &q->abandoned) {
+		chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list);
+		tsn = ntohl(chunk->subh.data_hdr->tsn);
+
+		if (TSN_lte(tsn, ctsn)) {
+			list_del_init(lchunk);
+			sctp_chunk_free(chunk);
+		} else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) {
+			__be16 sid = chunk->subh.idata_hdr->stream;
+			__be32 mid = chunk->subh.idata_hdr->mid;
+			__u8 flags = 0;
+
+			if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+				flags |= SCTP_FTSN_U_BIT;
+
+			asoc->adv_peer_ack_point = tsn;
+			skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips,
+						     sid, flags);
+			ftsn_skip_arr[skip_pos].stream = sid;
+			ftsn_skip_arr[skip_pos].reserved = 0;
+			ftsn_skip_arr[skip_pos].flags = flags;
+			ftsn_skip_arr[skip_pos].mid = mid;
+			if (skip_pos == nskips)
+				nskips++;
+			if (nskips == 10)
+				break;
+		} else {
+			break;
+		}
+	}
+
+	if (asoc->adv_peer_ack_point > ctsn)
+		ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point,
+					       nskips, &ftsn_skip_arr[0]);
+
+	if (ftsn_chunk) {
+		list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
+		SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS);
+	}
+}
+
+#define _sctp_walk_ifwdtsn(pos, chunk, end) \
+	for (pos = chunk->subh.ifwdtsn_hdr->skip; \
+	     (void *)pos < (void *)chunk->subh.ifwdtsn_hdr->skip + (end); pos++)
+
+#define sctp_walk_ifwdtsn(pos, ch) \
+	_sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \
+					sizeof(struct sctp_ifwdtsn_chunk))
+
+static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk)
+{
+	struct sctp_fwdtsn_skip *skip;
+	__u16 incnt;
+
+	if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN)
+		return false;
+
+	incnt = chunk->asoc->stream.incnt;
+	sctp_walk_fwdtsn(skip, chunk)
+		if (ntohs(skip->stream) >= incnt)
+			return false;
+
+	return true;
+}
+
+static bool sctp_validate_iftsn(struct sctp_chunk *chunk)
+{
+	struct sctp_ifwdtsn_skip *skip;
+	__u16 incnt;
+
+	if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN)
+		return false;
+
+	incnt = chunk->asoc->stream.incnt;
+	sctp_walk_ifwdtsn(skip, chunk)
+		if (ntohs(skip->stream) >= incnt)
+			return false;
+
+	return true;
+}
+
+static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn)
+{
+	/* Move the Cumulattive TSN Ack ahead. */
+	sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn);
+	/* purge the fragmentation queue */
+	sctp_ulpq_reasm_flushtsn(ulpq, ftsn);
+	/* Abort any in progress partial delivery. */
+	sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC);
+}
+
+static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn)
+{
+	struct sk_buff *pos, *tmp;
+
+	skb_queue_walk_safe(&ulpq->reasm, pos, tmp) {
+		struct sctp_ulpevent *event = sctp_skb2event(pos);
+		__u32 tsn = event->tsn;
+
+		if (TSN_lte(tsn, ftsn)) {
+			__skb_unlink(pos, &ulpq->reasm);
+			sctp_ulpevent_free(event);
+		}
+	}
+
+	skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) {
+		struct sctp_ulpevent *event = sctp_skb2event(pos);
+		__u32 tsn = event->tsn;
+
+		if (TSN_lte(tsn, ftsn)) {
+			__skb_unlink(pos, &ulpq->reasm_uo);
+			sctp_ulpevent_free(event);
+		}
+	}
+}
+
+static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn)
+{
+	/* Move the Cumulattive TSN Ack ahead. */
+	sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn);
+	/* purge the fragmentation queue */
+	sctp_intl_reasm_flushtsn(ulpq, ftsn);
+	/* abort only when it's for all data */
+	if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map))
+		sctp_intl_abort_pd(ulpq, GFP_ATOMIC);
+}
+
+static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk)
+{
+	struct sctp_fwdtsn_skip *skip;
+
+	/* Walk through all the skipped SSNs */
+	sctp_walk_fwdtsn(skip, chunk)
+		sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn));
+}
+
+static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid,
+			   __u8 flags)
+{
+	struct sctp_stream_in *sin = sctp_stream_in(ulpq->asoc, sid);
+	struct sctp_stream *stream  = &ulpq->asoc->stream;
+
+	if (flags & SCTP_FTSN_U_BIT) {
+		if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) {
+			sin->pd_mode_uo = 0;
+			sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1,
+						  GFP_ATOMIC);
+		}
+		return;
+	}
+
+	if (MID_lt(mid, sctp_mid_peek(stream, in, sid)))
+		return;
+
+	if (sin->pd_mode) {
+		sin->pd_mode = 0;
+		sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC);
+	}
+
+	sctp_mid_skip(stream, in, sid, mid);
+
+	sctp_intl_reap_ordered(ulpq, sid);
+}
+
+static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk)
+{
+	struct sctp_ifwdtsn_skip *skip;
+
+	/* Walk through all the skipped MIDs and abort stream pd if possible */
+	sctp_walk_ifwdtsn(skip, chunk)
+		sctp_intl_skip(ulpq, ntohs(skip->stream),
+			       ntohl(skip->mid), skip->flags);
+}
+
+static struct sctp_stream_interleave sctp_stream_interleave_0 = {
+	.data_chunk_len		= sizeof(struct sctp_data_chunk),
+	.ftsn_chunk_len		= sizeof(struct sctp_fwdtsn_chunk),
+	/* DATA process functions */
+	.make_datafrag		= sctp_make_datafrag_empty,
+	.assign_number		= sctp_chunk_assign_ssn,
+	.validate_data		= sctp_validate_data,
+	.ulpevent_data		= sctp_ulpq_tail_data,
+	.enqueue_event		= sctp_ulpq_tail_event,
+	.renege_events		= sctp_ulpq_renege,
+	.start_pd		= sctp_ulpq_partial_delivery,
+	.abort_pd		= sctp_ulpq_abort_pd,
+	/* FORWARD-TSN process functions */
+	.generate_ftsn		= sctp_generate_fwdtsn,
+	.validate_ftsn		= sctp_validate_fwdtsn,
+	.report_ftsn		= sctp_report_fwdtsn,
+	.handle_ftsn		= sctp_handle_fwdtsn,
+};
+
+static struct sctp_stream_interleave sctp_stream_interleave_1 = {
+	.data_chunk_len		= sizeof(struct sctp_idata_chunk),
+	.ftsn_chunk_len		= sizeof(struct sctp_ifwdtsn_chunk),
+	/* I-DATA process functions */
+	.make_datafrag		= sctp_make_idatafrag_empty,
+	.assign_number		= sctp_chunk_assign_mid,
+	.validate_data		= sctp_validate_idata,
+	.ulpevent_data		= sctp_ulpevent_idata,
+	.enqueue_event		= sctp_enqueue_event,
+	.renege_events		= sctp_renege_events,
+	.start_pd		= sctp_intl_start_pd,
+	.abort_pd		= sctp_intl_abort_pd,
+	/* I-FORWARD-TSN process functions */
+	.generate_ftsn		= sctp_generate_iftsn,
+	.validate_ftsn		= sctp_validate_iftsn,
+	.report_ftsn		= sctp_report_iftsn,
+	.handle_ftsn		= sctp_handle_iftsn,
+};
+
+void sctp_stream_interleave_init(struct sctp_stream *stream)
+{
+	struct sctp_association *asoc;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	stream->si = asoc->intl_enable ? &sctp_stream_interleave_1
+				       : &sctp_stream_interleave_0;
+}
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index d8c162a4089c..f5fcd425232a 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -242,7 +242,8 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
 
 void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch)
 {
-	if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) {
+	if (!list_is_last(&ch->frag_list, &ch->msg->chunks) &&
+	    !q->asoc->intl_enable) {
 		struct sctp_stream_out *sout;
 		__u16 sid;
 
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index ef7ca44d6e6a..33ca5b73cdb3 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -289,6 +289,13 @@ static struct ctl_table sctp_net_table[] = {
 		.proc_handler	= proc_sctp_do_auth,
 	},
 	{
+		.procname	= "intl_enable",
+		.data		= &init_net.sctp.intl_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "addr_scope_policy",
 		.data		= &init_net.sctp.scope_policy,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 5447228bf1a0..84207ad33e8e 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -443,8 +443,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
 		goto fail;
 
 	/* Pull off the common chunk header and DATA header.  */
-	skb_pull(skb, sizeof(struct sctp_data_chunk));
-	len -= sizeof(struct sctp_data_chunk);
+	skb_pull(skb, sctp_datachk_len(&asoc->stream));
+	len -= sctp_datachk_len(&asoc->stream);
 
 	/* Embed the event fields inside the cloned skb.  */
 	event = sctp_skb2event(skb);
@@ -705,8 +705,6 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
 	sctp_ulpevent_receive_data(event, asoc);
 
 	event->stream = ntohs(chunk->subh.data_hdr->stream);
-	event->ssn = ntohs(chunk->subh.data_hdr->ssn);
-	event->ppid = chunk->subh.data_hdr->ppid;
 	if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
 		event->flags |= SCTP_UNORDERED;
 		event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
@@ -732,8 +730,9 @@ fail:
  *   various events.
  */
 struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
-	const struct sctp_association *asoc, __u32 indication,
-	gfp_t gfp)
+					const struct sctp_association *asoc,
+					__u32 indication, __u32 sid, __u32 seq,
+					__u32 flags, gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_pdapi_event *pd;
@@ -754,7 +753,9 @@ struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
 	 *   Currently unused.
 	 */
 	pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT;
-	pd->pdapi_flags = 0;
+	pd->pdapi_flags = flags;
+	pd->pdapi_stream = sid;
+	pd->pdapi_seq = seq;
 
 	/* pdapi_length: 32 bits (unsigned integer)
 	 *
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index a71be33f3afe..97fae53310e0 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -60,6 +60,7 @@ struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq,
 
 	ulpq->asoc = asoc;
 	skb_queue_head_init(&ulpq->reasm);
+	skb_queue_head_init(&ulpq->reasm_uo);
 	skb_queue_head_init(&ulpq->lobby);
 	ulpq->pd_mode  = 0;
 
@@ -83,6 +84,10 @@ void sctp_ulpq_flush(struct sctp_ulpq *ulpq)
 		sctp_ulpevent_free(event);
 	}
 
+	while ((skb = __skb_dequeue(&ulpq->reasm_uo)) != NULL) {
+		event = sctp_skb2event(skb);
+		sctp_ulpevent_free(event);
+	}
 }
 
 /* Dispose of a ulpqueue.  */
@@ -104,6 +109,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 	if (!event)
 		return -ENOMEM;
 
+	event->ssn = ntohs(chunk->subh.data_hdr->ssn);
+	event->ppid = chunk->subh.data_hdr->ppid;
+
 	/* Do reassembly if needed.  */
 	event = sctp_ulpq_reasm(ulpq, event);
 
@@ -328,9 +336,10 @@ static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq,
  * payload was fragmented on the way and ip had to reassemble them.
  * We add the rest of skb's to the first skb's fraglist.
  */
-static struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net,
-	struct sk_buff_head *queue, struct sk_buff *f_frag,
-	struct sk_buff *l_frag)
+struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net,
+						  struct sk_buff_head *queue,
+						  struct sk_buff *f_frag,
+						  struct sk_buff *l_frag)
 {
 	struct sk_buff *pos;
 	struct sk_buff *new = NULL;
@@ -853,7 +862,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
 	struct sctp_stream *stream;
 
 	/* Check if this message needs ordering.  */
-	if (SCTP_DATA_UNORDERED & event->msg_flags)
+	if (event->msg_flags & SCTP_DATA_UNORDERED)
 		return event;
 
 	/* Note: The stream ID must be verified before this routine.  */
@@ -974,8 +983,8 @@ void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
 	sctp_ulpq_reap_ordered(ulpq, sid);
 }
 
-static __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq,
-		struct sk_buff_head *list, __u16 needed)
+__u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, struct sk_buff_head *list,
+			    __u16 needed)
 {
 	__u16 freed = 0;
 	__u32 tsn, last_tsn;
@@ -1140,7 +1149,7 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
 				       &sctp_sk(sk)->subscribe))
 		ev = sctp_ulpevent_make_pdapi(ulpq->asoc,
 					      SCTP_PARTIAL_DELIVERY_ABORTED,
-					      gfp);
+					      0, 0, 0, gfp);
 	if (ev)
 		__skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
 
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 6451c5013e06..daf8075f5a4c 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -520,7 +520,7 @@ decline_rdma:
 	smc->use_fallback = true;
 	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
 		rc = smc_clc_send_decline(smc, reason_code);
-		if (rc < sizeof(struct smc_clc_msg_decline))
+		if (rc < 0)
 			goto out_err;
 	}
 	goto out_connected;
@@ -751,14 +751,16 @@ static void smc_listen_work(struct work_struct *work)
 {
 	struct smc_sock *new_smc = container_of(work, struct smc_sock,
 						smc_listen_work);
+	struct smc_clc_msg_proposal_prefix *pclc_prfx;
 	struct socket *newclcsock = new_smc->clcsock;
 	struct smc_sock *lsmc = new_smc->listen_smc;
 	struct smc_clc_msg_accept_confirm cclc;
 	int local_contact = SMC_REUSE_CONTACT;
 	struct sock *newsmcsk = &new_smc->sk;
-	struct smc_clc_msg_proposal pclc;
+	struct smc_clc_msg_proposal *pclc;
 	struct smc_ib_device *smcibdev;
 	struct sockaddr_in peeraddr;
+	u8 buf[SMC_CLC_MAX_LEN];
 	struct smc_link *link;
 	int reason_code = 0;
 	int rc = 0, len;
@@ -775,7 +777,7 @@ static void smc_listen_work(struct work_struct *work)
 	/* do inband token exchange -
 	 *wait for and receive SMC Proposal CLC message
 	 */
-	reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
+	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
 				       SMC_CLC_PROPOSAL);
 	if (reason_code < 0)
 		goto out_err;
@@ -804,8 +806,11 @@ static void smc_listen_work(struct work_struct *work)
 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 		goto decline_rdma;
 	}
-	if ((pclc.outgoing_subnet != subnet) ||
-	    (pclc.prefix_len != prefix_len)) {
+
+	pclc = (struct smc_clc_msg_proposal *)&buf;
+	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+	if (pclc_prfx->outgoing_subnet != subnet ||
+	    pclc_prfx->prefix_len != prefix_len) {
 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 		goto decline_rdma;
 	}
@@ -816,7 +821,7 @@ static void smc_listen_work(struct work_struct *work)
 	/* allocate connection / link group */
 	mutex_lock(&smc_create_lgr_pending);
 	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
-					smcibdev, ibport, &pclc.lcl, 0);
+					smcibdev, ibport, &pclc->lcl, 0);
 	if (local_contact < 0) {
 		rc = local_contact;
 		if (rc == -ENOMEM)
@@ -879,11 +884,9 @@ static void smc_listen_work(struct work_struct *work)
 		}
 		/* QP confirmation over RoCE fabric */
 		reason_code = smc_serv_conf_first_link(new_smc);
-		if (reason_code < 0) {
+		if (reason_code < 0)
 			/* peer is not aware of a problem */
-			rc = reason_code;
 			goto out_err_unlock;
-		}
 		if (reason_code > 0)
 			goto decline_rdma_unlock;
 	}
@@ -916,8 +919,7 @@ decline_rdma:
 	smc_conn_free(&new_smc->conn);
 	new_smc->use_fallback = true;
 	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
-		rc = smc_clc_send_decline(new_smc, reason_code);
-		if (rc < sizeof(struct smc_clc_msg_decline))
+		if (smc_clc_send_decline(new_smc, reason_code) < 0)
 			goto out_err;
 	}
 	goto out_connected;
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 87f7bede6eab..d4155ff6acde 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -213,6 +213,9 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
 		smp_mb__after_atomic();
 		smc->sk.sk_data_ready(&smc->sk);
+	} else if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
+		   (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) {
+		smc->sk.sk_data_ready(&smc->sk);
 	}
 
 	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
@@ -234,15 +237,6 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		/* trigger socket release if connection closed */
 		smc_close_wake_tx_prepared(smc);
 	}
-
-	/* socket connected but not accepted */
-	if (!smc->sk.sk_socket)
-		return;
-
-	/* data available */
-	if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
-	    (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
-		smc_tx_consumer_update(conn);
 }
 
 /* called under tasklet context */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 1800e16b2a02..abf7ceb6690b 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -22,6 +22,54 @@
 #include "smc_clc.h"
 #include "smc_ib.h"
 
+/* check if received message has a correct header length and contains valid
+ * heading and trailing eyecatchers
+ */
+static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
+{
+	struct smc_clc_msg_proposal_prefix *pclc_prfx;
+	struct smc_clc_msg_accept_confirm *clc;
+	struct smc_clc_msg_proposal *pclc;
+	struct smc_clc_msg_decline *dclc;
+	struct smc_clc_msg_trail *trl;
+
+	if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
+		return false;
+	switch (clcm->type) {
+	case SMC_CLC_PROPOSAL:
+		pclc = (struct smc_clc_msg_proposal *)clcm;
+		pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+		if (ntohs(pclc->hdr.length) !=
+			sizeof(*pclc) + ntohs(pclc->iparea_offset) +
+			sizeof(*pclc_prfx) +
+			pclc_prfx->ipv6_prefixes_cnt *
+				sizeof(struct smc_clc_ipv6_prefix) +
+			sizeof(*trl))
+			return false;
+		trl = (struct smc_clc_msg_trail *)
+			((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl));
+		break;
+	case SMC_CLC_ACCEPT:
+	case SMC_CLC_CONFIRM:
+		clc = (struct smc_clc_msg_accept_confirm *)clcm;
+		if (ntohs(clc->hdr.length) != sizeof(*clc))
+			return false;
+		trl = &clc->trl;
+		break;
+	case SMC_CLC_DECLINE:
+		dclc = (struct smc_clc_msg_decline *)clcm;
+		if (ntohs(dclc->hdr.length) != sizeof(*dclc))
+			return false;
+		trl = &dclc->trl;
+		break;
+	default:
+		return false;
+	}
+	if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
+		return false;
+	return true;
+}
+
 /* Wait for data on the tcp-socket, analyze received data
  * Returns:
  * 0 if success and it was not a decline that we received.
@@ -72,9 +120,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	}
 	datlen = ntohs(clcm->length);
 	if ((len < sizeof(struct smc_clc_msg_hdr)) ||
-	    (datlen < sizeof(struct smc_clc_msg_decline)) ||
-	    (datlen > sizeof(struct smc_clc_msg_accept_confirm)) ||
-	    memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
+	    (datlen > buflen) ||
 	    ((clcm->type != SMC_CLC_DECLINE) &&
 	     (clcm->type != expected_type))) {
 		smc->sk.sk_err = EPROTO;
@@ -89,7 +135,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	krflags = MSG_WAITALL;
 	smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
 	len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
-	if (len < datlen) {
+	if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) {
 		smc->sk.sk_err = EPROTO;
 		reason_code = -EPROTO;
 		goto out;
@@ -133,7 +179,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
 		smc->sk.sk_err = EPROTO;
 	if (len < 0)
 		smc->sk.sk_err = -len;
-	return len;
+	return sock_error(&smc->sk);
 }
 
 /* send CLC PROPOSAL message across internal TCP socket */
@@ -141,33 +187,43 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 			  struct smc_ib_device *smcibdev,
 			  u8 ibport)
 {
+	struct smc_clc_msg_proposal_prefix pclc_prfx;
 	struct smc_clc_msg_proposal pclc;
+	struct smc_clc_msg_trail trl;
 	int reason_code = 0;
+	struct kvec vec[3];
 	struct msghdr msg;
-	struct kvec vec;
-	int len, rc;
+	int len, plen, rc;
 
 	/* send SMC Proposal CLC message */
+	plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl);
 	memset(&pclc, 0, sizeof(pclc));
 	memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	pclc.hdr.type = SMC_CLC_PROPOSAL;
-	pclc.hdr.length = htons(sizeof(pclc));
+	pclc.hdr.length = htons(plen);
 	pclc.hdr.version = SMC_CLC_V1;		/* SMC version */
 	memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
 	memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
 	memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
+	pclc.iparea_offset = htons(0);
 
+	memset(&pclc_prfx, 0, sizeof(pclc_prfx));
 	/* determine subnet and mask from internal TCP socket */
-	rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
-				  &pclc.prefix_len);
+	rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet,
+				  &pclc_prfx.prefix_len);
 	if (rc)
 		return SMC_CLC_DECL_CNFERR; /* configuration error */
-	memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	pclc_prfx.ipv6_prefixes_cnt = 0;
+	memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	memset(&msg, 0, sizeof(msg));
-	vec.iov_base = &pclc;
-	vec.iov_len = sizeof(pclc);
+	vec[0].iov_base = &pclc;
+	vec[0].iov_len = sizeof(pclc);
+	vec[1].iov_base = &pclc_prfx;
+	vec[1].iov_len = sizeof(pclc_prfx);
+	vec[2].iov_base = &trl;
+	vec[2].iov_len = sizeof(trl);
 	/* due to the few bytes needed for clc-handshake this cannot block */
-	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
+	len = kernel_sendmsg(smc->clcsock, &msg, vec, 3, plen);
 	if (len < sizeof(pclc)) {
 		if (len >= 0) {
 			reason_code = -ENETUNREACH;
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 12a9af1539a2..c145a0f36a68 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -44,7 +44,7 @@ struct smc_clc_msg_hdr {	/* header1 of clc messages */
 #if defined(__BIG_ENDIAN_BITFIELD)
 	u8 version : 4,
 	   flag    : 1,
-	   rsvd	   : 3;
+	   rsvd    : 3;
 #elif defined(__LITTLE_ENDIAN_BITFIELD)
 	u8 rsvd    : 3,
 	   flag    : 1,
@@ -62,17 +62,31 @@ struct smc_clc_msg_local {	/* header2 of clc messages */
 	u8 mac[6];		/* mac of ib_device port */
 };
 
-struct smc_clc_msg_proposal {	/* clc proposal message */
-	struct smc_clc_msg_hdr hdr;
-	struct smc_clc_msg_local lcl;
-	__be16 iparea_offset;	/* offset to IP address information area */
+struct smc_clc_ipv6_prefix {
+	u8 prefix[4];
+	u8 prefix_len;
+} __packed;
+
+struct smc_clc_msg_proposal_prefix {	/* prefix part of clc proposal message*/
 	__be32 outgoing_subnet;	/* subnet mask */
 	u8 prefix_len;		/* number of significant bits in mask */
 	u8 reserved[2];
 	u8 ipv6_prefixes_cnt;	/* number of IPv6 prefixes in prefix array */
-	struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
 } __aligned(4);
 
+struct smc_clc_msg_proposal {	/* clc proposal message sent by Linux */
+	struct smc_clc_msg_hdr hdr;
+	struct smc_clc_msg_local lcl;
+	__be16 iparea_offset;	/* offset to IP address information area */
+} __aligned(4);
+
+#define SMC_CLC_PROPOSAL_MAX_OFFSET	0x28
+#define SMC_CLC_PROPOSAL_MAX_PREFIX	(8 * sizeof(struct smc_clc_ipv6_prefix))
+#define SMC_CLC_MAX_LEN		(sizeof(struct smc_clc_msg_proposal) + \
+				 SMC_CLC_PROPOSAL_MAX_OFFSET + \
+				 SMC_CLC_PROPOSAL_MAX_PREFIX + \
+				 sizeof(struct smc_clc_msg_trail))
+
 struct smc_clc_msg_accept_confirm {	/* clc accept / confirm message */
 	struct smc_clc_msg_hdr hdr;
 	struct smc_clc_msg_local lcl;
@@ -102,6 +116,14 @@ struct smc_clc_msg_decline {	/* clc decline message */
 	struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
 } __aligned(4);
 
+/* determine start of the prefix area within the proposal message */
+static inline struct smc_clc_msg_proposal_prefix *
+smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
+{
+	return (struct smc_clc_msg_proposal_prefix *)
+	       ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
+}
+
 struct smc_sock;
 struct smc_ib_device;
 
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 48615d2ac4aa..e194c6cc308a 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -113,7 +113,7 @@ static int smc_close_abort(struct smc_connection *conn)
 /* terminate smc socket abnormally - active abort
  * RDMA communication no longer possible
  */
-void smc_close_active_abort(struct smc_sock *smc)
+static void smc_close_active_abort(struct smc_sock *smc)
 {
 	struct smc_cdc_conn_state_flags *txflags =
 		&smc->conn.local_tx_ctrl.conn_state_flags;
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
index ed82506b1b0a..8c498885d758 100644
--- a/net/smc/smc_close.h
+++ b/net/smc/smc_close.h
@@ -20,7 +20,6 @@
 #define SMC_CLOSE_SOCK_PUT_DELAY		HZ
 
 void smc_close_wake_tx_prepared(struct smc_sock *smc);
-void smc_close_active_abort(struct smc_sock *smc);
 int smc_close_active(struct smc_sock *smc);
 void smc_close_sock_put_work(struct work_struct *work);
 int smc_close_shutdown_write(struct smc_sock *smc);
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index cbf58637ee14..9dc392ca06bf 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -65,7 +65,6 @@ static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
 	rc = sk_wait_event(sk, timeo,
 			   sk->sk_err ||
 			   sk->sk_shutdown & RCV_SHUTDOWN ||
-			   sock_flag(sk, SOCK_DONE) ||
 			   atomic_read(&conn->bytes_to_rcv) ||
 			   smc_cdc_rxed_any_close_or_senddone(conn),
 			   &wait);
@@ -116,7 +115,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
 		if (read_done) {
 			if (sk->sk_err ||
 			    sk->sk_state == SMC_CLOSED ||
-			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+			    sk->sk_shutdown & RCV_SHUTDOWN ||
 			    !timeo ||
 			    signal_pending(current) ||
 			    smc_cdc_rxed_any_close_or_senddone(conn) ||
@@ -124,8 +123,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
 			    peer_conn_abort)
 				break;
 		} else {
-			if (sock_flag(sk, SOCK_DONE))
-				break;
 			if (sk->sk_err) {
 				read_done = sock_error(sk);
 				break;
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index c48dc2d5fd3a..2e50fddf8ce9 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -104,14 +104,12 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
 		if (atomic_read(&conn->sndbuf_space))
 			break; /* at least 1 byte of free space available */
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		sk->sk_write_pending++;
 		sk_wait_event(sk, &timeo,
 			      sk->sk_err ||
 			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
 			      smc_cdc_rxed_any_close_or_senddone(conn) ||
 			      atomic_read(&conn->sndbuf_space),
 			      &wait);
-		sk->sk_write_pending--;
 	}
 	remove_wait_queue(sk_sleep(sk), &wait);
 	return rc;
@@ -450,9 +448,7 @@ static void smc_tx_work(struct work_struct *work)
 void smc_tx_consumer_update(struct smc_connection *conn)
 {
 	union smc_host_cursor cfed, cons;
-	struct smc_cdc_tx_pend *pend;
-	struct smc_wr_buf *wr_buf;
-	int to_confirm, rc;
+	int to_confirm;
 
 	smc_curs_write(&cons,
 		       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
@@ -466,10 +462,7 @@ void smc_tx_consumer_update(struct smc_connection *conn)
 	    ((to_confirm > conn->rmbe_update_limit) &&
 	     ((to_confirm > (conn->rmbe_size / 2)) ||
 	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
-		rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
-		if (!rc)
-			rc = smc_cdc_msg_send(conn, wr_buf, pend);
-		if (rc < 0) {
+		if (smc_cdc_get_slot_and_msg_send(conn) < 0) {
 			schedule_delayed_work(&conn->tx_work,
 					      SMC_TX_WORK_DELAY);
 			return;
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 329325bd553e..37892b3909af 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -1,7 +1,7 @@
 /*
  * net/tipc/bcast.c: TIPC broadcast code
  *
- * Copyright (c) 2004-2006, 2014-2016, Ericsson AB
+ * Copyright (c) 2004-2006, 2014-2017, Ericsson AB
  * Copyright (c) 2004, Intel Corporation.
  * Copyright (c) 2005, 2010-2011, Wind River Systems
  * All rights reserved.
@@ -42,8 +42,8 @@
 #include "link.h"
 #include "name_table.h"
 
-#define	BCLINK_WIN_DEFAULT	50	/* bcast link window size (default) */
-#define	BCLINK_WIN_MIN	        32	/* bcast minimum link window size */
+#define BCLINK_WIN_DEFAULT  50	/* bcast link window size (default) */
+#define BCLINK_WIN_MIN      32	/* bcast minimum link window size */
 
 const char tipc_bclink_name[] = "broadcast-link";
 
@@ -74,6 +74,10 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net)
 	return tipc_net(net)->bcbase;
 }
 
+/* tipc_bcast_get_mtu(): -get the MTU currently used by broadcast link
+ * Note: the MTU is decremented to give room for a tunnel header, in
+ * case the message needs to be sent as replicast
+ */
 int tipc_bcast_get_mtu(struct net *net)
 {
 	return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE;
@@ -515,7 +519,7 @@ int tipc_bcast_init(struct net *net)
 	spin_lock_init(&tipc_net(net)->bclock);
 
 	if (!tipc_link_bc_create(net, 0, 0,
-				 U16_MAX,
+				 FB_MTU,
 				 BCLINK_WIN_DEFAULT,
 				 0,
 				 &bb->inputq,
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 6bce0b1117bd..2d6b2aed30e0 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -483,7 +483,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 /**
  * tipc_link_bc_create - create new link to be used for broadcast
  * @n: pointer to associated node
- * @mtu: mtu to be used
+ * @mtu: mtu to be used initially if no peers
  * @window: send window to be used
  * @inputq: queue to put messages ready for delivery
  * @namedq: queue to put binding table update messages ready for delivery
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index b0d07b35909d..55d8ba92291d 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -251,20 +251,23 @@ bool tipc_msg_validate(struct sk_buff **_skb)
  * @pktmax: Max packet size that can be used
  * @list: Buffer or chain of buffers to be returned to caller
  *
+ * Note that the recursive call we are making here is safe, since it can
+ * logically go only one further level down.
+ *
  * Returns message data size or errno: -ENOMEM, -EFAULT
  */
-int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
-		   int offset, int dsz, int pktmax, struct sk_buff_head *list)
+int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset,
+		   int dsz, int pktmax, struct sk_buff_head *list)
 {
 	int mhsz = msg_hdr_sz(mhdr);
+	struct tipc_msg pkthdr;
 	int msz = mhsz + dsz;
-	int pktno = 1;
-	int pktsz;
 	int pktrem = pktmax;
-	int drem = dsz;
-	struct tipc_msg pkthdr;
 	struct sk_buff *skb;
+	int drem = dsz;
+	int pktno = 1;
 	char *pktpos;
+	int pktsz;
 	int rc;
 
 	msg_set_size(mhdr, msz);
@@ -272,8 +275,18 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
 	/* No fragmentation needed? */
 	if (likely(msz <= pktmax)) {
 		skb = tipc_buf_acquire(msz, GFP_KERNEL);
-		if (unlikely(!skb))
+
+		/* Fall back to smaller MTU if node local message */
+		if (unlikely(!skb)) {
+			if (pktmax != MAX_MSG_SIZE)
+				return -ENOMEM;
+			rc = tipc_msg_build(mhdr, m, offset, dsz, FB_MTU, list);
+			if (rc != dsz)
+				return rc;
+			if (tipc_msg_assemble(list))
+				return dsz;
 			return -ENOMEM;
+		}
 		skb_orphan(skb);
 		__skb_queue_tail(list, skb);
 		skb_copy_to_linear_data(skb, mhdr, mhsz);
@@ -589,6 +602,30 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
 	return true;
 }
 
+/* tipc_msg_assemble() - assemble chain of fragments into one message
+ */
+bool tipc_msg_assemble(struct sk_buff_head *list)
+{
+	struct sk_buff *skb, *tmp = NULL;
+
+	if (skb_queue_len(list) == 1)
+		return true;
+
+	while ((skb = __skb_dequeue(list))) {
+		skb->next = NULL;
+		if (tipc_buf_append(&tmp, &skb)) {
+			__skb_queue_tail(list, skb);
+			return true;
+		}
+		if (!tmp)
+			break;
+	}
+	__skb_queue_purge(list);
+	__skb_queue_head_init(list);
+	pr_warn("Failed do assemble buffer\n");
+	return false;
+}
+
 /* tipc_msg_reassemble() - clone a buffer chain of fragments and
  *                         reassemble the clones into one message
  */
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 3e4384c222f7..b4ba1b4f9ae7 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -98,7 +98,7 @@ struct plist;
 #define MAX_H_SIZE                60	/* Largest possible TIPC header size */
 
 #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE)
-
+#define FB_MTU                  3744
 #define TIPC_MEDIA_INFO_OFFSET	5
 
 struct tipc_skb_cb {
@@ -943,6 +943,7 @@ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos);
 int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
 		   int offset, int dsz, int mtu, struct sk_buff_head *list);
 bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
+bool tipc_msg_assemble(struct sk_buff_head *list);
 bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
 bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
 			struct sk_buff_head *cpy);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 41127d0b925e..0cdf5c2ad881 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2640,9 +2640,7 @@ void tipc_sk_reinit(struct net *net)
 	rhashtable_walk_enter(&tn->sk_rht, &iter);
 
 	do {
-		tsk = ERR_PTR(rhashtable_walk_start(&iter));
-		if (IS_ERR(tsk))
-			goto walk_stop;
+		rhashtable_walk_start(&iter);
 
 		while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
 			spin_lock_bh(&tsk->sk.sk_lock.slock);
@@ -2651,7 +2649,7 @@ void tipc_sk_reinit(struct net *net)
 			msg_set_orignode(msg, tn->own_addr);
 			spin_unlock_bh(&tsk->sk.sk_lock.slock);
 		}
-walk_stop:
+
 		rhashtable_walk_stop(&iter);
 	} while (tsk == ERR_PTR(-EAGAIN));
 }
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 30e5746085b8..00641b611aed 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -67,7 +67,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 
 	/* We don't yet support UDP encapsulation, TFC padding and ESN. */
 	if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN))
-		return 0;
+		return -EINVAL;
 
 	dev = dev_get_by_index(net, xuo->ifindex);
 	if (!dev) {
@@ -120,8 +120,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 	if (!x->type_offload || x->encap)
 		return false;
 
-	if ((x->xso.offload_handle && (dev == dst->path->dev)) &&
-	     !dst->child->xfrm && x->type->get_mtu) {
+	if ((x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev)) &&
+	     !xdst->child->xfrm && x->type->get_mtu) {
 		mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
 
 		if (skb->len <= mtu)
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 347ab31574d5..ac277b97e0d7 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -231,7 +231,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 
 		if (xo && (xo->flags & CRYPTO_DONE)) {
 			crypto_done = true;
-			x = xfrm_input_state(skb);
 			family = XFRM_SPI_SKB_CB(skb)->family;
 
 			if (!(xo->status & CRYPTO_SUCCESS)) {
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 73ad8c8ef344..23468672a767 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -44,7 +44,7 @@ static int xfrm_skb_check_space(struct sk_buff *skb)
 
 static struct dst_entry *skb_dst_pop(struct sk_buff *skb)
 {
-	struct dst_entry *child = dst_clone(skb_dst(skb)->child);
+	struct dst_entry *child = dst_clone(xfrm_dst_child(skb_dst(skb)));
 
 	skb_dst_drop(skb);
 	return child;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 9542975eb2f9..e3a5aca9cdda 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -54,7 +54,7 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
 static __read_mostly seqcount_t xfrm_policy_hash_generation;
 
-static void xfrm_init_pmtu(struct dst_entry *dst);
+static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
 static int stale_bundle(struct dst_entry *dst);
 static int xfrm_bundle_ok(struct xfrm_dst *xdst);
 static void xfrm_policy_queue_process(struct timer_list *t);
@@ -1251,7 +1251,7 @@ EXPORT_SYMBOL(xfrm_policy_delete);
 
 int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 {
-	struct net *net = xp_net(pol);
+	struct net *net = sock_net(sk);
 	struct xfrm_policy *old_pol;
 
 #ifdef CONFIG_XFRM_SUB_POLICY
@@ -1538,7 +1538,9 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
  */
 
 static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
-					    struct xfrm_state **xfrm, int nx,
+					    struct xfrm_state **xfrm,
+					    struct xfrm_dst **bundle,
+					    int nx,
 					    const struct flowi *fl,
 					    struct dst_entry *dst)
 {
@@ -1546,8 +1548,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 	unsigned long now = jiffies;
 	struct net_device *dev;
 	struct xfrm_mode *inner_mode;
-	struct dst_entry *dst_prev = NULL;
-	struct dst_entry *dst0 = NULL;
+	struct xfrm_dst *xdst_prev = NULL;
+	struct xfrm_dst *xdst0 = NULL;
 	int i = 0;
 	int err;
 	int header_len = 0;
@@ -1573,13 +1575,14 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 			goto put_states;
 		}
 
-		if (!dst_prev)
-			dst0 = dst1;
+		bundle[i] = xdst;
+		if (!xdst_prev)
+			xdst0 = xdst;
 		else
 			/* Ref count is taken during xfrm_alloc_dst()
 			 * No need to do dst_clone() on dst1
 			 */
-			dst_prev->child = dst1;
+			xfrm_dst_set_child(xdst_prev, &xdst->u.dst);
 
 		if (xfrm[i]->sel.family == AF_UNSPEC) {
 			inner_mode = xfrm_ip2inner_mode(xfrm[i],
@@ -1616,8 +1619,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		dst1->input = dst_discard;
 		dst1->output = inner_mode->afinfo->output;
 
-		dst1->next = dst_prev;
-		dst_prev = dst1;
+		xdst_prev = xdst;
 
 		header_len += xfrm[i]->props.header_len;
 		if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
@@ -1625,40 +1627,39 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		trailer_len += xfrm[i]->props.trailer_len;
 	}
 
-	dst_prev->child = dst;
-	dst0->path = dst;
+	xfrm_dst_set_child(xdst_prev, dst);
+	xdst0->path = dst;
 
 	err = -ENODEV;
 	dev = dst->dev;
 	if (!dev)
 		goto free_dst;
 
-	xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len);
-	xfrm_init_pmtu(dst_prev);
-
-	for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) {
-		struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev;
+	xfrm_init_path(xdst0, dst, nfheader_len);
+	xfrm_init_pmtu(bundle, nx);
 
-		err = xfrm_fill_dst(xdst, dev, fl);
+	for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst;
+	     xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) {
+		err = xfrm_fill_dst(xdst_prev, dev, fl);
 		if (err)
 			goto free_dst;
 
-		dst_prev->header_len = header_len;
-		dst_prev->trailer_len = trailer_len;
-		header_len -= xdst->u.dst.xfrm->props.header_len;
-		trailer_len -= xdst->u.dst.xfrm->props.trailer_len;
+		xdst_prev->u.dst.header_len = header_len;
+		xdst_prev->u.dst.trailer_len = trailer_len;
+		header_len -= xdst_prev->u.dst.xfrm->props.header_len;
+		trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len;
 	}
 
 out:
-	return dst0;
+	return &xdst0->u.dst;
 
 put_states:
 	for (; i < nx; i++)
 		xfrm_state_put(xfrm[i]);
 free_dst:
-	if (dst0)
-		dst_release_immediate(dst0);
-	dst0 = ERR_PTR(err);
+	if (xdst0)
+		dst_release_immediate(&xdst0->u.dst);
+	xdst0 = ERR_PTR(err);
 	goto out;
 }
 
@@ -1800,7 +1801,7 @@ static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst,
 	for (i = 0; i < num; i++) {
 		if (!dst || dst->xfrm != xfrm[i])
 			return false;
-		dst = dst->child;
+		dst = xfrm_dst_child(dst);
 	}
 
 	return xfrm_bundle_ok(xdst);
@@ -1813,6 +1814,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 {
 	struct net *net = xp_net(pols[0]);
 	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+	struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
 	struct xfrm_dst *xdst, *old;
 	struct dst_entry *dst;
 	int err;
@@ -1840,7 +1842,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 
 	old = xdst;
 
-	dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
+	dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig);
 	if (IS_ERR(dst)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
 		return ERR_CAST(dst);
@@ -1880,8 +1882,8 @@ static void xfrm_policy_queue_process(struct timer_list *t)
 	xfrm_decode_session(skb, &fl, dst->ops->family);
 	spin_unlock(&pq->hold_queue.lock);
 
-	dst_hold(dst->path);
-	dst = xfrm_lookup(net, dst->path, &fl, sk, 0);
+	dst_hold(xfrm_dst_path(dst));
+	dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, 0);
 	if (IS_ERR(dst))
 		goto purge_queue;
 
@@ -1910,8 +1912,8 @@ static void xfrm_policy_queue_process(struct timer_list *t)
 		skb = __skb_dequeue(&list);
 
 		xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
-		dst_hold(skb_dst(skb)->path);
-		dst = xfrm_lookup(net, skb_dst(skb)->path, &fl, skb->sk, 0);
+		dst_hold(xfrm_dst_path(skb_dst(skb)));
+		dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0);
 		if (IS_ERR(dst)) {
 			kfree_skb(skb);
 			continue;
@@ -2012,8 +2014,8 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
 	dst1->output = xdst_queue_output;
 
 	dst_hold(dst);
-	dst1->child = dst;
-	dst1->path = dst;
+	xfrm_dst_set_child(xdst, dst);
+	xdst->path = dst;
 
 	xfrm_init_path((struct xfrm_dst *)dst1, dst, 0);
 
@@ -2576,7 +2578,7 @@ static int stale_bundle(struct dst_entry *dst)
 
 void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
 {
-	while ((dst = dst->child) && dst->xfrm && dst->dev == dev) {
+	while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) {
 		dst->dev = dev_net(dev)->loopback_dev;
 		dev_hold(dst->dev);
 		dev_put(dev);
@@ -2600,13 +2602,15 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
 	return dst;
 }
 
-static void xfrm_init_pmtu(struct dst_entry *dst)
+static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr)
 {
-	do {
-		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	while (nr--) {
+		struct xfrm_dst *xdst = bundle[nr];
 		u32 pmtu, route_mtu_cached;
+		struct dst_entry *dst;
 
-		pmtu = dst_mtu(dst->child);
+		dst = &xdst->u.dst;
+		pmtu = dst_mtu(xfrm_dst_child(dst));
 		xdst->child_mtu_cached = pmtu;
 
 		pmtu = xfrm_state_mtu(dst->xfrm, pmtu);
@@ -2618,7 +2622,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst)
 			pmtu = route_mtu_cached;
 
 		dst_metric_set(dst, RTAX_MTU, pmtu);
-	} while ((dst = dst->next));
+	}
 }
 
 /* Check that the bundle accepts the flow and its components are
@@ -2627,19 +2631,20 @@ static void xfrm_init_pmtu(struct dst_entry *dst)
 
 static int xfrm_bundle_ok(struct xfrm_dst *first)
 {
+	struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
 	struct dst_entry *dst = &first->u.dst;
-	struct xfrm_dst *last;
+	struct xfrm_dst *xdst;
+	int start_from, nr;
 	u32 mtu;
 
-	if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
+	if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) ||
 	    (dst->dev && !netif_running(dst->dev)))
 		return 0;
 
 	if (dst->flags & DST_XFRM_QUEUE)
 		return 1;
 
-	last = NULL;
-
+	start_from = nr = 0;
 	do {
 		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 
@@ -2651,9 +2656,11 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
 		    xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
 			return 0;
 
-		mtu = dst_mtu(dst->child);
+		bundle[nr++] = xdst;
+
+		mtu = dst_mtu(xfrm_dst_child(dst));
 		if (xdst->child_mtu_cached != mtu) {
-			last = xdst;
+			start_from = nr;
 			xdst->child_mtu_cached = mtu;
 		}
 
@@ -2661,30 +2668,30 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
 			return 0;
 		mtu = dst_mtu(xdst->route);
 		if (xdst->route_mtu_cached != mtu) {
-			last = xdst;
+			start_from = nr;
 			xdst->route_mtu_cached = mtu;
 		}
 
-		dst = dst->child;
+		dst = xfrm_dst_child(dst);
 	} while (dst->xfrm);
 
-	if (likely(!last))
+	if (likely(!start_from))
 		return 1;
 
-	mtu = last->child_mtu_cached;
-	for (;;) {
-		dst = &last->u.dst;
+	xdst = bundle[start_from - 1];
+	mtu = xdst->child_mtu_cached;
+	while (start_from--) {
+		dst = &xdst->u.dst;
 
 		mtu = xfrm_state_mtu(dst->xfrm, mtu);
-		if (mtu > last->route_mtu_cached)
-			mtu = last->route_mtu_cached;
+		if (mtu > xdst->route_mtu_cached)
+			mtu = xdst->route_mtu_cached;
 		dst_metric_set(dst, RTAX_MTU, mtu);
-
-		if (last == first)
+		if (!start_from)
 			break;
 
-		last = (struct xfrm_dst *)last->u.dst.next;
-		last->child_mtu_cached = mtu;
+		xdst = bundle[start_from - 1];
+		xdst->child_mtu_cached = mtu;
 	}
 
 	return 1;
@@ -2692,22 +2699,20 @@ static int xfrm_bundle_ok(struct xfrm_dst *first)
 
 static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
 {
-	return dst_metric_advmss(dst->path);
+	return dst_metric_advmss(xfrm_dst_path(dst));
 }
 
 static unsigned int xfrm_mtu(const struct dst_entry *dst)
 {
 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 
-	return mtu ? : dst_mtu(dst->path);
+	return mtu ? : dst_mtu(xfrm_dst_path(dst));
 }
 
 static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
 					const void *daddr)
 {
-	const struct dst_entry *path = dst->path;
-
-	for (; dst != path; dst = dst->child) {
+	while (dst->xfrm) {
 		const struct xfrm_state *xfrm = dst->xfrm;
 
 		if (xfrm->props.mode == XFRM_MODE_TRANSPORT)
@@ -2716,6 +2721,8 @@ static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
 			daddr = xfrm->coaddr;
 		else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
 			daddr = &xfrm->id.daddr;
+
+		dst = xfrm_dst_child(dst);
 	}
 	return daddr;
 }
@@ -2724,7 +2731,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
 					   struct sk_buff *skb,
 					   const void *daddr)
 {
-	const struct dst_entry *path = dst->path;
+	const struct dst_entry *path = xfrm_dst_path(dst);
 
 	if (!skb)
 		daddr = xfrm_get_dst_nexthop(dst, daddr);
@@ -2733,7 +2740,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
 
 static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 {
-	const struct dst_entry *path = dst->path;
+	const struct dst_entry *path = xfrm_dst_path(dst);
 
 	daddr = xfrm_get_dst_nexthop(dst, daddr);
 	path->ops->confirm_neigh(path, daddr);
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 8b23c5bcf8e8..02501817227b 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -666,7 +666,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff
 		if (unlikely(oseq < replay_esn->oseq)) {
 			XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi;
 			xo->seq.hi = oseq_hi;
-
+			replay_esn->oseq_hi = oseq_hi;
 			if (replay_esn->oseq_hi == 0) {
 				replay_esn->oseq--;
 				replay_esn->oseq_hi--;
@@ -678,7 +678,6 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff
 		}
 
 		replay_esn->oseq = oseq;
-		replay_esn->oseq_hi = oseq_hi;
 
 		if (xfrm_aevent_is_on(net))
 			x->repl->notify(x, XFRM_REPLAY_UPDATE);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 065d89606888..1b7856be3eeb 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2048,6 +2048,13 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen
 	struct xfrm_mgr *km;
 	struct xfrm_policy *pol = NULL;
 
+	if (!optval && !optlen) {
+		xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL);
+		xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL);
+		__sk_dst_reset(sk);
+		return 0;
+	}
+
 	if (optlen <= 0 || optlen > PAGE_SIZE)
 		return -EMSGSIZE;