From 18afa4b028b46f8b45ca64f94aefe717c297b07d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 23 Jul 2013 16:13:17 +0200
Subject: net: Make devnet_rename_seq static

No users outside net/core/dev.c.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 26755dd40daa..dfd9f5d56ae0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -174,7 +174,7 @@ static DEFINE_SPINLOCK(napi_hash_lock);
 static unsigned int napi_gen_id;
 static DEFINE_HASHTABLE(napi_hash, 8);
 
-seqcount_t devnet_rename_seq;
+static seqcount_t devnet_rename_seq;
 
 static inline void dev_base_seq_inc(struct net *net)
 {
-- 
cgit v1.2.3


From 66b52b0dc82c5c88d769dc1c7d44cf45d0deb07c Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Mon, 29 Jul 2013 18:16:49 +0200
Subject: net: add ndo to get id of physical port of the device

This patch adds a ndo for getting physical port of the device. Driver
which is aware of being virtual function of some physical port should
implement this ndo. This is applicable not only for IOV, but for other
solutions (NPAR, multichannel) as well. Basically if there is possible
to have multiple netdevs on the single hw port.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 20 ++++++++++++++++++++
 net/core/dev.c            | 18 ++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ca60b070ef0..875f869dc38a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -728,6 +728,16 @@ struct netdev_fcoe_hbainfo {
 };
 #endif
 
+#define MAX_PHYS_PORT_ID_LEN 32
+
+/* This structure holds a unique identifier to identify the
+ * physical port used by a netdevice.
+ */
+struct netdev_phys_port_id {
+	unsigned char id[MAX_PHYS_PORT_ID_LEN];
+	unsigned char id_len;
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -932,6 +942,12 @@ struct netdev_fcoe_hbainfo {
  *	that determine carrier state from physical hardware properties (eg
  *	network cables) or protocol-dependent mechanisms (eg
  *	USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
+ *
+ * int (*ndo_get_phys_port_id)(struct net_device *dev,
+ *			       struct netdev_phys_port_id *ppid);
+ *	Called to get ID of physical port of this device. If driver does
+ *	not implement this, it is assumed that the hw is not able to have
+ *	multiple net devices on single physical port.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1060,6 +1076,8 @@ struct net_device_ops {
 						      struct nlmsghdr *nlh);
 	int			(*ndo_change_carrier)(struct net_device *dev,
 						      bool new_carrier);
+	int			(*ndo_get_phys_port_id)(struct net_device *dev,
+							struct netdev_phys_port_id *ppid);
 };
 
 /*
@@ -2315,6 +2333,8 @@ extern int		dev_set_mac_address(struct net_device *,
 					    struct sockaddr *);
 extern int		dev_change_carrier(struct net_device *,
 					   bool new_carrier);
+extern int		dev_get_phys_port_id(struct net_device *dev,
+					     struct netdev_phys_port_id *ppid);
 extern int		dev_hard_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev,
 					    struct netdev_queue *txq);
diff --git a/net/core/dev.c b/net/core/dev.c
index dfd9f5d56ae0..58eb802584b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4988,6 +4988,24 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier)
 }
 EXPORT_SYMBOL(dev_change_carrier);
 
+/**
+ *	dev_get_phys_port_id - Get device physical port ID
+ *	@dev: device
+ *	@ppid: port ID
+ *
+ *	Get device physical port ID
+ */
+int dev_get_phys_port_id(struct net_device *dev,
+			 struct netdev_phys_port_id *ppid)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!ops->ndo_get_phys_port_id)
+		return -EOPNOTSUPP;
+	return ops->ndo_get_phys_port_id(dev, ppid);
+}
+EXPORT_SYMBOL(dev_get_phys_port_id);
+
 /**
  *	dev_new_index	-	allocate an ifindex
  *	@net: the applicable net namespace
-- 
cgit v1.2.3


From 64261f230a9157f5f520ce30ec6827d679375e2f Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 13 Aug 2013 17:51:09 +0200
Subject: dev: move skb_scrub_packet() after eth_type_trans()

skb_scrub_packet() was called before eth_type_trans() to let eth_type_trans()
set pkt_type.

In fact, we should force pkt_type to PACKET_HOST, so move the call after
eth_type_trans().

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       | 6 +++---
 net/ipv4/ip_tunnel.c | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 58eb802584b9..1ed2b66a10a6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1691,13 +1691,13 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
-	skb_scrub_packet(skb);
 	skb->protocol = eth_type_trans(skb, dev);
 
 	/* eth_type_trans() can set pkt_type.
-	 * clear pkt_type _after_ calling eth_type_trans()
+	 * call skb_scrub_packet() after it to clear pkt_type _after_ calling
+	 * eth_type_trans().
 	 */
-	skb->pkt_type = PACKET_HOST;
+	skb_scrub_packet(skb);
 
 	return netif_rx(skb);
 }
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 9fdf8a6d95f3..fbc1094964bf 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -454,15 +454,16 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 	tstats->rx_bytes += skb->len;
 	u64_stats_update_end(&tstats->syncp);
 
-	if (tunnel->net != dev_net(tunnel->dev))
-		skb_scrub_packet(skb);
-
 	if (tunnel->dev->type == ARPHRD_ETHER) {
 		skb->protocol = eth_type_trans(skb, tunnel->dev);
 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 	} else {
 		skb->dev = tunnel->dev;
 	}
+
+	if (tunnel->net != dev_net(tunnel->dev))
+		skb_scrub_packet(skb);
+
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
 
-- 
cgit v1.2.3


From aa9d85605f5ab070b64842b3eba797cf81698ae1 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Wed, 28 Aug 2013 23:25:04 +0200
Subject: net: rename netdev_upper to netdev_adjacent

Rename the structure to reflect the upcoming addition of lower_dev_list.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 1ed2b66a10a6..5072e2c1a072 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4367,7 +4367,7 @@ softnet_break:
 	goto out;
 }
 
-struct netdev_upper {
+struct netdev_adjacent {
 	struct net_device *dev;
 	bool master;
 	struct list_head list;
@@ -4378,7 +4378,7 @@ struct netdev_upper {
 static void __append_search_uppers(struct list_head *search_list,
 				   struct net_device *dev)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	list_for_each_entry(upper, &dev->upper_dev_list, list) {
 		/* check if this upper is not already in search list */
@@ -4391,8 +4391,8 @@ static bool __netdev_search_upper_dev(struct net_device *dev,
 				      struct net_device *upper_dev)
 {
 	LIST_HEAD(search_list);
-	struct netdev_upper *upper;
-	struct netdev_upper *tmp;
+	struct netdev_adjacent *upper;
+	struct netdev_adjacent *tmp;
 	bool ret = false;
 
 	__append_search_uppers(&search_list, dev);
@@ -4408,10 +4408,10 @@ static bool __netdev_search_upper_dev(struct net_device *dev,
 	return ret;
 }
 
-static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
+static struct netdev_adjacent *__netdev_find_upper(struct net_device *dev,
 						struct net_device *upper_dev)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	list_for_each_entry(upper, &dev->upper_dev_list, list) {
 		if (upper->dev == upper_dev)
@@ -4462,7 +4462,7 @@ EXPORT_SYMBOL(netdev_has_any_upper_dev);
  */
 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	ASSERT_RTNL();
 
@@ -4470,7 +4470,7 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 		return NULL;
 
 	upper = list_first_entry(&dev->upper_dev_list,
-				 struct netdev_upper, list);
+				 struct netdev_adjacent, list);
 	if (likely(upper->master))
 		return upper->dev;
 	return NULL;
@@ -4486,10 +4486,10 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get);
  */
 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	upper = list_first_or_null_rcu(&dev->upper_dev_list,
-				       struct netdev_upper, list);
+				       struct netdev_adjacent, list);
 	if (upper && likely(upper->master))
 		return upper->dev;
 	return NULL;
@@ -4499,7 +4499,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 static int __netdev_upper_dev_link(struct net_device *dev,
 				   struct net_device *upper_dev, bool master)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	ASSERT_RTNL();
 
@@ -4580,7 +4580,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
-	struct netdev_upper *upper;
+	struct netdev_adjacent *upper;
 
 	ASSERT_RTNL();
 
-- 
cgit v1.2.3


From 5d261913ca3daf6c2d21d38924235667b3d07c40 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Wed, 28 Aug 2013 23:25:05 +0200
Subject: net: add lower_dev_list to net_device and make a full mesh

This patch adds lower_dev_list list_head to net_device, which is the same
as upper_dev_list, only for lower devices, and begins to use it in the same
way as the upper list.

It also changes the way the whole adjacent device lists work - now they
contain *all* of upper/lower devices, not only the first level. The first
level devices are distinguished by the bool neighbour field in
netdev_adjacent, also added by this patch.

There are cases when a device can be added several times to the adjacent
list, the simplest would be:

     /---- eth0.10 ---\
eth0-		       --- bond0
     \---- eth0.20 ---/

where both bond0 and eth0 'see' each other in the adjacent lists two times.
To avoid duplication of netdev_adjacent structures ref_nr is being kept as
the number of times the device was added to the list.

The 'full view' is achieved by adding, on link creation, all of the
upper_dev's upper_dev_list devices as upper devices to all of the
lower_dev's lower_dev_list devices (and to the lower_dev itself), and vice
versa. On unlink they are removed using the same logic.

I've tested it with thousands vlans/bonds/bridges, everything works ok and
no observable lags even on a huge number of interfaces.

Memory footprint for 128 devices interconnected with each other via both
upper and lower (which is impossible, but for the comparison) lists would be:

128*128*2*sizeof(netdev_adjacent) = 1.5MB

but in the real world we usualy have at most several devices with slaves
and a lot of vlans, so the footprint will be much lower.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   1 +
 net/core/dev.c            | 285 +++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 259 insertions(+), 27 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 077363dcd860..5ccf5b73c378 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1125,6 +1125,7 @@ struct net_device {
 	struct list_head	napi_list;
 	struct list_head	unreg_list;
 	struct list_head	upper_dev_list; /* List of upper devices */
+	struct list_head	lower_dev_list;
 
 
 	/* currently active device features */
diff --git a/net/core/dev.c b/net/core/dev.c
index 5072e2c1a072..2aa914eee057 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4369,7 +4369,16 @@ softnet_break:
 
 struct netdev_adjacent {
 	struct net_device *dev;
+
+	/* upper master flag, there can only be one master device per list */
 	bool master;
+
+	/* indicates that this dev is our first-level lower/upper device */
+	bool neighbour;
+
+	/* counter for the number of times this device was added to us */
+	u16 ref_nr;
+
 	struct list_head list;
 	struct rcu_head rcu;
 	struct list_head search_list;
@@ -4408,18 +4417,34 @@ static bool __netdev_search_upper_dev(struct net_device *dev,
 	return ret;
 }
 
-static struct netdev_adjacent *__netdev_find_upper(struct net_device *dev,
-						struct net_device *upper_dev)
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
+						 struct net_device *adj_dev,
+						 bool upper)
 {
-	struct netdev_adjacent *upper;
+	struct netdev_adjacent *adj;
+	struct list_head *dev_list;
 
-	list_for_each_entry(upper, &dev->upper_dev_list, list) {
-		if (upper->dev == upper_dev)
-			return upper;
+	dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list;
+
+	list_for_each_entry(adj, dev_list, list) {
+		if (adj->dev == adj_dev)
+			return adj;
 	}
 	return NULL;
 }
 
+static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev,
+							  struct net_device *udev)
+{
+	return __netdev_find_adj(dev, udev, true);
+}
+
+static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev,
+							  struct net_device *ldev)
+{
+	return __netdev_find_adj(dev, ldev, false);
+}
+
 /**
  * netdev_has_upper_dev - Check if device is linked to an upper device
  * @dev: device
@@ -4496,10 +4521,149 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 
+static int __netdev_adjacent_dev_insert(struct net_device *dev,
+					struct net_device *adj_dev,
+					bool neighbour, bool master,
+					bool upper)
+{
+	struct netdev_adjacent *adj;
+
+	adj = __netdev_find_adj(dev, adj_dev, upper);
+
+	if (adj) {
+		BUG_ON(neighbour);
+		adj->ref_nr++;
+		return 0;
+	}
+
+	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
+	if (!adj)
+		return -ENOMEM;
+
+	adj->dev = adj_dev;
+	adj->master = master;
+	adj->neighbour = neighbour;
+	adj->ref_nr = 1;
+	INIT_LIST_HEAD(&adj->search_list);
+
+	dev_hold(adj_dev);
+	pr_debug("dev_hold for %s, because of %s link added from %s to %s\n",
+		 adj_dev->name, upper ? "upper" : "lower", dev->name,
+		 adj_dev->name);
+
+	if (!upper) {
+		list_add_tail_rcu(&adj->list, &dev->lower_dev_list);
+		return 0;
+	}
+
+	/* Ensure that master upper link is always the first item in list. */
+	if (master)
+		list_add_rcu(&adj->list, &dev->upper_dev_list);
+	else
+		list_add_tail_rcu(&adj->list, &dev->upper_dev_list);
+
+	return 0;
+}
+
+static inline int __netdev_upper_dev_insert(struct net_device *dev,
+					    struct net_device *udev,
+					    bool master, bool neighbour)
+{
+	return __netdev_adjacent_dev_insert(dev, udev, neighbour, master,
+					    true);
+}
+
+static inline int __netdev_lower_dev_insert(struct net_device *dev,
+					    struct net_device *ldev,
+					    bool neighbour)
+{
+	return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false,
+					    false);
+}
+
+void __netdev_adjacent_dev_remove(struct net_device *dev,
+				  struct net_device *adj_dev, bool upper)
+{
+	struct netdev_adjacent *adj;
+
+	if (upper)
+		adj = __netdev_find_upper(dev, adj_dev);
+	else
+		adj = __netdev_find_lower(dev, adj_dev);
+
+	if (!adj)
+		BUG();
+
+	if (adj->ref_nr > 1) {
+		adj->ref_nr--;
+		return;
+	}
+
+	list_del_rcu(&adj->list);
+	pr_debug("dev_put for %s, because of %s link removed from %s to %s\n",
+		 adj_dev->name, upper ? "upper" : "lower", dev->name,
+		 adj_dev->name);
+	dev_put(adj_dev);
+	kfree_rcu(adj, rcu);
+}
+
+static inline void __netdev_upper_dev_remove(struct net_device *dev,
+					     struct net_device *udev)
+{
+	return __netdev_adjacent_dev_remove(dev, udev, true);
+}
+
+static inline void __netdev_lower_dev_remove(struct net_device *dev,
+					     struct net_device *ldev)
+{
+	return __netdev_adjacent_dev_remove(dev, ldev, false);
+}
+
+int __netdev_adjacent_dev_insert_link(struct net_device *dev,
+				      struct net_device *upper_dev,
+				      bool master, bool neighbour)
+{
+	int ret;
+
+	ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour);
+	if (ret)
+		return ret;
+
+	ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour);
+	if (ret) {
+		__netdev_upper_dev_remove(dev, upper_dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static inline int __netdev_adjacent_dev_link(struct net_device *dev,
+					     struct net_device *udev)
+{
+	return __netdev_adjacent_dev_insert_link(dev, udev, false, false);
+}
+
+static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
+						       struct net_device *udev,
+						       bool master)
+{
+	return __netdev_adjacent_dev_insert_link(dev, udev, master, true);
+}
+
+void __netdev_adjacent_dev_unlink(struct net_device *dev,
+				  struct net_device *upper_dev)
+{
+	__netdev_upper_dev_remove(dev, upper_dev);
+	__netdev_lower_dev_remove(upper_dev, dev);
+}
+
+
 static int __netdev_upper_dev_link(struct net_device *dev,
 				   struct net_device *upper_dev, bool master)
 {
-	struct netdev_adjacent *upper;
+	struct netdev_adjacent *i, *j, *to_i, *to_j;
+	int ret = 0;
 
 	ASSERT_RTNL();
 
@@ -4516,22 +4680,76 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	if (master && netdev_master_upper_dev_get(dev))
 		return -EBUSY;
 
-	upper = kmalloc(sizeof(*upper), GFP_KERNEL);
-	if (!upper)
-		return -ENOMEM;
+	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master);
+	if (ret)
+		return ret;
 
-	upper->dev = upper_dev;
-	upper->master = master;
-	INIT_LIST_HEAD(&upper->search_list);
+	/* Now that we linked these devs, make all the upper_dev's
+	 * upper_dev_list visible to every dev's lower_dev_list and vice
+	 * versa, and don't forget the devices itself. All of these
+	 * links are non-neighbours.
+	 */
+	list_for_each_entry(i, &upper_dev->upper_dev_list, list) {
+		list_for_each_entry(j, &dev->lower_dev_list, list) {
+			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
+			if (ret)
+				goto rollback_mesh;
+		}
+	}
+
+	/* add dev to every upper_dev's upper device */
+	list_for_each_entry(i, &upper_dev->upper_dev_list, list) {
+		ret = __netdev_adjacent_dev_link(dev, i->dev);
+		if (ret)
+			goto rollback_upper_mesh;
+	}
+
+	/* add upper_dev to every dev's lower device */
+	list_for_each_entry(i, &dev->lower_dev_list, list) {
+		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
+		if (ret)
+			goto rollback_lower_mesh;
+	}
 
-	/* Ensure that master upper link is always the first item in list. */
-	if (master)
-		list_add_rcu(&upper->list, &dev->upper_dev_list);
-	else
-		list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
-	dev_hold(upper_dev);
 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
 	return 0;
+
+rollback_lower_mesh:
+	to_i = i;
+	list_for_each_entry(i, &dev->lower_dev_list, list) {
+		if (i == to_i)
+			break;
+		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
+	}
+
+	i = NULL;
+
+rollback_upper_mesh:
+	to_i = i;
+	list_for_each_entry(i, &upper_dev->upper_dev_list, list) {
+		if (i == to_i)
+			break;
+		__netdev_adjacent_dev_unlink(dev, i->dev);
+	}
+
+	i = j = NULL;
+
+rollback_mesh:
+	to_i = i;
+	to_j = j;
+	list_for_each_entry(i, &dev->lower_dev_list, list) {
+		list_for_each_entry(j, &upper_dev->upper_dev_list, list) {
+			if (i == to_i && j == to_j)
+				break;
+			__netdev_adjacent_dev_unlink(i->dev, j->dev);
+		}
+		if (i == to_i)
+			break;
+	}
+
+	__netdev_adjacent_dev_unlink(dev, upper_dev);
+
+	return ret;
 }
 
 /**
@@ -4580,16 +4798,28 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
-	struct netdev_adjacent *upper;
-
+	struct netdev_adjacent *i, *j;
 	ASSERT_RTNL();
 
-	upper = __netdev_find_upper(dev, upper_dev);
-	if (!upper)
-		return;
-	list_del_rcu(&upper->list);
-	dev_put(upper_dev);
-	kfree_rcu(upper, rcu);
+	__netdev_adjacent_dev_unlink(dev, upper_dev);
+
+	/* Here is the tricky part. We must remove all dev's lower
+	 * devices from all upper_dev's upper devices and vice
+	 * versa, to maintain the graph relationship.
+	 */
+	list_for_each_entry(i, &dev->lower_dev_list, list)
+		list_for_each_entry(j, &upper_dev->upper_dev_list, list)
+			__netdev_adjacent_dev_unlink(i->dev, j->dev);
+
+	/* remove also the devices itself from lower/upper device
+	 * list
+	 */
+	list_for_each_entry(i, &dev->lower_dev_list, list)
+		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
+
+	list_for_each_entry(i, &upper_dev->upper_dev_list, list)
+		__netdev_adjacent_dev_unlink(dev, i->dev);
+
 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
@@ -5850,6 +6080,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	INIT_LIST_HEAD(&dev->unreg_list);
 	INIT_LIST_HEAD(&dev->link_watch_list);
 	INIT_LIST_HEAD(&dev->upper_dev_list);
+	INIT_LIST_HEAD(&dev->lower_dev_list);
 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
 	setup(dev);
 
-- 
cgit v1.2.3


From 620f3186caa8124e0efaf329751cf51c5d55c731 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Wed, 28 Aug 2013 23:25:06 +0200
Subject: net: remove search_list from netdev_adjacent

We already don't need it cause we see every upper/lower device in the list
already.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 37 +------------------------------------
 1 file changed, 1 insertion(+), 36 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 2aa914eee057..749925a040a4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4381,42 +4381,8 @@ struct netdev_adjacent {
 
 	struct list_head list;
 	struct rcu_head rcu;
-	struct list_head search_list;
 };
 
-static void __append_search_uppers(struct list_head *search_list,
-				   struct net_device *dev)
-{
-	struct netdev_adjacent *upper;
-
-	list_for_each_entry(upper, &dev->upper_dev_list, list) {
-		/* check if this upper is not already in search list */
-		if (list_empty(&upper->search_list))
-			list_add_tail(&upper->search_list, search_list);
-	}
-}
-
-static bool __netdev_search_upper_dev(struct net_device *dev,
-				      struct net_device *upper_dev)
-{
-	LIST_HEAD(search_list);
-	struct netdev_adjacent *upper;
-	struct netdev_adjacent *tmp;
-	bool ret = false;
-
-	__append_search_uppers(&search_list, dev);
-	list_for_each_entry(upper, &search_list, search_list) {
-		if (upper->dev == upper_dev) {
-			ret = true;
-			break;
-		}
-		__append_search_uppers(&search_list, upper->dev);
-	}
-	list_for_each_entry_safe(upper, tmp, &search_list, search_list)
-		INIT_LIST_HEAD(&upper->search_list);
-	return ret;
-}
-
 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
 						 struct net_device *adj_dev,
 						 bool upper)
@@ -4544,7 +4510,6 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
 	adj->master = master;
 	adj->neighbour = neighbour;
 	adj->ref_nr = 1;
-	INIT_LIST_HEAD(&adj->search_list);
 
 	dev_hold(adj_dev);
 	pr_debug("dev_hold for %s, because of %s link added from %s to %s\n",
@@ -4671,7 +4636,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 		return -EBUSY;
 
 	/* To prevent loops, check if dev is not upper device to upper_dev. */
-	if (__netdev_search_upper_dev(upper_dev, dev))
+	if (__netdev_find_upper(upper_dev, dev))
 		return -EBUSY;
 
 	if (__netdev_find_upper(dev, upper_dev))
-- 
cgit v1.2.3


From 48311f46853c0361f9fba7e0e6bb1652d633c049 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Wed, 28 Aug 2013 23:25:07 +0200
Subject: net: add netdev_upper_get_next_dev_rcu(dev, iter)

This function returns the next dev in the dev->upper_dev_list after the
struct list_head **iter position, and updates *iter accordingly. Returns
NULL if there are no devices left.

Caller must hold RCU read lock.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 749925a040a4..6fbb0c90849b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4468,6 +4468,31 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_master_upper_dev_get);
 
+/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
+						 struct list_head **iter)
+{
+	struct netdev_adjacent *upper;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+	if (&upper->list == &dev->upper_dev_list)
+		return NULL;
+
+	*iter = &upper->list;
+
+	return upper->dev;
+}
+EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+
 /**
  * netdev_master_upper_dev_get_rcu - Get master upper device
  * @dev: device
-- 
cgit v1.2.3


From 8b27f27797cac5ed9b2f3e63dac89a7ae70e70a7 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 2 Sep 2013 15:34:56 +0200
Subject: skb: allow skb_scrub_packet() to be used by tunnels

This function was only used when a packet was sent to another netns. Now, it can
also be used after tunnel encapsulation or decapsulation.

Only skb_orphan() should not be done when a packet is not crossing netns.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 +-
 net/core/dev.c         |  2 +-
 net/core/skbuff.c      | 19 ++++++++++++-------
 net/ipv4/ip_tunnel.c   |  4 ++--
 net/ipv6/ip6_tunnel.c  |  4 ++--
 net/ipv6/sit.c         |  4 ++--
 6 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6f1330af1ebb..2ddb48d9312c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2392,7 +2392,7 @@ extern void	       skb_split(struct sk_buff *skb,
 				 struct sk_buff *skb1, const u32 len);
 extern int	       skb_shift(struct sk_buff *tgt, struct sk_buff *skb,
 				 int shiftlen);
-extern void	       skb_scrub_packet(struct sk_buff *skb);
+extern void	       skb_scrub_packet(struct sk_buff *skb, bool xnet);
 
 extern struct sk_buff *skb_segment(struct sk_buff *skb,
 				   netdev_features_t features);
diff --git a/net/core/dev.c b/net/core/dev.c
index 6fbb0c90849b..07684e880a5d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1697,7 +1697,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 	 * call skb_scrub_packet() after it to clear pkt_type _after_ calling
 	 * eth_type_trans().
 	 */
-	skb_scrub_packet(skb);
+	skb_scrub_packet(skb, true);
 
 	return netif_rx(skb);
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2c3d0f53d198..d81cff119f73 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3500,17 +3500,22 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 EXPORT_SYMBOL(skb_try_coalesce);
 
 /**
- * skb_scrub_packet - scrub an skb before sending it to another netns
+ * skb_scrub_packet - scrub an skb
  *
  * @skb: buffer to clean
- *
- * skb_scrub_packet can be used to clean an skb before injecting it in
- * another namespace. We have to clear all information in the skb that
- * could impact namespace isolation.
+ * @xnet: packet is crossing netns
+ *
+ * skb_scrub_packet can be used after encapsulating or decapsulting a packet
+ * into/from a tunnel. Some information have to be cleared during these
+ * operations.
+ * skb_scrub_packet can also be used to clean a skb before injecting it in
+ * another namespace (@xnet == true). We have to clear all information in the
+ * skb that could impact namespace isolation.
  */
-void skb_scrub_packet(struct sk_buff *skb)
+void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 {
-	skb_orphan(skb);
+	if (xnet)
+		skb_orphan(skb);
 	skb->tstamp.tv64 = 0;
 	skb->pkt_type = PACKET_HOST;
 	skb->skb_iif = 0;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 0a6cf0e69478..b0e74e17088f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -462,7 +462,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 	}
 
 	if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
-		skb_scrub_packet(skb);
+		skb_scrub_packet(skb, true);
 
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
@@ -615,7 +615,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	}
 
 	if (!net_eq(tunnel->net, dev_net(dev)))
-		skb_scrub_packet(skb);
+		skb_scrub_packet(skb, true);
 
 	if (tunnel->err_count > 0) {
 		if (time_before(jiffies,
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index d6e00a39274c..72372ac90159 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -830,7 +830,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 		tstats->rx_bytes += skb->len;
 
 		if (!net_eq(t->net, dev_net(t->dev)))
-			skb_scrub_packet(skb);
+			skb_scrub_packet(skb, true);
 
 		netif_rx(skb);
 
@@ -1002,7 +1002,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 	}
 
 	if (!net_eq(t->net, dev_net(dev)))
-		skb_scrub_packet(skb);
+		skb_scrub_packet(skb, true);
 
 	/*
 	 * Okay, now see if we can stuff it in the buffer as-is.
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 1d1458a3b7c4..b2e44f478e14 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -622,7 +622,7 @@ static int ipip6_rcv(struct sk_buff *skb)
 		tstats->rx_bytes += skb->len;
 
 		if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
-			skb_scrub_packet(skb);
+			skb_scrub_packet(skb, true);
 		netif_rx(skb);
 
 		return 0;
@@ -861,7 +861,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 	}
 
 	if (!net_eq(tunnel->net, dev_net(dev)))
-		skb_scrub_packet(skb);
+		skb_scrub_packet(skb, true);
 
 	/*
 	 * Okay, now see if we can stuff it in the buffer as-is.
-- 
cgit v1.2.3


From 82476b316084e6826a9dd339d1dad892a598af9a Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Mon, 2 Sep 2013 16:26:51 +0200
Subject: net: correctly interlink lower/upper devices

Currently we're linking upper devices to lower ones, which results in
upside-down relationship: upper devices seeing lower devices via its upper
lists.

Fix this by correctly linking lower devices to the upper ones.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 07684e880a5d..5c713f2239cc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4679,8 +4679,8 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	 * versa, and don't forget the devices itself. All of these
 	 * links are non-neighbours.
 	 */
-	list_for_each_entry(i, &upper_dev->upper_dev_list, list) {
-		list_for_each_entry(j, &dev->lower_dev_list, list) {
+	list_for_each_entry(i, &dev->lower_dev_list, list) {
+		list_for_each_entry(j, &upper_dev->upper_dev_list, list) {
 			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
 			if (ret)
 				goto rollback_mesh;
-- 
cgit v1.2.3


From 50624c934db18ab90aaea4908f60dd39aab4e6e5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 23 Sep 2013 21:19:49 -0700
Subject: net: Delay default_device_exit_batch until no devices are
 unregistering v2

There is currently serialization network namespaces exiting and
network devices exiting as the final part of netdev_run_todo does not
happen under the rtnl_lock.  This is compounded by the fact that the
only list of devices unregistering in netdev_run_todo is local to the
netdev_run_todo.

This lack of serialization in extreme cases results in network devices
unregistering in netdev_run_todo after the loopback device of their
network namespace has been freed (making dst_ifdown unsafe), and after
the their network namespace has exited (making the NETDEV_UNREGISTER,
and NETDEV_UNREGISTER_FINAL callbacks unsafe).

Add the missing serialization by a per network namespace count of how
many network devices are unregistering and having a wait queue that is
woken up whenever the count is decreased.  The count and wait queue
allow default_device_exit_batch to wait until all of the unregistration
activity for a network namespace has finished before proceeding to
unregister the loopback device and then allowing the network namespace
to exit.

Only a single global wait queue is used because there is a single global
lock, and there is a single waiter, per network namespace wait queues
would be a waste of resources.

The per network namespace count of unregistering devices gives a
progress guarantee because the number of network devices unregistering
in an exiting network namespace must ultimately drop to zero (assuming
network device unregistration completes).

The basic logic remains the same as in v1.  This patch is now half
comment and half rtnl_lock_unregistering an expanded version of
wait_event performs no extra work in the common case where no network
devices are unregistering when we get to default_device_exit_batch.

Reported-by: Francesco Ruggeri <fruggeri@aristanetworks.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h |  1 +
 net/core/dev.c              | 49 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 49 insertions(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 1313456a0994..9d22f08896c6 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -74,6 +74,7 @@ struct net {
 	struct hlist_head	*dev_index_head;
 	unsigned int		dev_base_seq;	/* protected by rtnl_mutex */
 	int			ifindex;
+	unsigned int		dev_unreg_count;
 
 	/* core fib_rules */
 	struct list_head	rules_ops;
diff --git a/net/core/dev.c b/net/core/dev.c
index 5c713f2239cc..65f829cfd928 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5247,10 +5247,12 @@ static int dev_new_index(struct net *net)
 
 /* Delayed registration/unregisteration */
 static LIST_HEAD(net_todo_list);
+static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 
 static void net_set_todo(struct net_device *dev)
 {
 	list_add_tail(&dev->todo_list, &net_todo_list);
+	dev_net(dev)->dev_unreg_count++;
 }
 
 static void rollback_registered_many(struct list_head *head)
@@ -5918,6 +5920,12 @@ void netdev_run_todo(void)
 		if (dev->destructor)
 			dev->destructor(dev);
 
+		/* Report a network device has been unregistered */
+		rtnl_lock();
+		dev_net(dev)->dev_unreg_count--;
+		__rtnl_unlock();
+		wake_up(&netdev_unregistering_wq);
+
 		/* Free network device */
 		kobject_put(&dev->dev.kobj);
 	}
@@ -6603,6 +6611,34 @@ static void __net_exit default_device_exit(struct net *net)
 	rtnl_unlock();
 }
 
+static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
+{
+	/* Return with the rtnl_lock held when there are no network
+	 * devices unregistering in any network namespace in net_list.
+	 */
+	struct net *net;
+	bool unregistering;
+	DEFINE_WAIT(wait);
+
+	for (;;) {
+		prepare_to_wait(&netdev_unregistering_wq, &wait,
+				TASK_UNINTERRUPTIBLE);
+		unregistering = false;
+		rtnl_lock();
+		list_for_each_entry(net, net_list, exit_list) {
+			if (net->dev_unreg_count > 0) {
+				unregistering = true;
+				break;
+			}
+		}
+		if (!unregistering)
+			break;
+		__rtnl_unlock();
+		schedule();
+	}
+	finish_wait(&netdev_unregistering_wq, &wait);
+}
+
 static void __net_exit default_device_exit_batch(struct list_head *net_list)
 {
 	/* At exit all network devices most be removed from a network
@@ -6614,7 +6650,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
 	struct net *net;
 	LIST_HEAD(dev_kill_list);
 
-	rtnl_lock();
+	/* To prevent network device cleanup code from dereferencing
+	 * loopback devices or network devices that have been freed
+	 * wait here for all pending unregistrations to complete,
+	 * before unregistring the loopback device and allowing the
+	 * network namespace be freed.
+	 *
+	 * The netdev todo list containing all network devices
+	 * unregistrations that happen in default_device_exit_batch
+	 * will run in the rtnl_unlock() at the end of
+	 * default_device_exit_batch.
+	 */
+	rtnl_lock_unregistering(net_list);
 	list_for_each_entry(net, net_list, exit_list) {
 		for_each_netdev_reverse(net, dev) {
 			if (dev->rtnl_link_ops)
-- 
cgit v1.2.3