From 18afa4b028b46f8b45ca64f94aefe717c297b07d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 Jul 2013 16:13:17 +0200 Subject: net: Make devnet_rename_seq static No users outside net/core/dev.c. Signed-off-by: Thomas Gleixner Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 26755dd40daa..dfd9f5d56ae0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -174,7 +174,7 @@ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id; static DEFINE_HASHTABLE(napi_hash, 8); -seqcount_t devnet_rename_seq; +static seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) { -- cgit v1.2.3 From 66b52b0dc82c5c88d769dc1c7d44cf45d0deb07c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 29 Jul 2013 18:16:49 +0200 Subject: net: add ndo to get id of physical port of the device This patch adds a ndo for getting physical port of the device. Driver which is aware of being virtual function of some physical port should implement this ndo. This is applicable not only for IOV, but for other solutions (NPAR, multichannel) as well. Basically if there is possible to have multiple netdevs on the single hw port. Signed-off-by: Jiri Pirko Acked-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 ++++++++++++++++++++ net/core/dev.c | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ca60b070ef0..875f869dc38a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -728,6 +728,16 @@ struct netdev_fcoe_hbainfo { }; #endif +#define MAX_PHYS_PORT_ID_LEN 32 + +/* This structure holds a unique identifier to identify the + * physical port used by a netdevice. 
+ */ +struct netdev_phys_port_id { + unsigned char id[MAX_PHYS_PORT_ID_LEN]; + unsigned char id_len; +}; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -932,6 +942,12 @@ struct netdev_fcoe_hbainfo { * that determine carrier state from physical hardware properties (eg * network cables) or protocol-dependent mechanisms (eg * USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function. + * + * int (*ndo_get_phys_port_id)(struct net_device *dev, + * struct netdev_phys_port_id *ppid); + * Called to get ID of physical port of this device. If driver does + * not implement this, it is assumed that the hw is not able to have + * multiple net devices on single physical port. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1060,6 +1076,8 @@ struct net_device_ops { struct nlmsghdr *nlh); int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); + int (*ndo_get_phys_port_id)(struct net_device *dev, + struct netdev_phys_port_id *ppid); }; /* @@ -2315,6 +2333,8 @@ extern int dev_set_mac_address(struct net_device *, struct sockaddr *); extern int dev_change_carrier(struct net_device *, bool new_carrier); +extern int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_port_id *ppid); extern int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq); diff --git a/net/core/dev.c b/net/core/dev.c index dfd9f5d56ae0..58eb802584b9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4988,6 +4988,24 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier) } EXPORT_SYMBOL(dev_change_carrier); +/** + * dev_get_phys_port_id - Get device physical port ID + * @dev: device + * @ppid: port ID + * + * Get device physical port ID + */ +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_port_id *ppid) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if 
(!ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_id(dev, ppid); +} +EXPORT_SYMBOL(dev_get_phys_port_id); + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace -- cgit v1.2.3 From 64261f230a9157f5f520ce30ec6827d679375e2f Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 13 Aug 2013 17:51:09 +0200 Subject: dev: move skb_scrub_packet() after eth_type_trans() skb_scrub_packet() was called before eth_type_trans() to let eth_type_trans() set pkt_type. In fact, we should force pkt_type to PACKET_HOST, so move the call after eth_type_trans(). Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- net/ipv4/ip_tunnel.c | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 58eb802584b9..1ed2b66a10a6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1691,13 +1691,13 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) kfree_skb(skb); return NET_RX_DROP; } - skb_scrub_packet(skb); skb->protocol = eth_type_trans(skb, dev); /* eth_type_trans() can set pkt_type. - * clear pkt_type _after_ calling eth_type_trans() + * call skb_scrub_packet() after it to clear pkt_type _after_ calling + * eth_type_trans(). 
*/ - skb->pkt_type = PACKET_HOST; + skb_scrub_packet(skb); return netif_rx(skb); } diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 9fdf8a6d95f3..fbc1094964bf 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -454,15 +454,16 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tstats->rx_bytes += skb->len; u64_stats_update_end(&tstats->syncp); - if (tunnel->net != dev_net(tunnel->dev)) - skb_scrub_packet(skb); - if (tunnel->dev->type == ARPHRD_ETHER) { skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; } + + if (tunnel->net != dev_net(tunnel->dev)) + skb_scrub_packet(skb); + gro_cells_receive(&tunnel->gro_cells, skb); return 0; -- cgit v1.2.3 From aa9d85605f5ab070b64842b3eba797cf81698ae1 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:04 +0200 Subject: net: rename netdev_upper to netdev_adjacent Rename the structure to reflect the upcoming addition of lower_dev_list. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. 
Miller --- net/core/dev.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 1ed2b66a10a6..5072e2c1a072 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4367,7 +4367,7 @@ softnet_break: goto out; } -struct netdev_upper { +struct netdev_adjacent { struct net_device *dev; bool master; struct list_head list; @@ -4378,7 +4378,7 @@ struct netdev_upper { static void __append_search_uppers(struct list_head *search_list, struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; list_for_each_entry(upper, &dev->upper_dev_list, list) { /* check if this upper is not already in search list */ @@ -4391,8 +4391,8 @@ static bool __netdev_search_upper_dev(struct net_device *dev, struct net_device *upper_dev) { LIST_HEAD(search_list); - struct netdev_upper *upper; - struct netdev_upper *tmp; + struct netdev_adjacent *upper; + struct netdev_adjacent *tmp; bool ret = false; __append_search_uppers(&search_list, dev); @@ -4408,10 +4408,10 @@ static bool __netdev_search_upper_dev(struct net_device *dev, return ret; } -static struct netdev_upper *__netdev_find_upper(struct net_device *dev, +static struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; list_for_each_entry(upper, &dev->upper_dev_list, list) { if (upper->dev == upper_dev) @@ -4462,7 +4462,7 @@ EXPORT_SYMBOL(netdev_has_any_upper_dev); */ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); @@ -4470,7 +4470,7 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) return NULL; upper = list_first_entry(&dev->upper_dev_list, - struct netdev_upper, list); + struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; @@ -4486,10 +4486,10 
@@ EXPORT_SYMBOL(netdev_master_upper_dev_get); */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; upper = list_first_or_null_rcu(&dev->upper_dev_list, - struct netdev_upper, list); + struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; @@ -4499,7 +4499,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); @@ -4580,7 +4580,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); -- cgit v1.2.3 From 5d261913ca3daf6c2d21d38924235667b3d07c40 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:05 +0200 Subject: net: add lower_dev_list to net_device and make a full mesh This patch adds lower_dev_list list_head to net_device, which is the same as upper_dev_list, only for lower devices, and begins to use it in the same way as the upper list. It also changes the way the whole adjacent device lists work - now they contain *all* of upper/lower devices, not only the first level. The first level devices are distinguished by the bool neighbour field in netdev_adjacent, also added by this patch. There are cases when a device can be added several times to the adjacent list, the simplest would be: /---- eth0.10 ---\ eth0- --- bond0 \---- eth0.20 ---/ where both bond0 and eth0 'see' each other in the adjacent lists two times. To avoid duplication of netdev_adjacent structures ref_nr is being kept as the number of times the device was added to the list. 
The 'full view' is achieved by adding, on link creation, all of the upper_dev's upper_dev_list devices as upper devices to all of the lower_dev's lower_dev_list devices (and to the lower_dev itself), and vice versa. On unlink they are removed using the same logic. I've tested it with thousands of vlans/bonds/bridges; everything works ok, with no observable lags even on a huge number of interfaces. Memory footprint for 128 devices interconnected with each other via both upper and lower (which is impossible, but for the comparison) lists would be: 128*128*2*sizeof(netdev_adjacent) = 1.5MB but in the real world we usually have at most several devices with slaves and a lot of vlans, so the footprint will be much lower. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 285 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 259 insertions(+), 27 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 077363dcd860..5ccf5b73c378 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1125,6 +1125,7 @@ struct net_device { struct list_head napi_list; struct list_head unreg_list; struct list_head upper_dev_list; /* List of upper devices */ + struct list_head lower_dev_list; /* currently active device features */ diff --git a/net/core/dev.c b/net/core/dev.c index 5072e2c1a072..2aa914eee057 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4369,7 +4369,16 @@ softnet_break: struct netdev_adjacent { struct net_device *dev; + + /* upper master flag, there can only be one master device per list */ bool master; + + /* indicates that this dev is our first-level lower/upper device */ + bool neighbour; + + /* counter for the number of times this device was added to us */ + u16 ref_nr; + struct list_head list; struct rcu_head rcu; struct list_head
search_list; @@ -4408,18 +4417,34 @@ static bool __netdev_search_upper_dev(struct net_device *dev, return ret; } -static struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, - struct net_device *upper_dev) +static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, + struct net_device *adj_dev, + bool upper) { - struct netdev_adjacent *upper; + struct netdev_adjacent *adj; + struct list_head *dev_list; - list_for_each_entry(upper, &dev->upper_dev_list, list) { - if (upper->dev == upper_dev) - return upper; + dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list; + + list_for_each_entry(adj, dev_list, list) { + if (adj->dev == adj_dev) + return adj; } return NULL; } +static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_find_adj(dev, udev, true); +} + +static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, + struct net_device *ldev) +{ + return __netdev_find_adj(dev, ldev, false); +} + /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device @@ -4496,10 +4521,149 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + bool neighbour, bool master, + bool upper) +{ + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(dev, adj_dev, upper); + + if (adj) { + BUG_ON(neighbour); + adj->ref_nr++; + return 0; + } + + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; + + adj->dev = adj_dev; + adj->master = master; + adj->neighbour = neighbour; + adj->ref_nr = 1; + INIT_LIST_HEAD(&adj->search_list); + + dev_hold(adj_dev); + pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", + adj_dev->name, upper ? 
"upper" : "lower", dev->name, + adj_dev->name); + + if (!upper) { + list_add_tail_rcu(&adj->list, &dev->lower_dev_list); + return 0; + } + + /* Ensure that master upper link is always the first item in list. */ + if (master) + list_add_rcu(&adj->list, &dev->upper_dev_list); + else + list_add_tail_rcu(&adj->list, &dev->upper_dev_list); + + return 0; +} + +static inline int __netdev_upper_dev_insert(struct net_device *dev, + struct net_device *udev, + bool master, bool neighbour) +{ + return __netdev_adjacent_dev_insert(dev, udev, neighbour, master, + true); +} + +static inline int __netdev_lower_dev_insert(struct net_device *dev, + struct net_device *ldev, + bool neighbour) +{ + return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false, + false); +} + +void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, bool upper) +{ + struct netdev_adjacent *adj; + + if (upper) + adj = __netdev_find_upper(dev, adj_dev); + else + adj = __netdev_find_lower(dev, adj_dev); + + if (!adj) + BUG(); + + if (adj->ref_nr > 1) { + adj->ref_nr--; + return; + } + + list_del_rcu(&adj->list); + pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", + adj_dev->name, upper ? 
"upper" : "lower", dev->name, + adj_dev->name); + dev_put(adj_dev); + kfree_rcu(adj, rcu); +} + +static inline void __netdev_upper_dev_remove(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_adjacent_dev_remove(dev, udev, true); +} + +static inline void __netdev_lower_dev_remove(struct net_device *dev, + struct net_device *ldev) +{ + return __netdev_adjacent_dev_remove(dev, ldev, false); +} + +int __netdev_adjacent_dev_insert_link(struct net_device *dev, + struct net_device *upper_dev, + bool master, bool neighbour) +{ + int ret; + + ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour); + if (ret) + return ret; + + ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour); + if (ret) { + __netdev_upper_dev_remove(dev, upper_dev); + return ret; + } + + return 0; +} + +static inline int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_adjacent_dev_insert_link(dev, udev, false, false); +} + +static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *udev, + bool master) +{ + return __netdev_adjacent_dev_insert_link(dev, udev, master, true); +} + +void __netdev_adjacent_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_upper_dev_remove(dev, upper_dev); + __netdev_lower_dev_remove(upper_dev, dev); +} + + static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master) { - struct netdev_adjacent *upper; + struct netdev_adjacent *i, *j, *to_i, *to_j; + int ret = 0; ASSERT_RTNL(); @@ -4516,22 +4680,76 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - upper = kmalloc(sizeof(*upper), GFP_KERNEL); - if (!upper) - return -ENOMEM; + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master); + if (ret) + return ret; - upper->dev = upper_dev; - upper->master = master; - 
INIT_LIST_HEAD(&upper->search_list); + /* Now that we linked these devs, make all the upper_dev's + * upper_dev_list visible to every dev's lower_dev_list and vice + * versa, and don't forget the devices itself. All of these + * links are non-neighbours. + */ + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + list_for_each_entry(j, &dev->lower_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, j->dev); + if (ret) + goto rollback_mesh; + } + } + + /* add dev to every upper_dev's upper device */ + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + ret = __netdev_adjacent_dev_link(dev, i->dev); + if (ret) + goto rollback_upper_mesh; + } + + /* add upper_dev to every dev's lower device */ + list_for_each_entry(i, &dev->lower_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, upper_dev); + if (ret) + goto rollback_lower_mesh; + } - /* Ensure that master upper link is always the first item in list. */ - if (master) - list_add_rcu(&upper->list, &dev->upper_dev_list); - else - list_add_tail_rcu(&upper->list, &dev->upper_dev_list); - dev_hold(upper_dev); call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); return 0; + +rollback_lower_mesh: + to_i = i; + list_for_each_entry(i, &dev->lower_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + } + + i = NULL; + +rollback_upper_mesh: + to_i = i; + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(dev, i->dev); + } + + i = j = NULL; + +rollback_mesh: + to_i = i; + to_j = j; + list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + if (i == to_i && j == to_j) + break; + __netdev_adjacent_dev_unlink(i->dev, j->dev); + } + if (i == to_i) + break; + } + + __netdev_adjacent_dev_unlink(dev, upper_dev); + + return ret; } /** @@ -4580,16 +4798,28 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct 
net_device *dev, struct net_device *upper_dev) { - struct netdev_adjacent *upper; - + struct netdev_adjacent *i, *j; ASSERT_RTNL(); - upper = __netdev_find_upper(dev, upper_dev); - if (!upper) - return; - list_del_rcu(&upper->list); - dev_put(upper_dev); - kfree_rcu(upper, rcu); + __netdev_adjacent_dev_unlink(dev, upper_dev); + + /* Here is the tricky part. We must remove all dev's lower + * devices from all upper_dev's upper devices and vice + * versa, to maintain the graph relationship. + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + list_for_each_entry(j, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, j->dev); + + /* remove also the devices itself from lower/upper device + * list + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + + list_for_each_entry(i, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(dev, i->dev); + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -5850,6 +6080,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->upper_dev_list); + INIT_LIST_HEAD(&dev->lower_dev_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); -- cgit v1.2.3 From 620f3186caa8124e0efaf329751cf51c5d55c731 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:06 +0200 Subject: net: remove search_list from netdev_adjacent We already don't need it cause we see every upper/lower device in the list already. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. 
Miller --- net/core/dev.c | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2aa914eee057..749925a040a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4381,42 +4381,8 @@ struct netdev_adjacent { struct list_head list; struct rcu_head rcu; - struct list_head search_list; }; -static void __append_search_uppers(struct list_head *search_list, - struct net_device *dev) -{ - struct netdev_adjacent *upper; - - list_for_each_entry(upper, &dev->upper_dev_list, list) { - /* check if this upper is not already in search list */ - if (list_empty(&upper->search_list)) - list_add_tail(&upper->search_list, search_list); - } -} - -static bool __netdev_search_upper_dev(struct net_device *dev, - struct net_device *upper_dev) -{ - LIST_HEAD(search_list); - struct netdev_adjacent *upper; - struct netdev_adjacent *tmp; - bool ret = false; - - __append_search_uppers(&search_list, dev); - list_for_each_entry(upper, &search_list, search_list) { - if (upper->dev == upper_dev) { - ret = true; - break; - } - __append_search_uppers(&search_list, upper->dev); - } - list_for_each_entry_safe(upper, tmp, &search_list, search_list) - INIT_LIST_HEAD(&upper->search_list); - return ret; -} - static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, struct net_device *adj_dev, bool upper) @@ -4544,7 +4510,6 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->master = master; adj->neighbour = neighbour; adj->ref_nr = 1; - INIT_LIST_HEAD(&adj->search_list); dev_hold(adj_dev); pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", @@ -4671,7 +4636,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. 
*/ - if (__netdev_search_upper_dev(upper_dev, dev)) + if (__netdev_find_upper(upper_dev, dev)) return -EBUSY; if (__netdev_find_upper(dev, upper_dev)) -- cgit v1.2.3 From 48311f46853c0361f9fba7e0e6bb1652d633c049 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:07 +0200 Subject: net: add netdev_upper_get_next_dev_rcu(dev, iter) This function returns the next dev in the dev->upper_dev_list after the struct list_head **iter position, and updates *iter accordingly. Returns NULL if there are no devices left. Caller must hold RCU read lock. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 749925a040a4..6fbb0c90849b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4468,6 +4468,31 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); +/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. 
+ */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->upper_dev_list) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device -- cgit v1.2.3 From 8b27f27797cac5ed9b2f3e63dac89a7ae70e70a7 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 2 Sep 2013 15:34:56 +0200 Subject: skb: allow skb_scrub_packet() to be used by tunnels This function was only used when a packet was sent to another netns. Now, it can also be used after tunnel encapsulation or decapsulation. Only skb_orphan() should not be done when a packet is not crossing netns. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/dev.c | 2 +- net/core/skbuff.c | 19 ++++++++++++------- net/ipv4/ip_tunnel.c | 4 ++-- net/ipv6/ip6_tunnel.c | 4 ++-- net/ipv6/sit.c | 4 ++-- 6 files changed, 20 insertions(+), 15 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6f1330af1ebb..2ddb48d9312c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2392,7 +2392,7 @@ extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); -extern void skb_scrub_packet(struct sk_buff *skb); +extern void skb_scrub_packet(struct sk_buff *skb, bool xnet); extern struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); diff --git a/net/core/dev.c b/net/core/dev.c index 6fbb0c90849b..07684e880a5d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1697,7 +1697,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) * 
call skb_scrub_packet() after it to clear pkt_type _after_ calling * eth_type_trans(). */ - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); return netif_rx(skb); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2c3d0f53d198..d81cff119f73 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3500,17 +3500,22 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, EXPORT_SYMBOL(skb_try_coalesce); /** - * skb_scrub_packet - scrub an skb before sending it to another netns + * skb_scrub_packet - scrub an skb * * @skb: buffer to clean - * - * skb_scrub_packet can be used to clean an skb before injecting it in - * another namespace. We have to clear all information in the skb that - * could impact namespace isolation. + * @xnet: packet is crossing netns + * + * skb_scrub_packet can be used after encapsulating or decapsulting a packet + * into/from a tunnel. Some information have to be cleared during these + * operations. + * skb_scrub_packet can also be used to clean a skb before injecting it in + * another namespace (@xnet == true). We have to clear all information in the + * skb that could impact namespace isolation. 
*/ -void skb_scrub_packet(struct sk_buff *skb) +void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb_orphan(skb); + if (xnet) + skb_orphan(skb); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0a6cf0e69478..b0e74e17088f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -462,7 +462,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, } if (!net_eq(tunnel->net, dev_net(tunnel->dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -615,7 +615,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } if (!net_eq(tunnel->net, dev_net(dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); if (tunnel->err_count > 0) { if (time_before(jiffies, diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index d6e00a39274c..72372ac90159 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -830,7 +830,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, tstats->rx_bytes += skb->len; if (!net_eq(t->net, dev_net(t->dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); netif_rx(skb); @@ -1002,7 +1002,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, } if (!net_eq(t->net, dev_net(dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); /* * Okay, now see if we can stuff it in the buffer as-is. 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 1d1458a3b7c4..b2e44f478e14 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -622,7 +622,7 @@ static int ipip6_rcv(struct sk_buff *skb) tstats->rx_bytes += skb->len; if (!net_eq(tunnel->net, dev_net(tunnel->dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); netif_rx(skb); return 0; @@ -861,7 +861,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } if (!net_eq(tunnel->net, dev_net(dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); /* * Okay, now see if we can stuff it in the buffer as-is. -- cgit v1.2.3 From 82476b316084e6826a9dd339d1dad892a598af9a Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Mon, 2 Sep 2013 16:26:51 +0200 Subject: net: correctly interlink lower/upper devices Currently we're linking upper devices to lower ones, which results in upside-down relationship: upper devices seeing lower devices via its upper lists. Fix this by correctly linking lower devices to the upper ones. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 07684e880a5d..5c713f2239cc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4679,8 +4679,8 @@ static int __netdev_upper_dev_link(struct net_device *dev, * versa, and don't forget the devices itself. All of these * links are non-neighbours. */ - list_for_each_entry(i, &upper_dev->upper_dev_list, list) { - list_for_each_entry(j, &dev->lower_dev_list, list) { + list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(j, &upper_dev->upper_dev_list, list) { ret = __netdev_adjacent_dev_link(i->dev, j->dev); if (ret) goto rollback_mesh; -- cgit v1.2.3 From 50624c934db18ab90aaea4908f60dd39aab4e6e5 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Mon, 23 Sep 2013 21:19:49 -0700 Subject: net: Delay default_device_exit_batch until no devices are unregistering v2 There is currently no serialization between network namespaces exiting and network devices exiting, as the final part of netdev_run_todo does not happen under the rtnl_lock. This is compounded by the fact that the only list of devices unregistering in netdev_run_todo is local to the netdev_run_todo. This lack of serialization in extreme cases results in network devices unregistering in netdev_run_todo after the loopback device of their network namespace has been freed (making dst_ifdown unsafe), and after their network namespace has exited (making the NETDEV_UNREGISTER, and NETDEV_UNREGISTER_FINAL callbacks unsafe). Add the missing serialization by keeping a per network namespace count of how many network devices are unregistering and having a wait queue that is woken up whenever the count is decreased. The count and wait queue allow default_device_exit_batch to wait until all of the unregistration activity for a network namespace has finished before proceeding to unregister the loopback device and then allowing the network namespace to exit. Only a single global wait queue is used because there is a single global lock, and there is a single waiter; per network namespace wait queues would be a waste of resources. The per network namespace count of unregistering devices gives a progress guarantee because the number of network devices unregistering in an exiting network namespace must ultimately drop to zero (assuming network device unregistration completes). The basic logic remains the same as in v1. This patch is now half comment and half rtnl_lock_unregistering, an expanded version of wait_event that performs no extra work in the common case where no network devices are unregistering when we get to default_device_exit_batch. Reported-by: Francesco Ruggeri Signed-off-by: "Eric W. Biederman" Signed-off-by: David S.
Miller --- include/net/net_namespace.h | 1 + net/core/dev.c | 49 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 1313456a0994..9d22f08896c6 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -74,6 +74,7 @@ struct net { struct hlist_head *dev_index_head; unsigned int dev_base_seq; /* protected by rtnl_mutex */ int ifindex; + unsigned int dev_unreg_count; /* core fib_rules */ struct list_head rules_ops; diff --git a/net/core/dev.c b/net/core/dev.c index 5c713f2239cc..65f829cfd928 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5247,10 +5247,12 @@ static int dev_new_index(struct net *net) /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); +static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); + dev_net(dev)->dev_unreg_count++; } static void rollback_registered_many(struct list_head *head) @@ -5918,6 +5920,12 @@ void netdev_run_todo(void) if (dev->destructor) dev->destructor(dev); + /* Report a network device has been unregistered */ + rtnl_lock(); + dev_net(dev)->dev_unreg_count--; + __rtnl_unlock(); + wake_up(&netdev_unregistering_wq); + /* Free network device */ kobject_put(&dev->dev.kobj); } @@ -6603,6 +6611,34 @@ static void __net_exit default_device_exit(struct net *net) rtnl_unlock(); } +static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) +{ + /* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace in net_list. 
+ */ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network @@ -6614,7 +6650,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) struct net *net; LIST_HEAD(dev_kill_list); - rtnl_lock(); + /* To prevent network device cleanup code from dereferencing + * loopback devices or network devices that have been freed + * wait here for all pending unregistrations to complete, + * before unregistring the loopback device and allowing the + * network namespace be freed. + * + * The netdev todo list containing all network devices + * unregistrations that happen in default_device_exit_batch + * will run in the rtnl_unlock() at the end of + * default_device_exit_batch. + */ + rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops) -- cgit v1.2.3