From 80d19669ecd34423e85ca04f2210b0e42a47cb16 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:26:41 -0700 Subject: net: Refactor XPS for CPUs and Rx queues Refactor XPS code to support Tx queue selection based on CPU(s) map or Rx queue(s) map. Signed-off-by: Amritha Nambiar Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/net-sysfs.c') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bb7e80f4ced3..b39987c81d53 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, return -ENOMEM; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); + dev_maps = rcu_dereference(dev->xps_cpus_map); if (dev_maps) { for_each_possible_cpu(cpu) { int i, tci = cpu * num_tc + tc; struct xps_map *map; - map = rcu_dereference(dev_maps->cpu_map[tci]); + map = rcu_dereference(dev_maps->attr_map[tci]); if (!map) continue; -- cgit v1.2.3 From 8af2c06ff4b144064b51b7f688194474123d9c9c Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:27:07 -0700 Subject: net-sysfs: Add interface for Rx queue(s) map per Tx queue Extend transmit queue sysfs attribute to configure Rx queue(s) map per Tx queue. By default no receive queues are configured for the Tx queue. - /sys/class/net/eth0/queues/tx-*/xps_rxqs Signed-off-by: Amritha Nambiar Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) (limited to 'net/core/net-sysfs.c') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b39987c81d53..f25ac5ff48a6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1283,6 +1283,88 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); + +static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + unsigned long *mask, index; + int j, len, num_tc = 1, tc = 0; + + index = get_netdev_queue_index(queue); + + if (dev->num_tc) { + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), + GFP_KERNEL); + if (!mask) + return -ENOMEM; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_rxqs_map); + if (!dev_maps) + goto out_no_maps; + + for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), + j < dev->num_rx_queues;) { + int i, tci = j * num_tc + tc; + struct xps_map *map; + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; + + for (i = map->len; i--;) { + if (map->queues[i] == index) { + set_bit(j, mask); + break; + } + } + } +out_no_maps: + rcu_read_unlock(); + + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); + kfree(mask); + + return len < PAGE_SIZE ? 
len : -EINVAL; +} + +static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, + size_t len) +{ + struct net_device *dev = queue->dev; + struct net *net = dev_net(dev); + unsigned long *mask, index; + int err; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), + GFP_KERNEL); + if (!mask) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, mask, dev->num_rx_queues); + if (err) { + kfree(mask); + return err; + } + + err = __netif_set_xps_queue(dev, mask, index, true); + kfree(mask); + return err ? : len; +} + +static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init + = __ATTR_RW(xps_rxqs); #endif /* CONFIG_XPS */ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { @@ -1290,6 +1372,7 @@ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, + &xps_rxqs_attribute.attr, &queue_tx_maxrate.attr, #endif NULL -- cgit v1.2.3 From d7be97756f8a4874ac17003de5843c742dd84153 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:32 -0400 Subject: net-sysfs: Drop support for XPS and traffic_class on single queue device This patch makes it so that we do not report the traffic class or allow XPS configuration on single queue devices. This is mostly to avoid unnecessary complexity with changes I have planned that will allow us to reuse the unused tc_to_txq and XPS configuration on a single queue device to allow it to make use of a subset of queues on an underlying device. 
Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- net/core/net-sysfs.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net/core/net-sysfs.c') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f25ac5ff48a6..dce3ae0fbca2 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1047,9 +1047,14 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - int index = get_netdev_queue_index(queue); - int tc = netdev_txq_to_tc(dev, index); + int index; + int tc; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + + index = get_netdev_queue_index(queue); + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; @@ -1214,6 +1219,9 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, cpumask_var_t mask; unsigned long index; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + index = get_netdev_queue_index(queue); if (dev->num_tc) { @@ -1260,6 +1268,9 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, cpumask_var_t mask; int err; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + if (!capable(CAP_NET_ADMIN)) return -EPERM; -- cgit v1.2.3 From ffcfe25bb50f27395e15fa999f1a7eb769f55360 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:38 -0400 Subject: net: Add support for subordinate device traffic classes This patch is meant to provide the basic tools needed to allow us to create subordinate device traffic classes. The general idea here is to allow subdividing the queues of a device into queue groups accessible through an upper device such as a macvlan. The idea here is to enforce the idea that an upper device has to be a single queue device, ideally with IFF_NO_QUEUE set. With that being the case we can pretty much guarantee that the tc_to_txq mappings and XPS maps for the upper device are unused. 
As such we could reuse those in order to support subdividing the lower device and distributing those queues between the subordinate devices. In order to distinguish between a regular set of traffic classes and if a device is carrying subordinate traffic classes I changed num_tc from a u8 to a s16 value and use the negative values to represent the subordinate pool values. So starting at -1 and running to -32768 we can encode those as pool values, and the existing values of 0 to 15 can be maintained. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 16 ++++++++- net/core/dev.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++ net/core/net-sysfs.c | 21 ++++++++++- 3 files changed, 124 insertions(+), 2 deletions(-) (limited to 'net/core/net-sysfs.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b683971e500d..b1ff77276bc4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -575,6 +575,9 @@ struct netdev_queue { * (/sys/class/net/DEV/Q/trans_timeout) */ unsigned long trans_timeout; + + /* Subordinate device that the queue has been assigned to */ + struct net_device *sb_dev; /* * write-mostly part */ @@ -1991,7 +1994,7 @@ struct net_device { #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif - u8 num_tc; + s16 num_tc; struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; u8 prio_tc_map[TC_BITMASK + 1]; @@ -2045,6 +2048,17 @@ int netdev_get_num_tc(struct net_device *dev) return dev->num_tc; } +void netdev_unbind_sb_channel(struct net_device *dev, + struct net_device *sb_dev); +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset); +int netdev_set_sb_channel(struct net_device *dev, u16 channel); +static inline int netdev_get_sb_channel(struct net_device *dev) +{ + return max_t(int, -dev->num_tc, 0); +} + static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, 
unsigned int index) diff --git a/net/core/dev.c b/net/core/dev.c index 89825c1eccdc..cc1d6bba017a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2067,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; + /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } + /* didn't find it, just return -1 to indicate no match */ return -1; } @@ -2260,7 +2262,14 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, unsigned int nr_ids; if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; @@ -2448,11 +2457,25 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, EXPORT_SYMBOL(netif_set_xps_queue); #endif +static void netdev_unbind_all_sb_channels(struct net_device *dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + + /* Unbind any subordinate channels */ + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev) + netdev_unbind_sb_channel(dev, txq->sb_dev); + } +} + void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif + netdev_unbind_all_sb_channels(dev); + + /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); @@ -2481,11 +2504,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc) #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif + netdev_unbind_all_sb_channels(dev); + dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); +void netdev_unbind_sb_channel(struct net_device *dev, + struct 
net_device *sb_dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(sb_dev, 0); +#endif + memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); + memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); + + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev == sb_dev) + txq->sb_dev = NULL; + } +} +EXPORT_SYMBOL(netdev_unbind_sb_channel); + +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset) +{ + /* Make certain the sb_dev and dev are already configured */ + if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) + return -EINVAL; + + /* We cannot hand out queues we don't have */ + if ((offset + count) > dev->real_num_tx_queues) + return -EINVAL; + + /* Record the mapping */ + sb_dev->tc_to_txq[tc].count = count; + sb_dev->tc_to_txq[tc].offset = offset; + + /* Provide a way for Tx queue to find the tc_to_txq map or + * XPS map for itself. + */ + while (count--) + netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; + + return 0; +} +EXPORT_SYMBOL(netdev_bind_sb_channel_queue); + +int netdev_set_sb_channel(struct net_device *dev, u16 channel) +{ + /* Do not use a multiqueue device to represent a subordinate channel */ + if (netif_is_multiqueue(dev)) + return -ENODEV; + + /* We allow channels 1 - 32767 to be used for subordinate channels. + * Channel 0 is meant to be "native" mode and used only to represent + * the main root device. We allow writing 0 to reset the device back + * to normal mode after being used as a subordinate channel. + */ + if (channel > S16_MAX) + return -EINVAL; + + dev->num_tc = -channel; + + return 0; +} +EXPORT_SYMBOL(netdev_set_sb_channel); + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index dce3ae0fbca2..ffa1d18f2c2c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1054,11 +1054,23 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, return -ENOENT; index = get_netdev_queue_index(queue); + + /* If queue belongs to subordinate dev use its TC mapping */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; - return sprintf(buf, "%u\n", tc); + /* We can report the traffic class one of two ways: + * Subordinate device traffic classes are reported with the traffic + * class first, and then the subordinate class so for example TC0 on + * subordinate device 2 will be reported as "0-2". If the queue + * belongs to the root device it will be reported with just the + * traffic class, so just "0" for TC 0 for example. + */ + return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) : + sprintf(buf, "%u\n", tc); } #ifdef CONFIG_XPS @@ -1225,7 +1237,14 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, index = get_netdev_queue_index(queue); if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; -- cgit v1.2.3 From 3033fced2f689d4a870b3ba6a8a676db1261d262 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Jul 2018 21:56:51 +0000 Subject: net-sysfs: require net admin in the init ns for setting tx_maxrate An upcoming change will allow container root to open some /sys/class/net files for writing. The tx_maxrate attribute can result in changes to actual hardware devices so err on the side of caution by requiring CAP_NET_ADMIN in the init namespace in the corresponding attribute store operation. 
Signed-off-by: Tyler Hicks Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core/net-sysfs.c') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ffa1d18f2c2c..405c41ecb20b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1087,6 +1087,9 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, int err, index = get_netdev_queue_index(queue); u32 rate = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = kstrtou32(buf, 10, &rate); if (err < 0) return err; -- cgit v1.2.3 From b0e37c0d8a6abed0cd1b611314a7ebf50b0a8ed4 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 20 Jul 2018 21:56:52 +0000 Subject: net-sysfs: make sure objects belong to container's owner When creating various objects in /sys/class/net/... make sure that they belong to container's owner instead of global root (if they belong to a container/namespace). Co-Developed-by: Tyler Hicks Signed-off-by: Dmitry Torokhov Signed-off-by: Tyler Hicks Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'net/core/net-sysfs.c') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 405c41ecb20b..ada065fc685e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -656,6 +656,24 @@ static const struct attribute_group wireless_group = { #define net_class_groups NULL #endif /* CONFIG_SYSFS */ +static void net_ns_get_ownership(const struct net *net, + kuid_t *uid, kgid_t *gid) +{ + if (net) { + kuid_t ns_root_uid = make_kuid(net->user_ns, 0); + kgid_t ns_root_gid = make_kgid(net->user_ns, 0); + + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; + } else { + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; + } +} + #ifdef CONFIG_SYSFS #define to_rx_queue_attr(_attr) \ container_of(_attr, struct rx_queue_attribute, attr) @@ -905,11 +923,20 @@ static const void *rx_queue_namespace(struct kobject *kobj) return ns; } +static void rx_queue_get_ownership(struct kobject *kobj, + kuid_t *uid, kgid_t *gid) +{ + const struct net *net = rx_queue_namespace(kobj); + + net_ns_get_ownership(net, uid, gid); +} + static struct kobj_type rx_queue_ktype __ro_after_init = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, .default_attrs = rx_queue_default_attrs, - .namespace = rx_queue_namespace + .namespace = rx_queue_namespace, + .get_ownership = rx_queue_get_ownership, }; static int rx_queue_add_kobject(struct net_device *dev, int index) @@ -1431,11 +1458,20 @@ static const void *netdev_queue_namespace(struct kobject *kobj) return ns; } +static void netdev_queue_get_ownership(struct kobject *kobj, + kuid_t *uid, kgid_t *gid) +{ + const struct net *net = netdev_queue_namespace(kobj); + + net_ns_get_ownership(net, uid, gid); +} + static struct kobj_type netdev_queue_ktype __ro_after_init = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, 
.default_attrs = netdev_queue_default_attrs, .namespace = netdev_queue_namespace, + .get_ownership = netdev_queue_get_ownership, }; static int netdev_queue_add_kobject(struct net_device *dev, int index) @@ -1625,6 +1661,14 @@ static const void *net_namespace(struct device *d) return dev_net(dev); } +static void net_get_ownership(struct device *d, kuid_t *uid, kgid_t *gid) +{ + struct net_device *dev = to_net_dev(d); + const struct net *net = dev_net(dev); + + net_ns_get_ownership(net, uid, gid); +} + static struct class net_class __ro_after_init = { .name = "net", .dev_release = netdev_release, @@ -1632,6 +1676,7 @@ static struct class net_class __ro_after_init = { .dev_uevent = netdev_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, + .get_ownership = net_get_ownership, }; #ifdef CONFIG_OF_NET -- cgit v1.2.3 From fbdeaed408cf2728c62640c10848ddb1b67e63d3 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Jul 2018 21:56:53 +0000 Subject: net: create reusable function for getting ownership info of sysfs inodes Make net_ns_get_ownership() reusable by networking code outside of core. This is useful, for example, to allow bridge related sysfs files to be owned by container root. Add a function comment since this is a potentially dangerous function to use given the way that kobject_get_ownership() works by initializing uid and gid before calling .get_ownership(). Signed-off-by: Tyler Hicks Signed-off-by: David S. 
Miller --- include/net/net_namespace.h | 10 ++++++++++ net/core/net-sysfs.c | 18 ------------------ net/core/net_namespace.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 18 deletions(-) (limited to 'net/core/net-sysfs.c') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index a71264d75d7f..9b5fdc50519a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -170,6 +171,8 @@ extern struct net init_net; struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net); +void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); + void net_ns_barrier(void); #else /* CONFIG_NET_NS */ #include @@ -182,6 +185,13 @@ static inline struct net *copy_net_ns(unsigned long flags, return old_net; } +static inline void net_ns_get_ownership(const struct net *net, + kuid_t *uid, kgid_t *gid) +{ + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; +} + static inline void net_ns_barrier(void) {} #endif /* CONFIG_NET_NS */ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ada065fc685e..0a95bcf64cdc 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -656,24 +656,6 @@ static const struct attribute_group wireless_group = { #define net_class_groups NULL #endif /* CONFIG_SYSFS */ -static void net_ns_get_ownership(const struct net *net, - kuid_t *uid, kgid_t *gid) -{ - if (net) { - kuid_t ns_root_uid = make_kuid(net->user_ns, 0); - kgid_t ns_root_gid = make_kgid(net->user_ns, 0); - - if (uid_valid(ns_root_uid)) - *uid = ns_root_uid; - - if (gid_valid(ns_root_gid)) - *gid = ns_root_gid; - } else { - *uid = GLOBAL_ROOT_UID; - *gid = GLOBAL_ROOT_GID; - } -} - #ifdef CONFIG_SYSFS #define to_rx_queue_attr(_attr) \ container_of(_attr, struct rx_queue_attribute, attr) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a11e03f920d3..738871af5efa 100644 --- 
a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -448,6 +449,33 @@ dec_ucounts: return net; } +/** + * net_ns_get_ownership - get sysfs ownership data for @net + * @net: network namespace in question (can be NULL) + * @uid: kernel user ID for sysfs objects + * @gid: kernel group ID for sysfs objects + * + * Returns the uid/gid pair of root in the user namespace associated with the + * given network namespace. + */ +void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) +{ + if (net) { + kuid_t ns_root_uid = make_kuid(net->user_ns, 0); + kgid_t ns_root_gid = make_kgid(net->user_ns, 0); + + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; + } else { + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; + } +} +EXPORT_SYMBOL_GPL(net_ns_get_ownership); + static void unhash_nsid(struct net *net, struct net *last) { struct net *tmp; -- cgit v1.2.3 From 4d99f6602cb552fb58db0c3b1d935bb6fa017f24 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 8 Aug 2018 20:07:35 -0700 Subject: net: allow to call netif_reset_xps_queues() under cpus_read_lock The definition of static_key_slow_inc() has cpus_read_lock in place. In the virtio_net driver, XPS queues are initialized after setting the queue:cpu affinity in virtnet_set_affinity() which is already protected within cpus_read_lock. Lockdep prints a warning when we are trying to acquire cpus_read_lock when it is already held. This patch adds an ability to call __netif_set_xps_queue under cpus_read_lock(). 
Acked-by: Jason Wang ============================================ WARNING: possible recursive locking detected 4.18.0-rc3-next-20180703+ #1 Not tainted -------------------------------------------- swapper/0/1 is trying to acquire lock: 00000000cf973d46 (cpu_hotplug_lock.rw_sem){++++}, at: static_key_slow_inc+0xe/0x20 but task is already holding lock: 00000000cf973d46 (cpu_hotplug_lock.rw_sem){++++}, at: init_vqs+0x513/0x5a0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(cpu_hotplug_lock.rw_sem); lock(cpu_hotplug_lock.rw_sem); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by swapper/0/1: #0: 00000000244bc7da (&dev->mutex){....}, at: __driver_attach+0x5a/0x110 #1: 00000000cf973d46 (cpu_hotplug_lock.rw_sem){++++}, at: init_vqs+0x513/0x5a0 #2: 000000005cd8463f (xps_map_mutex){+.+.}, at: __netif_set_xps_queue+0x8d/0xc60 v2: move cpus_read_lock() out of __netif_set_xps_queue() Cc: "Nambiar, Amritha" Cc: "Michael S. Tsirkin" Cc: Jason Wang Fixes: 8af2c06ff4b1 ("net-sysfs: Add interface for Rx queue(s) map per Tx queue") Signed-off-by: Andrei Vagin Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 4 +++- net/core/dev.c | 20 +++++++++++++++----- net/core/net-sysfs.c | 4 ++++ 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'net/core/net-sysfs.c') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 62311dde6e71..39a7f4452587 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1903,9 +1903,11 @@ static void virtnet_set_affinity(struct virtnet_info *vi) i = 0; for_each_online_cpu(cpu) { + const unsigned long *mask = cpumask_bits(cpumask_of(cpu)); + virtqueue_set_affinity(vi->rq[i].vq, cpu); virtqueue_set_affinity(vi->sq[i].vq, cpu); - netif_set_xps_queue(vi->dev, cpumask_of(cpu), i); + __netif_set_xps_queue(vi->dev, mask, i, false); i++; } diff --git a/net/core/dev.c b/net/core/dev.c index f68122f0ab02..325fc5088370 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2176,6 +2176,7 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, if (!static_key_false(&xps_needed)) return; + cpus_read_lock(); mutex_lock(&xps_map_mutex); if (static_key_false(&xps_rxqs_needed)) { @@ -2199,10 +2200,11 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, out_no_maps: if (static_key_enabled(&xps_rxqs_needed)) - static_key_slow_dec(&xps_rxqs_needed); + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - static_key_slow_dec(&xps_needed); + static_key_slow_dec_cpuslocked(&xps_needed); mutex_unlock(&xps_map_mutex); + cpus_read_unlock(); } static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) @@ -2250,6 +2252,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, return new_map; } +/* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, bool is_rxqs_map) { @@ -2317,9 +2320,9 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!new_dev_maps) goto out_no_new_maps; - static_key_slow_inc(&xps_needed); + 
static_key_slow_inc_cpuslocked(&xps_needed); if (is_rxqs_map) - static_key_slow_inc(&xps_rxqs_needed); + static_key_slow_inc_cpuslocked(&xps_rxqs_needed); for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { @@ -2448,11 +2451,18 @@ error: kfree(new_dev_maps); return -ENOMEM; } +EXPORT_SYMBOL_GPL(__netif_set_xps_queue); int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { - return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); + int ret; + + cpus_read_lock(); + ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); + cpus_read_unlock(); + + return ret; } EXPORT_SYMBOL(netif_set_xps_queue); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 0a95bcf64cdc..bd67c4d0fcfd 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -1400,7 +1401,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } + cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, true); + cpus_read_unlock(); + kfree(mask); return err ? : len; } -- cgit v1.2.3