From 79c61899b5eee317907efd1b0d06a1ada0cc00d8 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 4 Feb 2025 18:03:10 +0100 Subject: net-sysfs: remove rtnl_trylock from device attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is an ABBA deadlock between net device unregistration and sysfs files being accessed[1][2]. To prevent this from happening all paths taking the rtnl lock after the sysfs one (actually kn->active refcount) use rtnl_trylock and return early (using restart_syscall)[3], which can make syscalls to spin for a long time when there is contention on the rtnl lock[4]. There are not many possibilities to improve the above: - Rework the entire net/ locking logic. - Invert two locks in one of the paths — not possible. But here it's actually possible to drop one of the locks safely: the kernfs_node refcount. More details in the code itself, which comes with lots of comments. Note that we check the device is alive in the added sysfs_rtnl_lock helper to disallow sysfs operations to run after device dismantle has started. This also help keeping the same behavior as before. Because of this calls to dev_isalive in sysfs ops were removed. [1] https://lore.kernel.org/netdev/49A4D5D5.5090602@trash.net/ [2] https://lore.kernel.org/netdev/m14oyhis31.fsf@fess.ebiederm.org/ [3] https://lore.kernel.org/netdev/20090226084924.16cb3e08@nehalam/ [4] https://lore.kernel.org/all/20210928125500.167943-1-atenart@kernel.org/T/ Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250204170314.146022-2-atenart@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 4bc2ee0b10b0..ccaaf4c7d5f6 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -43,6 +43,7 @@ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); +extern int rtnl_lock_interruptible(void); extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); -- cgit v1.2.3 From b7ecc1de51ca7d0a9fa8dbc3f756ab87b99a1838 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 4 Feb 2025 18:03:11 +0100 Subject: net-sysfs: move queue attribute groups outside the default groups Rx/tx queues embed their own kobject for registering their per-queue sysfs files. The issue is they're using the kobject default groups for this and entirely rely on the kobject refcounting for releasing their sysfs paths. In order to remove rtnl_trylock calls we need sysfs files not to rely on their associated kobject refcounting for their release. Thus we here move queues sysfs files from the kobject default groups to their own groups which can be removed separately. Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250204170314.146022-3-atenart@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + include/net/netdev_rx_queue.h | 1 + net/core/net-sysfs.c | 27 +++++++++++++++++++++------ 3 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2a59034a5fa2..1dcc76af7520 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -658,6 +658,7 @@ struct netdev_queue { struct Qdisc __rcu *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; + const struct attribute_group **groups; #endif unsigned long tx_maxrate; /* diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 596836abf7bf..af40842f229d 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -16,6 +16,7 @@ struct netdev_rx_queue { struct rps_dev_flow_table __rcu *rps_flow_table; #endif struct kobject kobj; + const struct attribute_group **groups; struct net_device *dev; netdevice_tracker dev_tracker; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e012234c739a..0b7ee260613d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1188,7 +1188,6 @@ static void rx_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, - .default_groups = rx_queue_default_groups, .namespace = rx_queue_namespace, .get_ownership = rx_queue_get_ownership, }; @@ -1222,20 +1221,27 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = rx_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) - goto err; + goto err_default_groups; } error = rx_queue_default_mask(dev, queue); if (error) - goto err; + goto err_default_groups; kobject_uevent(kobj, KOBJ_ADD); return error; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1280,12 +1286,14 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) } while (--i >= new_num) { - struct kobject *kobj = &dev->_rx[i].kobj; + struct netdev_rx_queue *queue = &dev->_rx[i]; + struct kobject *kobj = &queue->kobj; if (!refcount_read(&dev_net(dev)->ns.count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); + sysfs_remove_groups(kobj, queue->groups); kobject_put(kobj); } @@ -1872,7 +1880,6 @@ static void netdev_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, - .default_groups = netdev_queue_default_groups, .namespace = netdev_queue_namespace, .get_ownership = netdev_queue_get_ownership, }; @@ -1902,15 +1909,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = netdev_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (netdev_uses_bql(dev)) { error = sysfs_create_group(kobj, &dql_group); if (error) - goto err; + goto err_default_groups; } kobject_uevent(kobj, KOBJ_ADD); return 0; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1965,6 +1979,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) if (netdev_uses_bql(dev)) sysfs_remove_group(&queue->kobj, &dql_group); + sysfs_remove_groups(&queue->kobj, queue->groups); kobject_put(&queue->kobj); } -- cgit v1.2.3