From 1a57feb847c56d6193f67d0e892c24e71f9e3ab1 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:26:23 +0300 Subject: net: Introduce net_sem for protection of pernet_list Currently, the mutex is mostly used to protect pernet operations list. It orders setup_net() and cleanup_net() with parallel {un,}register_pernet_operations() calls, so ->exit{,batch} methods of the same pernet operations are executed for a dying net, as were used to call ->init methods, even after the net namespace is unlinked from net_namespace_list in cleanup_net(). But there are several problems with scalability. The first one is that more than one net can't be created or destroyed at the same moment on the node. For big machines with many cpus running many containers it's very sensitive. The second one is that it's need to synchronize_rcu() after net is removed from net_namespace_list(): Destroy net_ns: cleanup_net() mutex_lock(&net_mutex) list_del_rcu(&net->list) synchronize_rcu() <--- Sleep there for ages list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list) list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list) mutex_unlock(&net_mutex) This primitive is not fast, especially on the systems with many processors and/or when preemptible RCU is enabled in config. So, all the time, while cleanup_net() is waiting for RCU grace period, creation of new net namespaces is not possible, the tasks, who makes it, are sleeping on the same mutex: Create net_ns: copy_net_ns() mutex_lock_killable(&net_mutex) <--- Sleep there for ages I observed 20-30 seconds hangs of "unshare -n" on ordinary 8-cpu laptop with preemptible RCU enabled after CRIU tests round is finished. The solution is to convert net_mutex to the rw_semaphore and add fine grain locks to really small number of pernet_operations, what really need them. Then, pernet_operations::init/::exit methods, modifying the net-related data, will require down_read() locking only, while down_write() will be used for changing pernet_list (i.e., when modules are being loaded and unloaded). This gives signify performance increase, after all patch set is applied, like you may see here: %for i in {1..10000}; do unshare -n bash -c exit; done *before* real 1m40,377s user 0m9,672s sys 0m19,928s *after* real 0m17,007s user 0m5,311s sys 0m11,779 (5.8 times faster) This patch starts replacing net_mutex to net_sem. It adds rw_semaphore, describes the variables it protects, and makes to use, where appropriate. net_mutex is still present, and next patches will kick it out step-by-step. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/rtnetlink.h') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 1fdcde96eb65..e9ee9ad0a681 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -36,6 +36,7 @@ extern int rtnl_is_locked(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct mutex net_mutex; +extern struct rw_semaphore net_sem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); -- cgit v1.2.3 From 19efbd93e6fb05eab81856b4fc8d64211dd37088 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 12:58:38 +0300 Subject: net: Kill net_mutex We take net_mutex, when there are !async pernet_operations registered, and read locking of net_sem is not enough. But we may get rid of taking the mutex, and just change the logic to write lock net_sem in such cases. This obviously reduces the number of lock operations, we do. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 - include/net/net_namespace.h | 11 ++++++---- net/core/net_namespace.c | 53 +++++++++++++++++++++++++++------------------ 3 files changed, 39 insertions(+), 26 deletions(-) (limited to 'include/linux/rtnetlink.h') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index e9ee9ad0a681..3573b4bf2fdf 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -35,7 +35,6 @@ extern int rtnl_trylock(void); extern int rtnl_is_locked(void); extern wait_queue_head_t netdev_unregistering_wq; -extern struct mutex net_mutex; extern struct rw_semaphore net_sem; #ifdef CONFIG_PROVE_LOCKING diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 9158ec1ad06f..115b01b92f4d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -60,8 +60,11 @@ struct net { struct list_head list; /* list of network namespaces */ struct list_head cleanup_list; /* namespaces on death row */ - struct list_head exit_list; /* Use only net_mutex */ - + struct list_head exit_list; /* To linked to call pernet exit + * methods on dead net (net_sem + * read locked), or to unregister + * pernet ops (net_sem wr locked). + */ struct user_namespace *user_ns; /* Owning user namespace */ struct ucounts *ucounts; spinlock_t nsid_lock; @@ -89,7 +92,7 @@ struct net { /* core fib_rules */ struct list_head rules_ops; - struct list_head fib_notifier_ops; /* protected by net_mutex */ + struct list_head fib_notifier_ops; /* protected by net_sem */ struct net_device *loopback_dev; /* The loopback */ struct netns_core core; @@ -316,7 +319,7 @@ struct pernet_operations { /* * Indicates above methods are allowed to be executed in parallel * with methods of any other pernet_operations, i.e. they are not - * need synchronization via net_mutex. + * need write locked net_sem. */ bool async; }; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index bcab9a938d6f..e89a516620dd 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -29,8 +29,6 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; -/* Used only if there are !async pernet_operations registered */ -DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); @@ -407,6 +405,7 @@ struct net *copy_net_ns(unsigned long flags, { struct ucounts *ucounts; struct net *net; + unsigned write; int rv; if (!(flags & CLONE_NEWNET)) @@ -424,20 +423,26 @@ struct net *copy_net_ns(unsigned long flags, refcount_set(&net->passive, 1); net->ucounts = ucounts; get_user_ns(user_ns); - - rv = down_read_killable(&net_sem); +again: + write = READ_ONCE(nr_sync_pernet_ops); + if (write) + rv = down_write_killable(&net_sem); + else + rv = down_read_killable(&net_sem); if (rv < 0) goto put_userns; - if (nr_sync_pernet_ops) { - rv = mutex_lock_killable(&net_mutex); - if (rv < 0) - goto up_read; + + if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) { + up_read(&net_sem); + goto again; } rv = setup_net(net, user_ns); - if (nr_sync_pernet_ops) - mutex_unlock(&net_mutex); -up_read: - up_read(&net_sem); + + if (write) + up_write(&net_sem); + else + up_read(&net_sem); + if (rv < 0) { put_userns: put_user_ns(user_ns); @@ -485,15 +490,23 @@ static void cleanup_net(struct work_struct *work) struct net *net, *tmp, *last; struct list_head net_kill_list; LIST_HEAD(net_exit_list); + unsigned write; /* Atomically snapshot the list of namespaces to cleanup */ spin_lock_irq(&cleanup_list_lock); list_replace_init(&cleanup_list, &net_kill_list); spin_unlock_irq(&cleanup_list_lock); +again: + write = READ_ONCE(nr_sync_pernet_ops); + if (write) + down_write(&net_sem); + else + down_read(&net_sem); - down_read(&net_sem); - if (nr_sync_pernet_ops) - mutex_lock(&net_mutex); + if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) { + up_read(&net_sem); + goto again; + } /* Don't let anyone else find us. */ rtnl_lock(); @@ -528,14 +541,14 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); - if (nr_sync_pernet_ops) - mutex_unlock(&net_mutex); - /* Free the net generic variables */ list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); - up_read(&net_sem); + if (write) + up_write(&net_sem); + else + up_read(&net_sem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. @@ -563,8 +576,6 @@ static void cleanup_net(struct work_struct *work) void net_ns_barrier(void) { down_write(&net_sem); - mutex_lock(&net_mutex); - mutex_unlock(&net_mutex); up_write(&net_sem); } EXPORT_SYMBOL(net_ns_barrier); -- cgit v1.2.3 From 79ffdfc6522ae33d8a33e971070c08ee5f27439b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 14 Mar 2018 22:17:20 +0300 Subject: net: Add rtnl_lock_killable() rtnl_lock() is widely used mutex in kernel. Some of kernel code does memory allocations under it. In case of memory deficit this may invoke OOM killer, but the problem is a killed task can't exit if it's waiting for the mutex. This may be a reason of deadlock and panic. This patch adds a new primitive, which responds on SIGKILL, and it allows to use it in the places, where we don't want to sleep forever. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 + net/core/rtnetlink.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux/rtnetlink.h') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3573b4bf2fdf..562a175c35a9 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -33,6 +33,7 @@ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); +extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore net_sem; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 67f375cfb982..87079eaa871b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -75,6 +75,12 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +int rtnl_lock_killable(void) +{ + return mutex_lock_killable(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_lock_killable); + static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) { -- cgit v1.2.3 From 4420bf21fb6c0306e36ad58ade1e741fba57ce65 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 27 Mar 2018 18:02:23 +0300 Subject: net: Rename net_sem to pernet_ops_rwsem net_sem is some undefined area name, so it will be better to make the area more defined. Rename it to pernet_ops_rwsem for better readability and better intelligibility. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 2 +- include/net/net_namespace.h | 12 +++++++----- net/core/net_namespace.c | 40 ++++++++++++++++++++-------------------- net/core/rtnetlink.c | 4 ++-- 4 files changed, 30 insertions(+), 28 deletions(-) (limited to 'include/linux/rtnetlink.h') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 562a175c35a9..c7d1e4689325 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -36,7 +36,7 @@ extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; -extern struct rw_semaphore net_sem; +extern struct rw_semaphore pernet_ops_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 37bcf8382b61..922e8b6fb422 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -60,9 +60,10 @@ struct net { struct list_head list; /* list of network namespaces */ struct list_head exit_list; /* To linked to call pernet exit - * methods on dead net (net_sem - * read locked), or to unregister - * pernet ops (net_sem wr locked). + * methods on dead net ( + * pernet_ops_rwsem read locked), + * or to unregister pernet ops + * (pernet_ops_rwsem write locked). */ struct llist_node cleanup_list; /* namespaces on death row */ @@ -95,8 +96,9 @@ struct net { /* core fib_rules */ struct list_head rules_ops; - struct list_head fib_notifier_ops; /* protected by net_sem */ - + struct list_head fib_notifier_ops; /* Populated by + * register_pernet_subsys() + */ struct net_device *loopback_dev; /* The loopback */ struct netns_core core; struct netns_mib mib; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index eef17ad29dea..9e8ee4640451 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -41,10 +41,10 @@ EXPORT_SYMBOL(init_net); static bool init_net_initialized; /* - * net_sem: protects: pernet_list, net_generic_ids, + * pernet_ops_rwsem: protects: pernet_list, net_generic_ids, * init_net_initialized and first_device pointer. */ -DECLARE_RWSEM(net_sem); +DECLARE_RWSEM(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) @@ -72,7 +72,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, - lockdep_is_held(&net_sem)); + lockdep_is_held(&pernet_ops_rwsem)); if (old_ng->s.len > id) { old_ng->ptr[id] = data; return 0; @@ -289,7 +289,7 @@ struct net *get_net_ns_by_id(struct net *net, int id) */ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) { - /* Must be called with net_sem held */ + /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops, *saved_ops; int error = 0; LIST_HEAD(net_exit_list); @@ -422,13 +422,13 @@ struct net *copy_net_ns(unsigned long flags, net->ucounts = ucounts; get_user_ns(user_ns); - rv = down_read_killable(&net_sem); + rv = down_read_killable(&pernet_ops_rwsem); if (rv < 0) goto put_userns; rv = setup_net(net, user_ns); - up_read(&net_sem); + up_read(&pernet_ops_rwsem); if (rv < 0) { put_userns: @@ -480,7 +480,7 @@ static void cleanup_net(struct work_struct *work) /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); - down_read(&net_sem); + down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ rtnl_lock(); @@ -519,7 +519,7 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); - up_read(&net_sem); + up_read(&pernet_ops_rwsem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. @@ -546,8 +546,8 @@ static void cleanup_net(struct work_struct *work) */ void net_ns_barrier(void) { - down_write(&net_sem); - up_write(&net_sem); + down_write(&pernet_ops_rwsem); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL(net_ns_barrier); @@ -869,12 +869,12 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); - down_write(&net_sem); + down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); init_net_initialized = true; - up_write(&net_sem); + up_write(&pernet_ops_rwsem); register_pernet_subsys(&net_ns_ops); @@ -1013,9 +1013,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops) int register_pernet_subsys(struct pernet_operations *ops) { int error; - down_write(&net_sem); + down_write(&pernet_ops_rwsem); error = register_pernet_operations(first_device, ops); - up_write(&net_sem); + up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_subsys); @@ -1031,9 +1031,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys); */ void unregister_pernet_subsys(struct pernet_operations *ops) { - down_write(&net_sem); + down_write(&pernet_ops_rwsem); unregister_pernet_operations(ops); - up_write(&net_sem); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); @@ -1059,11 +1059,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys); int register_pernet_device(struct pernet_operations *ops) { int error; - down_write(&net_sem); + down_write(&pernet_ops_rwsem); error = register_pernet_operations(&pernet_list, ops); if (!error && (first_device == &pernet_list)) first_device = &ops->list; - up_write(&net_sem); + up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_device); @@ -1079,11 +1079,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device); */ void unregister_pernet_device(struct pernet_operations *ops) { - down_write(&net_sem); + down_write(&pernet_ops_rwsem); if (&ops->list == first_device) first_device = first_device->next; unregister_pernet_operations(ops); - up_write(&net_sem); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_device); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 31438b63d4b4..73011a60434c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -460,11 +460,11 @@ static void rtnl_lock_unregistering_all(void) void rtnl_link_unregister(struct rtnl_link_ops *ops) { /* Close the race with cleanup_net() */ - down_write(&net_sem); + down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); - up_write(&net_sem); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); -- cgit v1.2.3 From f0b07bb151b098d291fd1fd71ef7a2df56fb124a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:20:32 +0300 Subject: net: Introduce net_rwsem to protect net_namespace_list rtnl_lock() is used everywhere, and contention is very high. When someone wants to iterate over alive net namespaces, he/she has no a possibility to do that without exclusive lock. But the exclusive rtnl_lock() in such places is overkill, and it just increases the contention. Yes, there is already for_each_net_rcu() in kernel, but it requires rcu_read_lock(), and this can't be sleepable. Also, sometimes it may be need really prevent net_namespace_list growth, so for_each_net_rcu() is not fit there. This patch introduces new rw_semaphore, which will be used instead of rtnl_mutex to protect net_namespace_list. It is sleepable and allows not-exclusive iterations over net namespaces list. It allows to stop using rtnl_lock() in several places (what is made in next patches) and makes less the time, we keep rtnl_mutex. Here we just add new lock, while the explanation of we can remove rtnl_lock() there are in next patches. Fine grained locks generally are better, then one big lock, so let's do that with net_namespace_list, while the situation allows that. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- drivers/infiniband/core/roce_gid_mgmt.c | 2 ++ include/linux/rtnetlink.h | 1 + include/net/net_namespace.h | 1 + net/core/dev.c | 5 +++++ net/core/fib_notifier.c | 2 ++ net/core/net_namespace.c | 18 +++++++++++++----- net/core/rtnetlink.c | 5 +++++ net/netfilter/nf_conntrack_core.c | 2 ++ net/openvswitch/datapath.c | 2 ++ net/wireless/wext-core.c | 2 ++ security/selinux/include/xfrm.h | 2 ++ 11 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include/linux/rtnetlink.h') diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 5a52ec77940a..cc2966380c0c 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -403,10 +403,12 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, * our feet */ rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev)) add_netdev_ips(ib_dev, port, rdma_ndev, ndev); + up_read(&net_rwsem); rtnl_unlock(); } diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c7d1e4689325..5225832bd6ff 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -37,6 +37,7 @@ extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore pernet_ops_rwsem; +extern struct rw_semaphore net_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 1ab4f920f109..47e35cce3b64 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -291,6 +291,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif } +/* Protected by net_rwsem */ #define for_each_net(VAR) \ list_for_each_entry(VAR, &net_namespace_list, list) diff --git a/net/core/dev.c b/net/core/dev.c index e13807b5c84d..eca5458b2753 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1629,6 +1629,7 @@ int register_netdevice_notifier(struct notifier_block *nb) goto unlock; if (dev_boot_phase) goto unlock; + down_read(&net_rwsem); for_each_net(net) { for_each_netdev(net, dev) { err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); @@ -1642,6 +1643,7 @@ int register_netdevice_notifier(struct notifier_block *nb) call_netdevice_notifier(nb, NETDEV_UP, dev); } } + up_read(&net_rwsem); unlock: rtnl_unlock(); @@ -1664,6 +1666,7 @@ rollback: } outroll: + up_read(&net_rwsem); raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } @@ -1694,6 +1697,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; + down_read(&net_rwsem); for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { @@ -1704,6 +1708,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } + up_read(&net_rwsem); unlock: rtnl_unlock(); return err; diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 0c048bdeb016..614b985c92a4 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -33,6 +33,7 @@ static unsigned int fib_seq_sum(void) struct net *net; rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) { rcu_read_lock(); list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { @@ -43,6 +44,7 @@ static unsigned int fib_seq_sum(void) } rcu_read_unlock(); } + up_read(&net_rwsem); rtnl_unlock(); return fib_seq; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b5796d17a302..7fdf321d4997 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -33,6 +33,10 @@ static struct list_head *first_device = &pernet_list; LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); +/* Protects net_namespace_list. Nests iside rtnl_lock() */ +DECLARE_RWSEM(net_rwsem); +EXPORT_SYMBOL_GPL(net_rwsem); + struct net init_net = { .count = REFCOUNT_INIT(1), .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), @@ -309,9 +313,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) if (error < 0) goto out_undo; } - rtnl_lock(); + down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); - rtnl_unlock(); + up_write(&net_rwsem); out: return error; @@ -450,7 +454,7 @@ static void unhash_nsid(struct net *net, struct net *last) * and this work is the only process, that may delete * a net from net_namespace_list. So, when the below * is executing, the list may only grow. Thus, we do not - * use for_each_net_rcu() or rtnl_lock(). + * use for_each_net_rcu() or net_rwsem. */ for_each_net(tmp) { int id; @@ -485,7 +489,7 @@ static void cleanup_net(struct work_struct *work) down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ - rtnl_lock(); + down_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) list_del_rcu(&net->list); /* Cache last net. After we unlock rtnl, no one new net @@ -499,7 +503,7 @@ static void cleanup_net(struct work_struct *work) * useless anyway, as netns_ids are destroyed there. */ last = list_last_entry(&net_namespace_list, struct net, list); - rtnl_unlock(); + up_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) { unhash_nsid(net, last); @@ -900,6 +904,9 @@ static int __register_pernet_operations(struct list_head *list, list_add_tail(&ops->list, list); if (ops->init || (ops->id && ops->size)) { + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. + */ for_each_net(net) { error = ops_init(ops, net); if (error) @@ -923,6 +930,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) LIST_HEAD(net_exit_list); list_del(&ops->list); + /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); ops_exit_list(ops, &net_exit_list); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2d3949789cef..e86b28482ca7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -418,9 +418,11 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; + down_read(&net_rwsem); for_each_net(net) { __rtnl_kill_links(net, ops); } + up_read(&net_rwsem); list_del(&ops->list); } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); @@ -438,6 +440,9 @@ static void rtnl_lock_unregistering_all(void) for (;;) { unregistering = false; rtnl_lock(); + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. + */ for_each_net(net) { if (net->dev_unreg_count > 0) { unregistering = true; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 705198de671d..370f9b7f051b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1764,12 +1764,14 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) struct net *net; rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) { if (atomic_read(&net->ct.count) == 0) continue; __nf_ct_unconfirmed_destroy(net); nf_queue_nf_hook_drop(net); } + up_read(&net_rwsem); rtnl_unlock(); /* Need to wait for netns cleanup worker to finish, if its diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ef38e5aecd28..9746ee30a99b 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2364,8 +2364,10 @@ static void __net_exit ovs_exit_net(struct net *dnet) __dp_destroy(dp); rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) list_vports_from_net(net, dnet, &head); + up_read(&net_rwsem); rtnl_unlock(); /* Detach all vports from given namespace. */ diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 9efbfc753347..544d7b62d7ca 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -349,11 +349,13 @@ void wireless_nlevent_flush(void) ASSERT_RTNL(); + down_read(&net_rwsem); for_each_net(net) { while ((skb = skb_dequeue(&net->wext_nlevents))) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); } + up_read(&net_rwsem); } EXPORT_SYMBOL_GPL(wireless_nlevent_flush); diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 1f173a7a4daa..31d66431be1e 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -48,8 +48,10 @@ static inline void selinux_xfrm_notify_policyload(void) struct net *net; rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) rt_genid_bump_all(net); + up_read(&net_rwsem); rtnl_unlock(); } #else -- cgit v1.2.3