65 files changed, 3173 insertions, 867 deletions
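[Editorial note, not part of the patch: many of the hunks below simply rename SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU. As a reminder of the pattern that flag supports, here is a minimal sketch of a type-safe lookup; struct foo, foo_cache, foo_hash_find(), and the key-based revalidation are illustrative assumptions, not taken from this series. Because only the slab page, not the object, is guaranteed to survive the grace period, the reader must lock the object and recheck its identity after finding it.]

	struct foo {
		int key;
		spinlock_t lock;
	};

	static struct kmem_cache *foo_cache;

	void foo_cache_init(void)
	{
		foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
					      SLAB_TYPESAFE_BY_RCU, NULL);
	}

	/* Returns the object locked, or NULL if not found (or reused). */
	struct foo *foo_lookup(int key)
	{
		struct foo *obj;

		rcu_read_lock();
		obj = foo_hash_find(key);	/* hypothetical RCU-safe hash walk */
		if (obj) {
			spin_lock(&obj->lock);
			if (obj->key != key) {
				/* Object was freed and reused for another key. */
				spin_unlock(&obj->lock);
				obj = NULL;
			}
		}
		rcu_read_unlock();
		return obj;
	}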
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index f773a264ae02..1672573b037a 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -17,7 +17,7 @@ rcu_dereference.txt rcubarrier.txt - RCU and Unloadable Modules rculist_nulls.txt - - RCU list primitives for use with SLAB_DESTROY_BY_RCU + - RCU list primitives for use with SLAB_TYPESAFE_BY_RCU rcuref.txt - Reference-count design for elements of lists/arrays protected by RCU rcu.txt diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 2ab38ee420c5..38d6d800761f 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -1185,6 +1185,9 @@ Its fields are as follows: 1 int dynticks_nesting; 2 int dynticks_nmi_nesting; 3 atomic_t dynticks; + 4 bool rcu_need_heavy_qs; + 5 unsigned long rcu_qs_ctr; + 6 bool rcu_urgent_qs; </pre> <p>The <tt>->dynticks_nesting</tt> field counts the @@ -1198,11 +1201,32 @@ NMIs are counted by the <tt>->dynticks_nmi_nesting</tt> field, except that NMIs that interrupt non-dyntick-idle execution are not counted. -</p><p>Finally, the <tt>->dynticks</tt> field counts the corresponding +</p><p>The <tt>->dynticks</tt> field counts the corresponding CPU's transitions to and from dyntick-idle mode, so that this counter has an even value when the CPU is in dyntick-idle mode and an odd value otherwise. +</p><p>The <tt>->rcu_need_heavy_qs</tt> field is used +to record the fact that the RCU core code would really like to +see a quiescent state from the corresponding CPU, so much so that +it is willing to call for heavy-weight dyntick-counter operations. +This flag is checked by RCU's context-switch and <tt>cond_resched()</tt> +code, which provide a momentary idle sojourn in response. + +</p><p>The <tt>->rcu_qs_ctr</tt> field is used to record +quiescent states from <tt>cond_resched()</tt>. +Because <tt>cond_resched()</tt> can execute quite frequently, this +must be quite lightweight, as in a non-atomic increment of this +per-CPU field. + +</p><p>Finally, the <tt>->rcu_urgent_qs</tt> field is used to record +the fact that the RCU core code would really like to see a quiescent +state from the corresponding CPU, with the various other fields indicating +just how badly RCU wants this quiescent state. +This flag is checked by RCU's context-switch and <tt>cond_resched()</tt> +code, which, if nothing else, non-atomically increment <tt>->rcu_qs_ctr</tt> +in response. + <table> <tr><th> </th></tr> <tr><th align="left">Quick Quiz:</th></tr> diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt index 18f9651ff23d..8151f0195f76 100644 --- a/Documentation/RCU/rculist_nulls.txt +++ b/Documentation/RCU/rculist_nulls.txt @@ -1,5 +1,5 @@ Using hlist_nulls to protect read-mostly linked lists and -objects using SLAB_DESTROY_BY_RCU allocations. +objects using SLAB_TYPESAFE_BY_RCU allocations. Please read the basics in Documentation/RCU/listRCU.txt @@ -7,7 +7,7 @@ Using special makers (called 'nulls') is a convenient way to solve following problem : A typical RCU linked list managing objects which are -allocated with SLAB_DESTROY_BY_RCU kmem_cache can +allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can use following algos : 1) Lookup algo @@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock() 3) Remove algo -------------- Nothing special here, we can use a standard RCU hlist deletion. 
-But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused +But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused very very fast (before the end of RCU grace period) if (put_last_reference_on(obj) { diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 8c131a1c62ea..8ed6c9f6133c 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -928,7 +928,8 @@ d. Do you need RCU grace periods to complete even in the face e. Is your workload too update-intensive for normal use of RCU, but inappropriate for other synchronization mechanisms? - If so, consider SLAB_DESTROY_BY_RCU. But please be careful! + If so, consider SLAB_TYPESAFE_BY_RCU (which was originally + named SLAB_DESTROY_BY_RCU). But please be careful! f. Do you need read-side critical sections that are respected even though they are in the middle of the idle loop, during diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..adefaf344239 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -320,6 +320,9 @@ config HAVE_CMPXCHG_LOCAL config HAVE_CMPXCHG_DOUBLE bool +config ARCH_WEAK_RELEASE_ACQUIRE + bool + config ARCH_WANT_IPC_PARSE_VERSION bool diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 97a8bc8a095c..7a5c9b764cd2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -99,6 +99,7 @@ config PPC select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_WANT_IPC_PARSE_VERSION + select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 6908123162d1..3b668895ac24 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -4552,7 +4552,7 @@ i915_gem_load_init(struct drm_i915_private *dev_priv) dev_priv->requests = KMEM_CACHE(drm_i915_gem_request, SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_DESTROY_BY_RCU); + SLAB_TYPESAFE_BY_RCU); if (!dev_priv->requests) goto err_vmas; diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index ea511f06efaf..9ee2750e1dde 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -493,7 +493,7 @@ static inline struct drm_i915_gem_request * __i915_gem_active_get_rcu(const struct i915_gem_active *active) { /* Performing a lockless retrieval of the active request is super - * tricky. SLAB_DESTROY_BY_RCU merely guarantees that the backing + * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing * slab of request objects will not be freed whilst we hold the * RCU read lock. It does not guarantee that the request itself * will not be freed and then *reused*. 
Viz, diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c index 12647af5a336..e7fb47e84a93 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c @@ -1071,7 +1071,7 @@ int ldlm_init(void) ldlm_lock_slab = kmem_cache_create("ldlm_locks", sizeof(struct ldlm_lock), 0, SLAB_HWCACHE_ALIGN | - SLAB_DESTROY_BY_RCU, NULL); + SLAB_TYPESAFE_BY_RCU, NULL); if (!ldlm_lock_slab) { kmem_cache_destroy(ldlm_resource_slab); return -ENOMEM; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a1a359bfcc9c..7f8f962454e5 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2340,7 +2340,7 @@ static int jbd2_journal_init_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, + SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, NULL); /* ctor */ retval = 0; if (!jbd2_journal_head_cache) { diff --git a/fs/signalfd.c b/fs/signalfd.c index 270221fcef42..7e3d71109f51 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -38,7 +38,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) /* * The lockless check can race with remove_wait_queue() in progress, * but in this case its caller should run under rcu_read_lock() and - * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. + * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return. */ if (likely(!waitqueue_active(wqh))) return; diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 6048fa404e57..a5195a7d6f77 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -229,7 +229,7 @@ static inline struct dma_fence *dma_fence_get_rcu(struct dma_fence *fence) * * Function returns NULL if no refcount could be obtained, or the fence. * This function handles acquiring a reference to a fence that may be - * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU), + * reallocated within the RCU grace period (such as with SLAB_TYPESAFE_BY_RCU), * so long as the caller is using RCU on the pointer to the fence. * * An alternative mechanism is to employ a seqlock to protect a bunch of @@ -257,7 +257,7 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep) * have successfully acquire a reference to it. If it no * longer matches, we are holding a reference to some other * reallocated pointer. This is possible if the allocator - * is using a freelist like SLAB_DESTROY_BY_RCU where the + * is using a freelist like SLAB_TYPESAFE_BY_RCU where the * fence remains valid for the RCU grace period, but it * may be reallocated. When using such allocators, we are * responsible for ensuring the reference we get is to diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..96c8e29c6442 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -375,8 +375,6 @@ struct kvm { struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; - struct srcu_struct srcu; - struct srcu_struct irq_srcu; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; /* @@ -429,6 +427,8 @@ struct kvm { struct list_head devices; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; + struct srcu_struct srcu; + struct srcu_struct irq_srcu; }; #define kvm_err(fmt, ...) 
\ diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h new file mode 100644 index 000000000000..4b766b61e1a0 --- /dev/null +++ b/include/linux/rcu_node_tree.h @@ -0,0 +1,99 @@ +/* + * RCU node combining tree definitions. These are used to compute + * global attributes while avoiding common-case global contention. A key + * property that these computations rely on is a tournament-style approach + * where only one of the tasks contending a lower level in the tree need + * advance to the next higher level. If properly configured, this allows + * unlimited scalability while maintaining a constant level of contention + * on the root node. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#ifndef __LINUX_RCU_NODE_TREE_H +#define __LINUX_RCU_NODE_TREE_H + +/* + * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and + * CONFIG_RCU_FANOUT_LEAF. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this did work well going from three levels to four. + * Of course, your mileage may vary. 
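 *
 * Editorial worked example, not part of this patch:  assuming the
 * defaults chosen below, RCU_FANOUT = 64 on 64-bit and
 * RCU_FANOUT_LEAF = 16, a kernel built with NR_CPUS = 256 gets
 * RCU_FANOUT_1 = 16 and RCU_FANOUT_2 = 1024, so the NR_CPUS <=
 * RCU_FANOUT_2 case applies:  RCU_NUM_LVLS = 2 and NUM_RCU_LVL_1 =
 * DIV_ROUND_UP(256, 16) = 16 leaf rcu_node structures under a single
 * root, giving NUM_RCU_NODES = 17.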
+ */ + +#ifdef CONFIG_RCU_FANOUT +#define RCU_FANOUT CONFIG_RCU_FANOUT +#else /* #ifdef CONFIG_RCU_FANOUT */ +# ifdef CONFIG_64BIT +# define RCU_FANOUT 64 +# else +# define RCU_FANOUT 32 +# endif +#endif /* #else #ifdef CONFIG_RCU_FANOUT */ + +#ifdef CONFIG_RCU_FANOUT_LEAF +#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF +#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ +#define RCU_FANOUT_LEAF 16 +#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ + +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT_1 +# define RCU_NUM_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_NODES NUM_RCU_LVL_0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +# define RCU_NODE_NAME_INIT { "rcu_node_0" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +#elif NR_CPUS <= RCU_FANOUT_2 +# define RCU_NUM_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +#elif NR_CPUS <= RCU_FANOUT_3 +# define RCU_NUM_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +#elif NR_CPUS <= RCU_FANOUT_4 +# define RCU_NUM_LVLS 4 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ + +#endif /* __LINUX_RCU_NODE_TREE_H */ diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h new file mode 100644 index 000000000000..ced8f313fd05 --- /dev/null +++ b/include/linux/rcu_segcblist.h @@ -0,0 +1,712 @@ +/* + * RCU segmented callback lists + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#ifndef __KERNEL_RCU_SEGCBLIST_H +#define __KERNEL_RCU_SEGCBLIST_H + +/* Simple unsegmented callback lists. */ +struct rcu_cblist { + struct rcu_head *head; + struct rcu_head **tail; + long len; + long len_lazy; +}; + +#define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } + +/* Initialize simple callback list. */ +static inline void rcu_cblist_init(struct rcu_cblist *rclp) +{ + rclp->head = NULL; + rclp->tail = &rclp->head; + rclp->len = 0; + rclp->len_lazy = 0; +} + +/* Is simple callback list empty? */ +static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) +{ + return !rclp->head; +} + +/* Return number of callbacks in simple callback list. */ +static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) +{ + return rclp->len; +} + +/* Return number of lazy callbacks in simple callback list. */ +static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) +{ + return rclp->len_lazy; +} + +/* + * Debug function to actually count the number of callbacks. + * If the number exceeds the limit specified, return -1. + */ +static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) +{ + int cnt = 0; + struct rcu_head **rhpp = &rclp->head; + + for (;;) { + if (!*rhpp) + return cnt; + if (++cnt > lim) + return -1; + rhpp = &(*rhpp)->next; + } +} + +/* + * Dequeue the oldest rcu_head structure from the specified callback + * list. This function assumes that the callback is non-lazy, but + * the caller can later invoke rcu_cblist_dequeued_lazy() if it + * finds otherwise (and if it cares about laziness). This allows + * different users to have different ways of determining laziness. + */ +static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) +{ + struct rcu_head *rhp; + + rhp = rclp->head; + if (!rhp) + return NULL; + rclp->len--; + rclp->head = rhp->next; + if (!rclp->head) + rclp->tail = &rclp->head; + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) +{ + rclp->len_lazy--; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) +{ + return rclp->head; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) +{ + WARN_ON_ONCE(rcu_cblist_empty(rclp)); + return rclp->tail; +} + +/* Complicated segmented callback lists. ;-) */ + +/* + * Index values for segments in rcu_segcblist structure. + * + * The segments are as follows: + * + * [head, *tails[RCU_DONE_TAIL]): + * Callbacks whose grace period has elapsed, and thus can be invoked. + * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]): + * Callbacks waiting for the current GP from the current CPU's viewpoint. + * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]): + * Callbacks that arrived before the next GP started, again from + * the current CPU's viewpoint. These can be handled by the next GP. + * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]): + * Callbacks that might have arrived after the next GP started. 
+ * There is some uncertainty as to when a given GP starts and + * ends, but a CPU knows the exact times if it is the one starting + * or ending the GP. Other CPUs know that the previous GP ends + * before the next one starts. + * + * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also + * empty. + * + * The ->gp_seq[] array contains the grace-period number at which the + * corresponding segment of callbacks will be ready to invoke. A given + * element of this array is meaningful only when the corresponding segment + * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks + * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have + * not yet been assigned a grace-period number). + */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_CBLIST_NSEGS 4 + +struct rcu_segcblist { + struct rcu_head *head; + struct rcu_head **tails[RCU_CBLIST_NSEGS]; + unsigned long gp_seq[RCU_CBLIST_NSEGS]; + long len; + long len_lazy; +}; + +#define RCU_SEGCBLIST_INITIALIZER(n) \ +{ \ + .head = NULL, \ + .tails[RCU_DONE_TAIL] = &n.head, \ + .tails[RCU_WAIT_TAIL] = &n.head, \ + .tails[RCU_NEXT_READY_TAIL] = &n.head, \ + .tails[RCU_NEXT_TAIL] = &n.head, \ +} + +/* + * Initialize an rcu_segcblist structure. + */ +static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) +{ + int i; + + BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); + BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); + rsclp->head = NULL; + for (i = 0; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = &rsclp->head; + rsclp->len = 0; + rsclp->len_lazy = 0; +} + +/* + * Is the specified rcu_segcblist structure empty? + * + * But careful! The fact that the ->head field is NULL does not + * necessarily imply that there are no callbacks associated with + * this structure. When callbacks are being invoked, they are + * removed as a group. If callback invocation must be preempted, + * the remaining callbacks will be added back to the list. Either + * way, the counts are updated later. + * + * So it is often the case that rcu_segcblist_n_cbs() should be used + * instead. + */ +static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) +{ + return !rsclp->head; +} + +/* Return number of callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) +{ + return READ_ONCE(rsclp->len); +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len_lazy; +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len - rsclp->len_lazy; +} + +/* + * Is the specified rcu_segcblist enabled, for example, not corresponding + * to an offline or callback-offloaded CPU? + */ +static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) +{ + return !!rsclp->tails[RCU_NEXT_TAIL]; +} + +/* + * Disable the specified rcu_segcblist structure, so that callbacks can + * no longer be posted to it. This structure must be empty. 
+ */ +static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); + rsclp->tails[RCU_NEXT_TAIL] = NULL; +} + +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? + */ +static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + +/* + * Are all segments following the specified segment of the specified + * rcu_segcblist structure empty of callbacks? (The specified + * segment might well contain callbacks.) + */ +static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) +{ + return !*rsclp->tails[seg]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are ready to be invoked? + */ +static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are still pending, that is, not yet ready to be invoked? + */ +static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); +} + +/* + * Dequeue and return the first ready-to-invoke callback. If there + * are no ready-to-invoke callbacks, return NULL. Disables interrupts + * to avoid interference. Does not protect from interference from other + * CPUs or tasks. + */ +static inline struct rcu_head * +rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + int i; + struct rcu_head *rhp; + + local_irq_save(flags); + if (!rcu_segcblist_ready_cbs(rsclp)) { + local_irq_restore(flags); + return NULL; + } + rhp = rsclp->head; + BUG_ON(!rhp); + rsclp->head = rhp->next; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { + if (rsclp->tails[i] != &rhp->next) + break; + rsclp->tails[i] = &rsclp->head; + } + smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ + WRITE_ONCE(rsclp->len, rsclp->len - 1); + local_irq_restore(flags); + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + + local_irq_save(flags); + rsclp->len_lazy--; + local_irq_restore(flags); +} + +/* + * Return a pointer to the first callback in the specified rcu_segcblist + * structure. This is useful for diagnostics. + */ +static inline struct rcu_head * +rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return rsclp->head; + return NULL; +} + +/* + * Return a pointer to the first pending callback in the specified + * rcu_segcblist structure. This is useful just after posting a given + * callback -- if that callback is the first pending callback, then + * you cannot rely on someone else having already started up the required + * grace period. 
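 *
 * Editorial usage sketch, not part of this patch, in which
 * start_gp_if_needed() stands in for whatever grace-period machinery
 * the caller uses:
 *
 *	rcu_segcblist_enqueue(rsclp, rhp, false);
 *	if (rcu_segcblist_first_pend_cb(rsclp) == rhp)
 *		start_gp_if_needed();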
+ */ +static inline struct rcu_head * +rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return *rsclp->tails[RCU_DONE_TAIL]; + return NULL; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * have not yet been processed beyond having been posted, that is, + * does it contain callbacks in its last segment? + */ +static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); +} + +/* + * Enqueue the specified callback onto the specified rcu_segcblist + * structure, updating accounting as needed. Note that the ->len + * field may be accessed locklessly, hence the WRITE_ONCE(). + * The ->len field is used by rcu_barrier() and friends to determine + * if it must post a callback on this structure, and it is OK + * for rcu_barrier() to sometimes post callbacks needlessly, but + * absolutely not OK for it to ever miss posting a callback. + */ +static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rhp->next = NULL; + *rsclp->tails[RCU_NEXT_TAIL] = rhp; + rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; +} + +/* + * Entrain the specified callback onto the specified rcu_segcblist at + * the end of the last non-empty segment. If the entire rcu_segcblist + * is empty, make no change, but return false. + * + * This is intended for use by rcu_barrier()-like primitives, -not- + * for normal grace-period use. IMPORTANT: The callback you enqueue + * will wait for all prior callbacks, NOT necessarily for a grace + * period. You have been warned. + */ +static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + int i; + + if (rcu_segcblist_n_cbs(rsclp) == 0) + return false; + WRITE_ONCE(rsclp->len, rsclp->len + 1); + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is entrained. */ + rhp->next = NULL; + for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1]) + break; + *rsclp->tails[i] = rhp; + for (; i <= RCU_NEXT_TAIL; i++) + rsclp->tails[i] = &rhp->next; + return true; +} + +/* + * Extract only the counts from the specified rcu_segcblist structure, + * and place them in the specified rcu_cblist structure. This function + * supports both callback orphaning and invocation, hence the separation + * of counts and callbacks. (Callbacks ready for invocation must be + * orphaned and adopted separately from pending callbacks, but counts + * apply to all callbacks. Locking must be used to make sure that + * both orphaned-callbacks lists are consistent.) + */ +static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rclp->len_lazy += rsclp->len_lazy; + rclp->len += rsclp->len; + rsclp->len_lazy = 0; + WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ +} + +/* + * Extract only those callbacks ready to be invoked from the specified + * rcu_segcblist structure and place them in the specified rcu_cblist + * structure. + */ +static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_ready_cbs(rsclp)) + return; /* Nothing to do. 
*/ + *rclp->tail = rsclp->head; + rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + rclp->tail = rsclp->tails[RCU_DONE_TAIL]; + for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) + if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) + rsclp->tails[i] = &rsclp->head; +} + +/* + * Extract only those callbacks still pending (not yet ready to be + * invoked) from the specified rcu_segcblist structure and place them in + * the specified rcu_cblist structure. Note that this loses information + * about any callbacks that might have been partway done waiting for + * their grace period. Too bad! They will have to start over. + */ +static inline void +rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_pend_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; + rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Move the entire contents of the specified rcu_segcblist structure, + * counts, callbacks, and all, to the specified rcu_cblist structure. + * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? + * @@@ Memory barrier needed? (Not if only used at boot time...) + */ +static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rcu_segcblist_extract_done_cbs(rsclp, rclp); + rcu_segcblist_extract_pend_cbs(rsclp, rclp); + rcu_segcblist_extract_count(rsclp, rclp); +} + +/* + * Insert counts from the specified rcu_cblist structure in the + * specified rcu_segcblist structure. + */ +static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rsclp->len_lazy += rclp->len_lazy; + /* ->len sampled locklessly. */ + WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rclp->len_lazy = 0; + rclp->len = 0; +} + +/* + * Move callbacks from the specified rcu_cblist to the beginning of the + * done-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rclp->head) + return; /* No callbacks to move. */ + *rclp->tail = rsclp->head; + rsclp->head = rclp->head; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + if (&rsclp->head == rsclp->tails[i]) + rsclp->tails[i] = rclp->tail; + else + break; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Move callbacks from the specified rcu_cblist to the end of the + * new-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + if (!rclp->head) + return; /* Nothing to do. */ + *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; + rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Advance the callbacks in the specified rcu_segcblist structure based + * on the current value passed in for the grace-period counter. + */ +static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i, j; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return; + + /* + * Find all callbacks whose ->gp_seq numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL segment. 
+ */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + break; + rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + } + + /* If no callbacks moved, nothing more need be done. */ + if (i == RCU_WAIT_TAIL) + return; + + /* Clean up tail pointers that might have been misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + + /* + * Callbacks moved, so clean up the misordered ->tails[] pointers + * that now point into the middle of the list of ready-to-invoke + * callbacks. The overall effect is to copy down the later pointers + * into the gap that was created by the now-ready segments. + */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) + break; /* No more callbacks. */ + rsclp->tails[j] = rsclp->tails[i]; + rsclp->gp_seq[j] = rsclp->gp_seq[i]; + } +} + +/* + * "Accelerate" callbacks based on more-accurate grace-period information. + * The reason for this is that RCU does not synchronize the beginnings and + * ends of grace periods, and that callbacks are posted locally. This in + * turn means that the callbacks must be labelled conservatively early + * on, as getting exact information would degrade both performance and + * scalability. When more accurate grace-period information becomes + * available, previously posted callbacks can be "accelerated", marking + * them to complete at the end of the earlier grace period. + * + * This function operates on an rcu_segcblist structure, and also the + * grace-period sequence number seq at which new callbacks would become + * ready to invoke. Returns true if there are callbacks that won't be + * ready to invoke until seq, false otherwise. + */ +static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return false; + + /* + * Find the segment preceding the oldest segment of callbacks + * whose ->gp_seq[] completion is at or after that passed in via + * "seq", skipping any empty segments. This oldest segment, along + * with any later segments, can be merged in with any newly arrived + * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" + * as their ->gp_seq[] grace-period completion sequence number. + */ + for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1] && + ULONG_CMP_LT(rsclp->gp_seq[i], seq)) + break; + + /* + * If all the segments contain callbacks that correspond to + * earlier grace-period sequence numbers than "seq", leave. + * Assuming that the rcu_segcblist structure has enough + * segments in its arrays, this can only happen if some of + * the non-done segments contain callbacks that really are + * ready to invoke. This situation will get straightened + * out by the next call to rcu_segcblist_advance(). + * + * Also advance to the oldest segment of callbacks whose + * ->gp_seq[] completion is at or after that passed in via "seq", + * skipping any empty segments. + */ + if (++i >= RCU_NEXT_TAIL) + return false; + + /* + * Merge all later callbacks, including newly arrived callbacks, + * into the segment located by the for-loop above. Assign "seq" + * as the ->gp_seq[] value in order to correctly handle the case + * where there were no pending callbacks in the rcu_segcblist + * structure other than in the RCU_NEXT_TAIL segment. 
+ */ + for (; i < RCU_NEXT_TAIL; i++) { + rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + rsclp->gp_seq[i] = seq; + } + return true; +} + +/* + * Scan the specified rcu_segcblist structure for callbacks that need + * a grace period later than the one specified by "seq". We don't look + * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't + * have a grace-period sequence number. + */ +static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rsclp->tails[i - 1] != rsclp->tails[i] && + ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + return true; + return false; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) +{ + return rsclp->head; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); + return rsclp->tails[RCU_NEXT_TAIL]; +} + +#endif /* __KERNEL_RCU_SEGCBLIST_H */ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4f7a9561b8c4..b1fd8bf85fdc 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -509,7 +509,8 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, { struct hlist_node *i, *last = NULL; - for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) + /* Note: write side code, so rcu accessors are not needed. */ + for (i = h->first; i; i = i->next) last = i; if (last) { diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de88b33c0974..f531b29207da 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -363,15 +363,20 @@ static inline void rcu_init_nohz(void) #ifdef CONFIG_TASKS_RCU #define TASKS_RCU(x) x extern struct srcu_struct tasks_rcu_exit_srcu; -#define rcu_note_voluntary_context_switch(t) \ +#define rcu_note_voluntary_context_switch_lite(t) \ do { \ - rcu_all_qs(); \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) +#define rcu_note_voluntary_context_switch(t) \ + do { \ + rcu_all_qs(); \ + rcu_note_voluntary_context_switch_lite(t); \ + } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) rcu_all_qs() +#define rcu_note_voluntary_context_switch_lite(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** @@ -1127,11 +1132,11 @@ do { \ * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ -#ifdef CONFIG_PPC +#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. 
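   Editorial sketch, not part of this patch:  the intended usage, where
   old and new are arbitrary locks, is

	spin_unlock(&old->lock);
	spin_lock(&new->lock);
	smp_mb__after_unlock_lock();

   which makes the UNLOCK+LOCK sequence act as a full memory barrier on
   architectures, such as powerpc, that select ARCH_WEAK_RELEASE_ACQUIRE;
   elsewhere it compiles to nothing.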
*/ -#else /* #ifdef CONFIG_PPC */ +#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ +#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b452953e21c8..74d9c3a1feee 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) return 0; } +static inline bool rcu_eqs_special_set(int cpu) +{ + return false; /* Never flag non-existent other CPUs! */ +} + static inline unsigned long get_state_synchronize_rcu(void) { return 0; @@ -87,10 +92,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, call_rcu(head, func); } -static inline void rcu_note_context_switch(void) -{ - rcu_sched_qs(); -} +#define rcu_note_context_switch(preempt) \ + do { \ + rcu_sched_qs(); \ + rcu_note_voluntary_context_switch_lite(current); \ + } while (0) /* * Take advantage of the fact that there is only one CPU, which @@ -212,14 +218,14 @@ static inline void exit_rcu(void) { } -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) extern int rcu_scheduler_active __read_mostly; void rcu_scheduler_starting(void); -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ static inline void rcu_scheduler_starting(void) { } -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) @@ -237,6 +243,10 @@ static inline bool rcu_is_watching(void) #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_request_urgent_qs_task(struct task_struct *t) +{ +} + static inline void rcu_all_qs(void) { barrier(); /* Avoid RCU read-side critical sections leaking across. */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 63a4e4cf40a5..0bacb6b2af69 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,7 +30,7 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -void rcu_note_context_switch(void); +void rcu_note_context_switch(bool preempt); int rcu_needs_cpu(u64 basem, u64 *nextevt); void rcu_cpu_stall_reset(void); @@ -41,7 +41,7 @@ void rcu_cpu_stall_reset(void); */ static inline void rcu_virt_note_context_switch(int cpu) { - rcu_note_context_switch(); + rcu_note_context_switch(false); } void synchronize_rcu_bh(void); @@ -108,6 +108,7 @@ void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_request_urgent_qs_task(struct task_struct *t); void rcu_all_qs(void); diff --git a/include/linux/slab.h b/include/linux/slab.h index 3c37a8c51921..04a7f7993e67 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -28,7 +28,7 @@ #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ /* - * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS! + * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * * This delays freeing the SLAB page by a grace period, it does _NOT_ * delay object freeing. 
This means that if you do kmem_cache_free() @@ -61,8 +61,10 @@ * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. + * + * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -#define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ +#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a598cf3ac70c..167ad8831aaf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -22,7 +22,7 @@ * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt + * Documentation/RCU/ *.txt * */ @@ -32,35 +32,9 @@ #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> +#include <linux/rcu_segcblist.h> -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; -}; - -struct rcu_batch { - struct rcu_head *head, **tail; -}; - -#define RCU_BATCH_INIT(name) { NULL, &(name.head) } - -struct srcu_struct { - unsigned long completed; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->batch_queue, ->running */ - bool running; - /* callbacks just queued */ - struct rcu_batch batch_queue; - /* callbacks try to do the first check_zero */ - struct rcu_batch batch_check0; - /* callbacks done with the first check_zero and the flip */ - struct rcu_batch batch_check1; - struct rcu_batch batch_done; - struct delayed_work work; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -}; +struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -82,46 +56,15 @@ int init_srcu_struct(struct srcu_struct *sp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -void process_srcu(struct work_struct *work); - -#define __SRCU_STRUCT_INIT(name) \ - { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .running = false, \ - .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ - .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ - .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ - .batch_done = RCU_BATCH_INIT(name.batch_done), \ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ - __SRCU_DEP_MAP_INIT(name) \ - } - -/* - * Define and initialize a srcu struct at build time. - * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. - * - * Note that although DEFINE_STATIC_SRCU() hides the name from other - * files, the per-CPU variable rules nevertheless require that the - * chosen name be globally unique. These rules also prohibit use of - * DEFINE_STATIC_SRCU() within a function. If these rules are too - * restrictive, declare the srcu_struct manually. For example, in - * each file: - * - * static struct srcu_struct my_srcu; - * - * Then, before the first use of each my_srcu, manually initialize it: - * - * init_srcu_struct(&my_srcu); - * - * See include/linux/percpu-defs.h for the rules on per-CPU variables. 
- */ -#define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) -#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) -#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +#ifdef CONFIG_TINY_SRCU +#include <linux/srcutiny.h> +#elif defined(CONFIG_TREE_SRCU) +#include <linux/srcutree.h> +#elif defined(CONFIG_CLASSIC_SRCU) +#include <linux/srcuclassic.h> +#else +#error "Unknown SRCU implementation specified to kernel configuration" +#endif /** * call_srcu() - Queue a callback for invocation after an SRCU grace period @@ -147,9 +90,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp); int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); -void synchronize_srcu_expedited(struct srcu_struct *sp); -unsigned long srcu_batches_completed(struct srcu_struct *sp); -void srcu_barrier(struct srcu_struct *sp); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h new file mode 100644 index 000000000000..41cf99930f34 --- /dev/null +++ b/include/linux/srcuclassic.h @@ -0,0 +1,101 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * classic v4.11 variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney <paulmck@us.ibm.com> + */ + +#ifndef _LINUX_SRCU_CLASSIC_H +#define _LINUX_SRCU_CLASSIC_H + +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; +}; + +struct rcu_batch { + struct rcu_head *head, **tail; +}; + +#define RCU_BATCH_INIT(name) { NULL, &(name.head) } + +struct srcu_struct { + unsigned long completed; + struct srcu_array __percpu *per_cpu_ref; + spinlock_t queue_lock; /* protect ->batch_queue, ->running */ + bool running; + /* callbacks just queued */ + struct rcu_batch batch_queue; + /* callbacks try to do the first check_zero */ + struct rcu_batch batch_check0; + /* callbacks done with the first check_zero and the flip */ + struct rcu_batch batch_check1; + struct rcu_batch batch_done; + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .completed = -300, \ + .per_cpu_ref = &name##_srcu_array, \ + .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ + .running = false, \ + .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ + .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ + .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ + .batch_done = RCU_BATCH_INIT(name.batch_done), \ + .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. + */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +#endif diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h new file mode 100644 index 000000000000..4f284e4f4d8c --- /dev/null +++ b/include/linux/srcutiny.h @@ -0,0 +1,81 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney <paulmck@us.ibm.com> + */ + +#ifndef _LINUX_SRCU_TINY_H +#define _LINUX_SRCU_TINY_H + +#include <linux/swait.h> + +struct srcu_struct { + int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ + struct swait_queue_head srcu_wq; + /* Last srcu_read_unlock() wakes GP. */ + unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ + struct rcu_segcblist srcu_cblist; + /* Pending SRCU callbacks. */ + int srcu_idx; /* Current reader array element. */ + bool srcu_gp_running; /* GP workqueue running? */ + bool srcu_gp_waiting; /* GP waiting for readers? */ + struct work_struct srcu_work; /* For driving grace periods. */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void srcu_drive_gp(struct work_struct *wp); + +#define __SRCU_STRUCT_INIT(name) \ +{ \ + .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \ + .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ + __SRCU_DEP_MAP_INIT(name) \ +} + +/* + * This odd _STATIC_ arrangement is needed for API compatibility with + * Tree SRCU, which needs some per-CPU data. + */ +#define DEFINE_SRCU(name) \ + struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_STATIC_SRCU(name) \ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name) + +void synchronize_srcu(struct srcu_struct *sp); + +static inline void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline void srcu_barrier(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return 0; +} + +#endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h new file mode 100644 index 000000000000..0400e211aa44 --- /dev/null +++ b/include/linux/srcutree.h @@ -0,0 +1,139 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tree variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney <paulmck@us.ibm.com> + */ + +#ifndef _LINUX_SRCU_TREE_H +#define _LINUX_SRCU_TREE_H + +#include <linux/rcu_node_tree.h> +#include <linux/completion.h> + +struct srcu_node; +struct srcu_struct; + +/* + * Per-CPU structure feeding into leaf srcu_node, similar in function + * to rcu_node. + */ +struct srcu_data { + /* Read-side state. */ + unsigned long srcu_lock_count[2]; /* Locks per CPU. */ + unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ + + /* Update-side state. 
*/ + spinlock_t lock ____cacheline_internodealigned_in_smp; + struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ + unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ + bool srcu_cblist_invoking; /* Invoking these CBs? */ + struct delayed_work work; /* Context for CB invoking. */ + struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ + struct srcu_node *mynode; /* Leaf srcu_node. */ + int cpu; + struct srcu_struct *sp; +}; + +/* + * Node in SRCU combining tree, similar in function to rcu_data. + */ +struct srcu_node { + spinlock_t lock; + unsigned long srcu_have_cbs[4]; /* GP seq for children */ + /* having CBs, but only */ + /* is > ->srcu_gq_seq. */ + struct srcu_node *srcu_parent; /* Next up in tree. */ + int grplo; /* Least CPU for node. */ + int grphi; /* Biggest CPU for node. */ +}; + +/* + * Per-SRCU-domain structure, similar in function to rcu_state. + */ +struct srcu_struct { + struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ + struct srcu_node *level[RCU_NUM_LVLS + 1]; + /* First node at each level. */ + struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ + spinlock_t gp_lock; /* protect ->srcu_cblist */ + struct mutex srcu_gp_mutex; /* Serialize GP work. */ + unsigned int srcu_idx; /* Current rdr array element. */ + unsigned long srcu_gp_seq; /* Grace-period seq #. */ + unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ + atomic_t srcu_exp_cnt; /* # ongoing expedited GPs. */ + struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ + struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ + struct completion srcu_barrier_completion; + /* Awaken barrier rq at end. */ + atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ + /* callback for the barrier */ + /* operation. */ + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +/* Values for state variable (bottom bits of ->srcu_gp_seq). */ +#define SRCU_STATE_IDLE 0 +#define SRCU_STATE_SCAN1 1 +#define SRCU_STATE_SCAN2 2 + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .sda = &name##_srcu_data, \ + .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \ + .srcu_gp_seq_needed = 0 - 1, \ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. 
+ */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +#endif diff --git a/include/linux/types.h b/include/linux/types.h index 1e7bd24848fc..258099a4ed82 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -209,7 +209,7 @@ struct ustat { * naturally due ABI requirements, but some architectures (like CRIS) have * weird ABI and we need to ask it explicitly. * - * The alignment is required to guarantee that bits 0 and 1 of @next will be + * The alignment is required to guarantee that bit 0 of @next will be * clear under normal conditions -- as long as we use call_rcu(), * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. * diff --git a/include/net/sock.h b/include/net/sock.h index 5e5997654db6..59cdccaa30e7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -993,7 +993,7 @@ struct smc_hashinfo; struct module; /* - * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes + * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) diff --git a/init/Kconfig b/init/Kconfig index a92f27da4a27..4119a44e4157 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -526,6 +526,35 @@ config SRCU permits arbitrary sleeping or blocking within RCU read-side critical sections. +config CLASSIC_SRCU + bool "Use v4.11 classic SRCU implementation" + default n + depends on RCU_EXPERT && SRCU + help + This option selects the traditional well-tested classic SRCU + implementation from v4.11, as might be desired for enterprise + Linux distributions. Without this option, the shiny new + Tiny SRCU and Tree SRCU implementations are used instead. + At some point, it is hoped that Tiny SRCU and Tree SRCU + will accumulate enough test time and confidence to allow + Classic SRCU to be dropped entirely. + + Say Y if you need a rock-solid SRCU. + + Say N if you would like to help test Tree SRCU. + +config TINY_SRCU + bool + default y if TINY_RCU && !CLASSIC_SRCU + help + This option selects the single-CPU non-preemptible version of SRCU. + +config TREE_SRCU + bool + default y if !TINY_RCU && !CLASSIC_SRCU + help + This option selects the full-fledged version of SRCU. + config TASKS_RCU bool default n @@ -612,11 +641,17 @@ config RCU_FANOUT_LEAF initialization. These systems tend to run CPU-bound, and thus are not helped by synchronized interrupts, and thus tend to skew them, which reduces lock contention enough that large - leaf-level fanouts work well. + leaf-level fanouts work well. That said, setting leaf-level + fanout to a large number will likely cause problematic + lock contention on the leaf-level rcu_node structures unless + you boot with the skew_tick kernel parameter. Select a specific number if testing RCU itself. - Select the maximum permissible value for large systems. + Select the maximum permissible value for large systems, but + please understand that you may also need to set the skew_tick + kernel boot parameter to avoid contention on the rcu_node + structure's locks. Take the default if unsure.
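The Kconfig options above select only which SRCU backend is built; callers need not change, because Classic, Tiny, and Tree SRCU all provide the same API (DEFINE_STATIC_SRCU(), srcu_read_lock(), srcu_read_unlock(), synchronize_srcu(), call_srcu(), srcu_barrier()), as declared in the headers earlier in this patch. The following minimal usage sketch is illustrative only and is not part of the patch; the my_srcu, my_data, my_reader, and my_update names are made up.

#include <linux/slab.h>
#include <linux/srcu.h>

/* Hypothetical SRCU domain and SRCU-protected pointer (illustrative names). */
DEFINE_STATIC_SRCU(my_srcu);
static int __rcu *my_data;

/* Reader: may block inside the SRCU read-side critical section. */
static int my_reader(void)
{
	int idx, val = 0;
	int *p;

	idx = srcu_read_lock(&my_srcu);
	p = srcu_dereference(my_data, &my_srcu);
	if (p)
		val = *p;
	srcu_read_unlock(&my_srcu, idx);
	return val;
}

/* Updater: publish a new value, wait for pre-existing readers, then free.
 * Assumes the caller serializes concurrent updates. */
static void my_update(int *newp)
{
	int *oldp;

	oldp = rcu_dereference_protected(my_data, 1);
	rcu_assign_pointer(my_data, newp);
	synchronize_srcu(&my_srcu);	/* Provided by Classic, Tiny, and Tree SRCU alike. */
	kfree(oldp);
}

Under CONFIG_TINY_SRCU the synchronize_srcu() call above is driven by the workqueue-based grace period in srcutiny.c below; under CONFIG_TREE_SRCU it funnels through the srcu_node combining tree in srcutree.c.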
diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..9330ce24f1bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1313,7 +1313,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* - * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). */ kmem_cache_free(sighand_cachep, sighand); @@ -2144,7 +2144,7 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a95e5d1f4a9c..e9d4f85b290c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1144,10 +1144,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, return 0; printk("\n"); - printk("======================================================\n"); - printk("[ INFO: possible circular locking dependency detected ]\n"); + pr_warn("======================================================\n"); + pr_warn("WARNING: possible circular locking dependency detected\n"); print_kernel_ident(); - printk("-------------------------------------------------------\n"); + pr_warn("------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); @@ -1482,11 +1482,11 @@ print_bad_irq_dependency(struct task_struct *curr, return 0; printk("\n"); - printk("======================================================\n"); - printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + pr_warn("=====================================================\n"); + pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", irqclass, irqclass); print_kernel_ident(); - printk("------------------------------------------------------\n"); + pr_warn("-----------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, @@ -1711,10 +1711,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, return 0; printk("\n"); - printk("=============================================\n"); - printk("[ INFO: possible recursive locking detected ]\n"); + pr_warn("============================================\n"); + pr_warn("WARNING: possible recursive locking detected\n"); print_kernel_ident(); - printk("---------------------------------------------\n"); + pr_warn("--------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); @@ -2061,10 +2061,10 @@ static void print_collision(struct task_struct *curr, struct lock_chain *chain) { printk("\n"); - printk("======================\n"); - printk("[chain_key collision ]\n"); + pr_warn("============================\n"); + pr_warn("WARNING: chain_key collision\n"); print_kernel_ident(); - printk("----------------------\n"); + pr_warn("----------------------------\n"); printk("%s/%d: ", current->comm, task_pid_nr(current)); printk("Hash chain already cached but the contents don't match!\n"); @@ -2360,10 
+2360,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, return 0; printk("\n"); - printk("=================================\n"); - printk("[ INFO: inconsistent lock state ]\n"); + pr_warn("================================\n"); + pr_warn("WARNING: inconsistent lock state\n"); print_kernel_ident(); - printk("---------------------------------\n"); + pr_warn("--------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); @@ -2425,10 +2425,10 @@ print_irq_inversion_bug(struct task_struct *curr, return 0; printk("\n"); - printk("=========================================================\n"); - printk("[ INFO: possible irq lock inversion dependency detected ]\n"); + pr_warn("========================================================\n"); + pr_warn("WARNING: possible irq lock inversion dependency detected\n"); print_kernel_ident(); - printk("---------------------------------------------------------\n"); + pr_warn("--------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); @@ -3170,10 +3170,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; printk("\n"); - printk("==================================\n"); - printk("[ BUG: Nested lock was not taken ]\n"); + pr_warn("==================================\n"); + pr_warn("WARNING: Nested lock was not taken\n"); print_kernel_ident(); - printk("----------------------------------\n"); + pr_warn("----------------------------------\n"); printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); print_lock(hlock); @@ -3383,10 +3383,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; printk("\n"); - printk("=====================================\n"); - printk("[ BUG: bad unlock balance detected! ]\n"); + pr_warn("=====================================\n"); + pr_warn("WARNING: bad unlock balance detected!\n"); print_kernel_ident(); - printk("-------------------------------------\n"); + pr_warn("-------------------------------------\n"); printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3880,10 +3880,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; printk("\n"); - printk("=================================\n"); - printk("[ BUG: bad contention detected! ]\n"); + pr_warn("=================================\n"); + pr_warn("WARNING: bad contention detected!\n"); print_kernel_ident(); - printk("---------------------------------\n"); + pr_warn("---------------------------------\n"); printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -4244,10 +4244,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, return; printk("\n"); - printk("=========================\n"); - printk("[ BUG: held lock freed! ]\n"); + pr_warn("=========================\n"); + pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); - printk("-------------------------\n"); + pr_warn("-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); @@ -4302,11 +4302,11 @@ static void print_held_locks_bug(void) return; printk("\n"); - printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! 
]\n", + pr_warn("====================================\n"); + pr_warn("WARNING: %s/%d still has locks held!\n", current->comm, task_pid_nr(current)); print_kernel_ident(); - printk("-------------------------------------\n"); + pr_warn("------------------------------------\n"); lockdep_print_held_locks(current); printk("\nstack backtrace:\n"); dump_stack(); @@ -4371,7 +4371,7 @@ retry: } while_each_thread(g, p); printk("\n"); - printk("=============================================\n\n"); + pr_warn("=============================================\n\n"); if (unlock) read_unlock(&tasklist_lock); @@ -4401,10 +4401,10 @@ asmlinkage __visible void lockdep_sys_exit(void) if (!debug_locks_off()) return; printk("\n"); - printk("================================================\n"); - printk("[ BUG: lock held when returning to user space! ]\n"); + pr_warn("================================================\n"); + pr_warn("WARNING: lock held when returning to user space!\n"); print_kernel_ident(); - printk("------------------------------------------------\n"); + pr_warn("------------------------------------------------\n"); printk("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); @@ -4421,13 +4421,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ printk("\n"); - pr_err("===============================\n"); - pr_err("[ ERR: suspicious RCU usage. ]\n"); + pr_warn("=============================\n"); + pr_warn("WARNING: suspicious RCU usage\n"); print_kernel_ident(); - pr_err("-------------------------------\n"); - pr_err("%s:%d %s!\n", file, line, s); - pr_err("\nother info that might help us debug this:\n\n"); - pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_warn("-----------------------------\n"); + printk("%s:%d %s!\n", file, line, s); + printk("\nother info that might help us debug this:\n\n"); + printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" : !rcu_is_watching() diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 97ee9df32e0f..db4f55211b04 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) return; } - printk("\n============================================\n"); - printk( "[ BUG: circular locking deadlock detected! 
]\n"); - printk("%s\n", print_tainted()); - printk( "--------------------------------------------\n"); + pr_warn("\n"); + pr_warn("============================================\n"); + pr_warn("WARNING: circular locking deadlock detected!\n"); + pr_warn("%s\n", print_tainted()); + pr_warn("--------------------------------------------\n"); printk("%s/%d is deadlocking current task %s/%d\n\n", task->comm, task_pid_nr(task), current->comm, task_pid_nr(current)); diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 18dfc485225c..158e6593d58c 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,7 +3,9 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_SRCU) += srcu.o +obj-$(CONFIG_CLASSIC_SRCU) += srcu.o +obj-$(CONFIG_TREE_SRCU) += srcutree.o +obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0d6ff3e471be..73e16ec4054b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -56,6 +56,83 @@ #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ DYNTICK_TASK_FLAG) + +/* + * Grace-period counter management. + */ + +#define RCU_SEQ_CTR_SHIFT 2 +#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) + +/* + * Return the counter portion of a sequence number previously returned + * by rcu_seq_snap() or rcu_seq_current(). + */ +static inline unsigned long rcu_seq_ctr(unsigned long s) +{ + return s >> RCU_SEQ_CTR_SHIFT; +} + +/* + * Return the state portion of a sequence number previously returned + * by rcu_seq_snap() or rcu_seq_current(). + */ +static inline int rcu_seq_state(unsigned long s) +{ + return s & RCU_SEQ_STATE_MASK; +} + +/* + * Set the state portion of the pointed-to sequence number. + * The caller is responsible for preventing conflicting updates. + */ +static inline void rcu_seq_set_state(unsigned long *sp, int newstate) +{ + WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK); + WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate); +} + +/* Adjust sequence number for start of update-side operation. */ +static inline void rcu_seq_start(unsigned long *sp) +{ + WRITE_ONCE(*sp, *sp + 1); + smp_mb(); /* Ensure update-side operation after counter increment. */ + WARN_ON_ONCE(rcu_seq_state(*sp) != 1); +} + +/* Adjust sequence number for end of update-side operation. */ +static inline void rcu_seq_end(unsigned long *sp) +{ + smp_mb(); /* Ensure update-side operation before counter increment. */ + WARN_ON_ONCE(!rcu_seq_state(*sp)); + WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); +} + +/* Take a snapshot of the update side's sequence number. */ +static inline unsigned long rcu_seq_snap(unsigned long *sp) +{ + unsigned long s; + + s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK; + smp_mb(); /* Above access must not bleed into critical section. */ + return s; +} + +/* Return the current value the update side's sequence number, no ordering. */ +static inline unsigned long rcu_seq_current(unsigned long *sp) +{ + return READ_ONCE(*sp); +} + +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred. 
+ */ +static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_GE(READ_ONCE(*sp), s); +} + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the @@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); + RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head)); + RCU_TRACE(trace_rcu_invoke_callback(rn, head);) head->func(head); rcu_lock_release(&rcu_callback_map); return false; @@ -144,4 +221,76 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); +#if defined(SRCU) || !defined(TINY_RCU) + +#include <linux/rcu_node_tree.h> + +extern int rcu_num_lvls; +extern int num_rcu_lvl[]; +extern int rcu_num_nodes; +static bool rcu_fanout_exact; +static int rcu_fanout_leaf; + +/* + * Compute the per-level fanout, either using the exact fanout specified + * or balancing the tree, depending on the rcu_fanout_exact boot parameter. + */ +static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) +{ + int i; + + if (rcu_fanout_exact) { + levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) + levelspread[i] = RCU_FANOUT; + } else { + int ccur; + int cprv; + + cprv = nr_cpu_ids; + for (i = rcu_num_lvls - 1; i >= 0; i--) { + ccur = levelcnt[i]; + levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } + } +} + +/* + * Do a full breadth-first scan of the rcu_node structures for the + * specified rcu_state structure. + */ +#define rcu_for_each_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) + +/* + * Do a breadth-first scan of the non-leaf rcu_node structures for the + * specified rcu_state structure. Note that if there is a singleton + * rcu_node tree with but one rcu_node structure, this loop is a no-op. + */ +#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) + +/* + * Scan the leaves of the rcu_node hierarchy for the specified rcu_state + * structure. Note that if there is a singleton rcu_node tree with but + * one rcu_node structure, this loop -will- visit the rcu_node structure. + * It is still a leaf node, even if it is also the root node. + */ +#define rcu_for_each_leaf_node(rsp, rnp) \ + for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) + +/* + * Iterate over all possible CPUs in a leaf RCU node. 
+ */ +#define for_each_leaf_node_possible_cpu(rnp, cpu) \ + for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ + cpu <= rnp->grphi; \ + cpu = cpumask_next((cpu), cpu_possible_mask)) + +#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cccc417a8135..e9d4527cdd43 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -559,19 +559,34 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { - int cpu; - int idx = srcu_ctlp->completed & 0x1; + int __maybe_unused cpu; + int idx; - pr_alert("%s%s per-CPU(idx=%d):", +#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) +#ifdef CONFIG_TREE_SRCU + idx = srcu_ctlp->srcu_idx & 0x1; +#else /* #ifdef CONFIG_TREE_SRCU */ + idx = srcu_ctlp->completed & 0x1; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ + pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; - struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); +#ifdef CONFIG_TREE_SRCU + struct srcu_data *counts; + counts = per_cpu_ptr(srcu_ctlp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; +#else /* #ifdef CONFIG_TREE_SRCU */ + struct srcu_array *counts; + + counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); u0 = counts->unlock_count[!idx]; u1 = counts->unlock_count[idx]; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ /* * Make sure that a lock is always counted if the corresponding @@ -579,14 +594,26 @@ static void srcu_torture_stats(void) */ smp_rmb(); +#ifdef CONFIG_TREE_SRCU + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; +#else /* #ifdef CONFIG_TREE_SRCU */ l0 = counts->lock_count[!idx]; l1 = counts->lock_count[idx]; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ c0 = l0 - u0; c1 = l1 - u1; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); +#elif defined(CONFIG_TINY_SRCU) + idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", + torture_type, TORTURE_FLAG, idx, + READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), + READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); +#endif } static void srcu_torture_synchronize_expedited(void) diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index ef3bcfb15b39..584d8a983883 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -22,7 +22,7 @@ * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt + * Documentation/RCU/ *.txt * */ @@ -243,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp) * cleanup_srcu_struct - deconstruct a sleep-RCU structure * @sp: structure to clean up. * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. + * Must invoke this only after you are finished using a given srcu_struct + * that was initialized via init_srcu_struct(). This code does some + * probabilistic checking, spotting late uses of srcu_read_lock(), + * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). + * If any such late uses are detected, the per-CPU memory associated with + * the srcu_struct is simply leaked and WARN_ON() is invoked. If the + * caller frees the srcu_struct itself, a use-after-free crash will likely + * ensue, but at least there will be a warning printed.
*/ void cleanup_srcu_struct(struct srcu_struct *sp) { diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c new file mode 100644 index 000000000000..b8293527ee18 --- /dev/null +++ b/kernel/rcu/srcutiny.c @@ -0,0 +1,215 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny version for non-preemptible single-CPU use. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney <paulmck@us.ibm.com> + */ + +#include <linux/export.h> +#include <linux/mutex.h> +#include <linux/preempt.h> +#include <linux/rcupdate_wait.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/srcu.h> + +#include <linux/rcu_node_tree.h> +#include "rcu.h" + +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->srcu_lock_nesting[0] = 0; + sp->srcu_lock_nesting[1] = 0; + init_swait_queue_head(&sp->srcu_wq); + sp->srcu_gp_seq = 0; + rcu_segcblist_init(&sp->srcu_cblist); + sp->srcu_gp_running = false; + sp->srcu_gp_waiting = false; + sp->srcu_idx = 0; + INIT_WORK(&sp->srcu_work, srcu_drive_gp); + return 0; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); + flush_work(&sp->srcu_work); + WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); + WARN_ON(sp->srcu_gp_running); + WARN_ON(sp->srcu_gp_waiting); + WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). 
+ */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx); + WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate element of + * the srcu_struct. Must be called from process context. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + int newval = sp->srcu_lock_nesting[idx] - 1; + + WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); + if (!newval && READ_ONCE(sp->srcu_gp_waiting)) + swake_up(&sp->srcu_wq); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * Workqueue handler to drive one grace period and invoke any callbacks + * that become ready as a result. Single-CPU and !PREEMPT operation + * means that we get away with murder on synchronization. ;-) + */ +void srcu_drive_gp(struct work_struct *wp) +{ + int idx; + struct rcu_cblist ready_cbs; + struct srcu_struct *sp; + struct rcu_head *rhp; + + sp = container_of(wp, struct srcu_struct, srcu_work); + if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) + return; /* Already running or nothing to do. */ + + /* Tag recently arrived callbacks and wait for readers. */ + WRITE_ONCE(sp->srcu_gp_running, true); + rcu_segcblist_accelerate(&sp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_start(&sp->srcu_gp_seq); + idx = sp->srcu_idx; + WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); + WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); + WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + rcu_seq_end(&sp->srcu_gp_seq); + + /* Update callback list based on GP, and invoke ready callbacks. */ + rcu_segcblist_advance(&sp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { + rcu_cblist_init(&ready_cbs); + local_irq_disable(); + rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); + local_irq_enable(); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + local_irq_disable(); + rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); + local_irq_enable(); + } + WRITE_ONCE(sp->srcu_gp_running, false); + + /* + * If more callbacks, reschedule ourselves. This can race with + * a call_srcu() at interrupt level, but the ->srcu_gp_running + * checks will straighten that out. + */ + if (!rcu_segcblist_empty(&sp->srcu_cblist)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(srcu_drive_gp); + +/* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. 
+ */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *head, + rcu_callback_t func) +{ + unsigned long flags; + + head->func = func; + local_irq_save(flags); + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + local_irq_restore(flags); + if (!READ_ONCE(sp->srcu_gp_running)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(call_srcu); + +/* + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + struct rcu_synchronize rs; + + init_rcu_head_on_stack(&rs.head); + init_completion(&rs.completion); + call_srcu(sp, &rs.head, wakeme_after_rcu); + wait_for_completion(&rs.completion); + destroy_rcu_head_on_stack(&rs.head); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c new file mode 100644 index 000000000000..9ecf0acc18eb --- /dev/null +++ b/kernel/rcu/srcutree.c @@ -0,0 +1,996 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2006 + * Copyright (C) Fujitsu, 2012 + * + * Author: Paul McKenney <paulmck@us.ibm.com> + * Lai Jiangshan <laijs@cn.fujitsu.com> + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ + +#include <linux/export.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/rcupdate_wait.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/delay.h> +#include <linux/srcu.h> + +#include "rcu.h" + +static void srcu_invoke_callbacks(struct work_struct *work); +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); + +/* + * Initialize SRCU combining tree. Note that statically allocated + * srcu_struct structures might already have srcu_read_lock() and + * srcu_read_unlock() running against them. So if the is_static parameter + * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + */ +static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) +{ + int cpu; + int i; + int level = 0; + int levelspread[RCU_NUM_LVLS]; + struct srcu_data *sdp; + struct srcu_node *snp; + struct srcu_node *snp_first; + + /* Work out the overall tree geometry. */ + sp->level[0] = &sp->node[0]; + for (i = 1; i < rcu_num_lvls; i++) + sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_init_levelspread(levelspread, num_rcu_lvl); + + /* Each pass through this loop initializes one srcu_node structure. */ + rcu_for_each_node_breadth_first(sp, snp) { + spin_lock_init(&snp->lock); + for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) + snp->srcu_have_cbs[i] = 0; + snp->grplo = -1; + snp->grphi = -1; + if (snp == &sp->node[0]) { + /* Root node, special case. */ + snp->srcu_parent = NULL; + continue; + } + + /* Non-root node. 
*/ + if (snp == sp->level[level + 1]) + level++; + snp->srcu_parent = sp->level[level - 1] + + (snp - sp->level[level]) / + levelspread[level - 1]; + } + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + level = rcu_num_lvls - 1; + snp_first = sp->level[level]; + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(sp->sda, cpu); + spin_lock_init(&sdp->lock); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; + sdp->mynode = &snp_first[cpu / levelspread[level]]; + for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { + if (snp->grplo < 0) + snp->grplo = cpu; + snp->grphi = cpu; + } + sdp->cpu = cpu; + INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); + sdp->sp = sp; + if (is_static) + continue; + + /* Dynamically allocated, better be no srcu_read_locks()! */ + for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) { + sdp->srcu_lock_count[i] = 0; + sdp->srcu_unlock_count[i] = 0; + } + } +} + +/* + * Initialize non-compile-time initialized fields, including the + * associated srcu_node and srcu_data structures. The is_static + * parameter is passed through to init_srcu_struct_nodes(), and + * also tells us that ->sda has already been wired up to srcu_data. + */ +static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) +{ + mutex_init(&sp->srcu_cb_mutex); + mutex_init(&sp->srcu_gp_mutex); + sp->srcu_idx = 0; + sp->srcu_gp_seq = 0; + atomic_set(&sp->srcu_exp_cnt, 0); + sp->srcu_barrier_seq = 0; + mutex_init(&sp->srcu_barrier_mutex); + atomic_set(&sp->srcu_barrier_cpu_cnt, 0); + INIT_DELAYED_WORK(&sp->work, process_srcu); + if (!is_static) + sp->sda = alloc_percpu(struct srcu_data); + init_srcu_struct_nodes(sp, is_static); + smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ + return sp->sda ? 0 : -ENOMEM; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + spin_lock_init(&sp->gp_lock); + return init_srcu_struct_fields(sp, false); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + spin_lock_init(&sp->gp_lock); + return init_srcu_struct_fields(sp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * First-use initialization of statically allocated srcu_struct + * structure. Wiring up the combining tree is more than can be + * done with compile-time initialization, so this check is added + * to each update-side SRCU primitive. Use ->gp_lock, which -is- + * compile-time initialized, to resolve races involving multiple + * CPUs trying to garner first-use privileges. + */ +static void check_init_srcu_struct(struct srcu_struct *sp) +{ + unsigned long flags; + + WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT); + /* The smp_load_acquire() pairs with the smp_store_release(). 
*/ + if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ + return; /* Already initialized. */ + spin_lock_irqsave(&sp->gp_lock, flags); + if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { + spin_unlock_irqrestore(&sp->gp_lock, flags); + return; + } + init_srcu_struct_fields(sp, true); + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + +/* + * Returns approximate total of the readers' ->srcu_lock_count[] values + * for the rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_lock_count[idx]); + } + return sum; +} + +/* + * Returns approximate total of the readers' ->srcu_unlock_count[] values + * for the rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); + } + return sum; +} + +/* + * Return true if the number of pre-existing readers is determined to + * be zero. + */ +static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +{ + unsigned long unlocks; + + unlocks = srcu_readers_unlock_idx(sp, idx); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. Needs to be a smp_mb() as the read side may + * contain a read from a variable that is written to before the + * synchronize_srcu() in the write side. In this case smp_mb()s + * A and B act like the store buffering pattern. + * + * This smp_mb() also pairs with smp_mb() C to prevent accesses + * after the synchronize_srcu() from being executed before the + * grace period ends. + */ + smp_mb(); /* A */ + + /* + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does + * not mean that there are no more readers, as one could have read + * the current index but not have incremented the lock counter yet. + * + * Possible bug: There is no guarantee that there haven't been + * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were + * counted, meaning that this could return true even if there are + * still active readers. Since there are no memory barriers around + * srcu_flip(), the CPU is not required to increment ->srcu_idx + * before running srcu_readers_unlock_idx(), which means that there + * could be an arbitrarily large number of critical sections that + * execute after srcu_readers_unlock_idx() but use the old value + * of ->srcu_idx. + */ + return srcu_readers_lock_idx(sp, idx) == unlocks; +} + +/** + * srcu_readers_active - returns true if there are readers, and false + * otherwise + * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * + * Note that this is not an atomic primitive, and can therefore suffer + * severe errors when invoked on an active srcu_struct. That said, it + * can be useful as an error check at cleanup time.
+ */ +static bool srcu_readers_active(struct srcu_struct *sp) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_lock_count[0]); + sum += READ_ONCE(cpuc->srcu_lock_count[1]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[0]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[1]); + } + return sum; +} + +#define SRCU_INTERVAL 1 + +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + int cpu; + + WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); + if (WARN_ON(srcu_readers_active(sp))) + return; /* Leakage unless caller handles error. */ + flush_delayed_work(&sp->work); + for_each_possible_cpu(cpu) + flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(srcu_readers_active(sp))) { + pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + return; /* Caller forgot to stop doing call_srcu()? */ + } + free_percpu(sp->sda); + sp->sda = NULL; +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx) & 0x1; + __this_cpu_inc(sp->sda->srcu_lock_count[idx]); + smp_mb(); /* B */ /* Avoid leaking the critical section. */ + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + * Must be called from process context. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + smp_mb(); /* C */ /* Avoid leaking the critical section. */ + this_cpu_inc(sp->sda->srcu_unlock_count[idx]); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after a few microseconds, + * we repeatedly block for 1-millisecond time periods. + */ +#define SRCU_RETRY_CHECK_DELAY 5 + +/* + * Start an SRCU grace period. + */ +static void srcu_gp_start(struct srcu_struct *sp) +{ + struct srcu_data *sdp = this_cpu_ptr(sp->sda); + int state; + + RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), + "Invoked srcu_gp_start() without ->gp_lock!"); + WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_start(&sp->srcu_gp_seq); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + WARN_ON_ONCE(state != SRCU_STATE_SCAN1); +} + +/* + * Track online CPUs to guide callback workqueue placement. 
+ */ +DEFINE_PER_CPU(bool, srcu_online); + +void srcu_online_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), true); +} + +void srcu_offline_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), false); +} + +/* + * Place the workqueue handler on the specified CPU if online, otherwise + * just run it wherever. This is useful for placing workqueue handlers + * that are to invoke the specified CPU's callbacks. + */ +static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + bool ret; + + preempt_disable(); + if (READ_ONCE(per_cpu(srcu_online, cpu))) + ret = queue_delayed_work_on(cpu, wq, dwork, delay); + else + ret = queue_delayed_work(wq, dwork, delay); + preempt_enable(); + return ret; +} + +/* + * Schedule callback invocation for the specified srcu_data structure, + * if possible, on the corresponding CPU. + */ +static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) +{ + srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, + &sdp->work, delay); +} + +/* + * Schedule callback invocation for all srcu_data structures associated + * with the specified srcu_node structure, if possible, on the corresponding + * CPUs. + */ +static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp) +{ + int cpu; + + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) + srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), + atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); +} + +/* + * Note the end of an SRCU grace period. Initiates callback invocation + * and starts a new grace period if needed. + * + * The ->srcu_cb_mutex acquisition does not protect any data, but + * instead prevents more than one grace period from starting while we + * are initiating callback invocation. This allows the ->srcu_have_cbs[] + * array to have a finite number of elements. + */ +static void srcu_gp_end(struct srcu_struct *sp) +{ + bool cbs; + unsigned long gpseq; + int idx; + int idxnext; + struct srcu_node *snp; + + /* Prevent more than one additional grace period. */ + mutex_lock(&sp->srcu_cb_mutex); + + /* End the current grace period. */ + spin_lock_irq(&sp->gp_lock); + idx = rcu_seq_state(sp->srcu_gp_seq); + WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); + rcu_seq_end(&sp->srcu_gp_seq); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + spin_unlock_irq(&sp->gp_lock); + mutex_unlock(&sp->srcu_gp_mutex); + /* A new grace period can start at this point. But only one. */ + + /* Initiate callback invocation as needed. */ + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); + rcu_for_each_node_breadth_first(sp, snp) { + spin_lock_irq(&snp->lock); + cbs = false; + if (snp >= sp->level[rcu_num_lvls - 1]) + cbs = snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + spin_unlock_irq(&snp->lock); + if (cbs) { + smp_mb(); /* GP end before CB invocation. */ + srcu_schedule_cbs_snp(sp, snp); + } + } + + /* Callback initiation done, allow grace periods after next. */ + mutex_unlock(&sp->srcu_cb_mutex); + + /* Start a new grace period if needed. */ + spin_lock_irq(&sp->gp_lock); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + if (!rcu_seq_state(gpseq) && + ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { + srcu_gp_start(sp); + spin_unlock_irq(&sp->gp_lock); + /* Throttle expedited grace periods: Should be rare!
*/ + srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) && + rcu_seq_ctr(gpseq) & 0xf + ? 0 + : SRCU_INTERVAL); + } else { + spin_unlock_irq(&sp->gp_lock); + } +} + +/* + * Funnel-locking scheme to scalably mediate many concurrent grace-period + * requests. The winner has to do the work of actually starting grace + * period s. Losers must either ensure that their desired grace-period + * number is recorded on at least their leaf srcu_node structure, or they + * must take steps to invoke their own callbacks. + */ +static void srcu_funnel_gp_start(struct srcu_struct *sp, + struct srcu_data *sdp, + unsigned long s) +{ + unsigned long flags; + int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); + struct srcu_node *snp = sdp->mynode; + unsigned long snp_seq; + + /* Each pass through the loop does one level of the srcu_node tree. */ + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) + return; /* GP already done and CBs recorded. */ + spin_lock_irqsave(&snp->lock, flags); + if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { + snp_seq = snp->srcu_have_cbs[idx]; + spin_unlock_irqrestore(&snp->lock, flags); + if (snp == sdp->mynode && snp_seq != s) { + smp_mb(); /* CBs after GP! */ + srcu_schedule_cbs_sdp(sdp, 0); + } + return; + } + snp->srcu_have_cbs[idx] = s; + spin_unlock_irqrestore(&snp->lock, flags); + } + + /* Top of tree, must ensure the grace period will be started. */ + spin_lock_irqsave(&sp->gp_lock, flags); + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { + /* + * Record need for grace period s. Pair with load + * acquire setting up for initialization. + */ + smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ + } + + /* If grace period not already done and none in progress, start it. */ + if (!rcu_seq_done(&sp->srcu_gp_seq, s) && + rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + srcu_gp_start(sp); + queue_delayed_work(system_power_efficient_wq, &sp->work, + atomic_read(&sp->srcu_exp_cnt) + ? 0 + : SRCU_INTERVAL); + } + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + +/* + * Wait until all readers counted by array index idx complete, but + * loop an additional time if there is an expedited grace period pending. + * The caller must ensure that ->srcu_idx is not changed while checking. + */ +static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) +{ + for (;;) { + if (srcu_readers_active_idx_check(sp, idx)) + return true; + if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0) + return false; + udelay(SRCU_RETRY_CHECK_DELAY); + } +} + +/* + * Increment the ->srcu_idx counter so that future SRCU readers will + * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows + * us to wait for pre-existing readers in a starvation-free manner. + */ +static void srcu_flip(struct srcu_struct *sp) +{ + WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); + + /* + * Ensure that if the updater misses an __srcu_read_unlock() + * increment, that task's next __srcu_read_lock() will see the + * above counter update. Note that both this memory barrier + * and the one in srcu_readers_active_idx_check() provide the + * guarantee for __srcu_read_lock(). + */ + smp_mb(); /* D */ /* Pairs with C. */ +} + +/* + * Enqueue an SRCU callback on the srcu_data structure associated with + * the current CPU and the specified srcu_struct structure, initiating + * grace-period processing if it is not already running. 
+ * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing SRCU read-side critical section. On systems with + * more than one CPU, this means that when "func()" is invoked, each CPU + * is guaranteed to have executed a full memory barrier since the end of + * its last corresponding SRCU read-side critical section whose beginning + * preceded the call to call_rcu(). It also means that each CPU executing + * an SRCU read-side critical section that continues beyond the start of + * "func()" must have executed a memory barrier after the call_rcu() + * but before the beginning of that SRCU read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting SRCU callback function "func()", then both CPU A and CPU + * B are guaranteed to execute a full memory barrier during the time + * interval between the call to call_rcu() and the invocation of "func()". + * This guarantee applies even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + * + * Of course, these guarantees apply only for invocations of call_srcu(), + * srcu_read_lock(), and srcu_read_unlock() that are all passed the same + * srcu_struct structure. + */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, + rcu_callback_t func) +{ + unsigned long flags; + bool needgp = false; + unsigned long s; + struct srcu_data *sdp; + + check_init_srcu_struct(sp); + rhp->func = func; + local_irq_save(flags); + sdp = this_cpu_ptr(sp->sda); + spin_lock(&sdp->lock); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + s = rcu_seq_snap(&sp->srcu_gp_seq); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { + sdp->srcu_gp_seq_needed = s; + needgp = true; + } + spin_unlock_irqrestore(&sdp->lock, flags); + if (needgp) + srcu_funnel_gp_start(sp, sdp, s); +} +EXPORT_SYMBOL_GPL(call_srcu); + +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). + */ +static void __synchronize_srcu(struct srcu_struct *sp) +{ + struct rcu_synchronize rcu; + + RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); + + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return; + might_sleep(); + check_init_srcu_struct(sp); + init_completion(&rcu.completion); + init_rcu_head_on_stack(&rcu.head); + call_srcu(sp, &rcu.head, wakeme_after_rcu); + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} + +/** + * synchronize_srcu_expedited - Brute-force SRCU grace period + * @sp: srcu_struct with which to synchronize. + * + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. + * + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + bool do_norm = rcu_gp_is_normal(); + + check_init_srcu_struct(sp); + if (!do_norm) { + atomic_inc(&sp->srcu_exp_cnt); + smp_mb__after_atomic(); /* increment before GP. 
*/ + } + __synchronize_srcu(sp); + if (!do_norm) { + smp_mb__before_atomic(); /* GP before decrement. */ + WARN_ON_ONCE(atomic_dec_return(&sp->srcu_exp_cnt) < 0); + } +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + +/** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Wait for the count to drain to zero of both indexes. To avoid the + * possible starvation of synchronize_srcu(), it waits for the count of + * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, + * and then flip the srcu_idx and wait for the count of the other index. + * + * Can block; must be called from process context. + * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section, + * as long as the resulting graph of srcu_structs is acyclic. + * + * There are memory-ordering constraints implied by synchronize_srcu(). + * On systems with more than one CPU, when synchronize_srcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last corresponding SRCU-sched read-side critical section + * whose beginning preceded the call to synchronize_srcu(). In addition, + * each CPU having an SRCU read-side critical section that extends beyond + * the return from synchronize_srcu() is guaranteed to have executed a + * full memory barrier after the beginning of synchronize_srcu() and before + * the beginning of that SRCU read-side critical section. Note that these + * guarantees include CPUs that are offline, idle, or executing in user mode, + * as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_srcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_srcu(). This guarantee applies even if CPU A and CPU B + * are the same CPU, but again only if the system has more than one CPU. + * + * Of course, these memory-ordering guarantees apply only when + * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are + * passed the same srcu_struct structure. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + if (rcu_gp_is_expedited()) + synchronize_srcu_expedited(sp); + else + __synchronize_srcu(sp); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); + +/* + * Callback function for srcu_barrier() use. + */ +static void srcu_barrier_cb(struct rcu_head *rhp) +{ + struct srcu_data *sdp; + struct srcu_struct *sp; + + sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); + sp = sdp->sp; + if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) + complete(&sp->srcu_barrier_completion); +} + +/** + * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + * @sp: srcu_struct on which to wait for in-flight callbacks. + */ +void srcu_barrier(struct srcu_struct *sp) +{ + int cpu; + struct srcu_data *sdp; + unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); + + check_init_srcu_struct(sp); + mutex_lock(&sp->srcu_barrier_mutex); + if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { + smp_mb(); /* Force ordering following return. */ + mutex_unlock(&sp->srcu_barrier_mutex); + return; /* Someone else did our work for us. 
*/ + } + rcu_seq_start(&sp->srcu_barrier_seq); + init_completion(&sp->srcu_barrier_completion); + + /* Initial count prevents reaching zero until all CBs are posted. */ + atomic_set(&sp->srcu_barrier_cpu_cnt, 1); + + /* + * Each pass through this loop enqueues a callback, but only + * on CPUs already having callbacks enqueued. Note that if + * a CPU already has callbacks enqueue, it must have already + * registered the need for a future grace period, so all we + * need do is enqueue a callback that will use the same + * grace period as the last callback already in the queue. + */ + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(sp->sda, cpu); + spin_lock_irq(&sdp->lock); + atomic_inc(&sp->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head, 0)) + atomic_dec(&sp->srcu_barrier_cpu_cnt); + spin_unlock_irq(&sdp->lock); + } + + /* Remove the initial count, at which point reaching zero can happen. */ + if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) + complete(&sp->srcu_barrier_completion); + wait_for_completion(&sp->srcu_barrier_completion); + + rcu_seq_end(&sp->srcu_barrier_seq); + mutex_unlock(&sp->srcu_barrier_mutex); +} +EXPORT_SYMBOL_GPL(srcu_barrier); + +/** + * srcu_batches_completed - return batches completed. + * @sp: srcu_struct on which to report batch completion. + * + * Report the number of batches, correlated with, but not necessarily + * precisely the same as, the number of grace periods that have elapsed. + */ +unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return sp->srcu_idx; +} +EXPORT_SYMBOL_GPL(srcu_batches_completed); + +/* + * Core SRCU state machine. Push state bits of ->srcu_gp_seq + * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has + * completed in that state. + */ +static void srcu_advance_state(struct srcu_struct *sp) +{ + int idx; + + mutex_lock(&sp->srcu_gp_mutex); + + /* + * Because readers might be delayed for an extended period after + * fetching ->srcu_idx for their index, at any point in time there + * might well be readers using both idx=0 and idx=1. We therefore + * need to wait for readers to clear from both index values before + * invoking a callback. + * + * The load-acquire ensures that we see the accesses performed + * by the prior grace period. + */ + idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ + if (idx == SRCU_STATE_IDLE) { + spin_lock_irq(&sp->gp_lock); + if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); + spin_unlock_irq(&sp->gp_lock); + mutex_unlock(&sp->srcu_gp_mutex); + return; + } + idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + if (idx == SRCU_STATE_IDLE) + srcu_gp_start(sp); + spin_unlock_irq(&sp->gp_lock); + if (idx != SRCU_STATE_IDLE) { + mutex_unlock(&sp->srcu_gp_mutex); + return; /* Someone else started the grace period. */ + } + } + + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + idx = 1 ^ (sp->srcu_idx & 1); + if (!try_check_zero(sp, idx, 1)) { + mutex_unlock(&sp->srcu_gp_mutex); + return; /* readers present, retry later. */ + } + srcu_flip(sp); + rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); + } + + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + + /* + * SRCU read-side critical sections are normally short, + * so check at least twice in quick succession after a flip. 
+ */ + idx = 1 ^ (sp->srcu_idx & 1); + if (!try_check_zero(sp, idx, 2)) { + mutex_unlock(&sp->srcu_gp_mutex); + return; /* readers present, retry later. */ + } + srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */ + } +} + +/* + * Invoke a limited number of SRCU callbacks that have passed through + * their grace period. If there are more to do, SRCU will reschedule + * the workqueue. Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). + */ +static void srcu_invoke_callbacks(struct work_struct *work) +{ + bool more; + struct rcu_cblist ready_cbs; + struct rcu_head *rhp; + struct srcu_data *sdp; + struct srcu_struct *sp; + + sdp = container_of(work, struct srcu_data, work.work); + sp = sdp->sp; + rcu_cblist_init(&ready_cbs); + spin_lock_irq(&sdp->lock); + smp_mb(); /* Old grace periods before callback invocation! */ + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + if (sdp->srcu_cblist_invoking || + !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { + spin_unlock_irq(&sdp->lock); + return; /* Someone else on the job or nothing to do. */ + } + + /* We are on the job! Extract and invoke ready callbacks. */ + sdp->srcu_cblist_invoking = true; + rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sdp->lock); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + + /* + * Update counts, accelerate new callbacks, and if needed, + * schedule another round of callback invocation. + */ + spin_lock_irq(&sdp->lock); + rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + sdp->srcu_cblist_invoking = false; + more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); + spin_unlock_irq(&sdp->lock); + if (more) + srcu_schedule_cbs_sdp(sdp, 0); +} + +/* + * Finished one round of SRCU grace period. Start another if there are + * more SRCU callbacks queued, otherwise put SRCU into not-running state. + */ +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) +{ + bool pushgp = true; + + spin_lock_irq(&sp->gp_lock); + if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { + /* All requests fulfilled, time to go idle. */ + pushgp = false; + } + } else if (!rcu_seq_state(sp->srcu_gp_seq)) { + /* Outstanding request and no GP. Start one. */ + srcu_gp_start(sp); + } + spin_unlock_irq(&sp->gp_lock); + + if (pushgp) + queue_delayed_work(system_power_efficient_wq, &sp->work, delay); +} + +/* + * This is the work-queue function that handles SRCU grace periods. + */ +void process_srcu(struct work_struct *work) +{ + struct srcu_struct *sp; + + sp = container_of(work, struct srcu_struct, work.work); + + srcu_advance_state(sp); + srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 
0 : SRCU_INTERVAL); +} +EXPORT_SYMBOL_GPL(process_srcu); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 6ad330dbbae2..e5385731e391 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -79,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching); */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - RCU_TRACE(reset_cpu_stall_ticks(rcp)); + RCU_TRACE(reset_cpu_stall_ticks(rcp);) if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; @@ -125,7 +125,7 @@ void rcu_bh_qs(void) */ void rcu_check_callbacks(int user) { - RCU_TRACE(check_cpu_stalls()); + RCU_TRACE(check_cpu_stalls();) if (user) rcu_sched_qs(); else if (!in_softirq()) @@ -143,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; - RCU_TRACE(int cb_count = 0); + RCU_TRACE(int cb_count = 0;) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); @@ -152,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); return; } - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. */ - RCU_TRACE(rn = rcp->name); + RCU_TRACE(rn = rcp->name;) while (list) { next = list->next; prefetch(next); @@ -171,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) __rcu_reclaim(rn, list); local_bh_enable(); list = next; - RCU_TRACE(cb_count++); + RCU_TRACE(cb_count++;) } - RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), is_idle_task(current), @@ -221,7 +221,7 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; - RCU_TRACE(rcp->qlen++); + RCU_TRACE(rcp->qlen++;) local_irq_restore(flags); if (unlikely(is_idle_task(current))) { @@ -254,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) + RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index c64b827ecbca..371034e77f87 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { RCU_TRACE(.name = "rcu_bh") }; -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) #include <linux/kernel_stat.h> int rcu_scheduler_active __read_mostly; @@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. * The reason for this is that Tiny RCU does not need kthreads, so does * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. + * at a certain phase of the boot process. Unless SRCU is in the mix. */ void __init rcu_scheduler_starting(void) { WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) + ? 
RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; } -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ #ifdef CONFIG_RCU_TRACE @@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) static void check_cpu_stalls(void) { - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);) + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);) } #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 50fee7689e71..23aa02587d0f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,8 +97,8 @@ struct rcu_state sname##_state = { \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ - .orphan_nxttail = &sname##_state.orphan_nxtlist, \ - .orphan_donetail = &sname##_state.orphan_donelist, \ + .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ + .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ @@ -123,7 +123,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF; module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ -static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; +int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* panic() on RCU Stall sysctl. */ int sysctl_panic_on_rcu_stall __read_mostly; @@ -199,7 +199,7 @@ static const int gp_cleanup_delay; /* * Number of grace periods between delays, normalized by the duration of - * the delay. The longer the the delay, the more the grace periods between + * the delay. The longer the delay, the more the grace periods between * each delay. The reason for this normalization is that it means that, * for non-zero delays, the overall slowdown of grace periods is constant * regardless of the duration of the delay. This arrangement balances @@ -272,11 +272,19 @@ void rcu_bh_qs(void) } } -static DEFINE_PER_CPU(int, rcu_sched_qs_mask); +/* + * Steal a bit from the bottom of ->dynticks for idle entry/exit + * control. Initially this is for TLB flushing. + */ +#define RCU_DYNTICK_CTRL_MASK 0x1 +#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) +#ifndef rcu_eqs_special_exit +#define rcu_eqs_special_exit() do { } while (0) +#endif static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, - .dynticks = ATOMIC_INIT(1), + .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), #ifdef CONFIG_NO_HZ_FULL_SYSIDLE .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, .dynticks_idle = ATOMIC_INIT(1), @@ -290,15 +298,20 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { static void rcu_dynticks_eqs_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special; + int seq; /* - * CPUs seeing atomic_inc_return() must see prior RCU read-side + * CPUs seeing atomic_add_return() must see prior RCU read-side * critical sections, and we also must force ordering with the * next idle sojourn. */ - special = atomic_inc_return(&rdtp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + /* Better be in an extended quiescent state! 
*/ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_CTR)); + /* Better not have special action (TLB flush) pending! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_MASK)); } /* @@ -308,15 +321,22 @@ static void rcu_dynticks_eqs_enter(void) static void rcu_dynticks_eqs_exit(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special; + int seq; /* - * CPUs seeing atomic_inc_return() must see prior idle sojourns, + * CPUs seeing atomic_add_return() must see prior idle sojourns, * and we also must force ordering with the next RCU read-side * critical section. */ - special = atomic_inc_return(&rdtp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !(seq & RCU_DYNTICK_CTRL_CTR)); + if (seq & RCU_DYNTICK_CTRL_MASK) { + atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); + smp_mb__after_atomic(); /* _exit after clearing mask. */ + /* Prefer duplicate flushes to losing a flush. */ + rcu_eqs_special_exit(); + } } /* @@ -333,9 +353,9 @@ static void rcu_dynticks_eqs_online(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - if (atomic_read(&rdtp->dynticks) & 0x1) + if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) return; - atomic_add(0x1, &rdtp->dynticks); + atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); } /* @@ -347,7 +367,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - return !(atomic_read(&rdtp->dynticks) & 0x1); + return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* @@ -358,7 +378,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) { int snap = atomic_add_return(0, &rdtp->dynticks); - return snap; + return snap & ~RCU_DYNTICK_CTRL_MASK; } /* @@ -367,7 +387,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & 0x1); + return !(snap & RCU_DYNTICK_CTRL_CTR); } /* @@ -387,14 +407,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) static void rcu_dynticks_momentary_idle(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special = atomic_add_return(2, &rdtp->dynticks); + int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, + &rdtp->dynticks); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & 0x1)); + WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); } -DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); -EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); +/* + * Set the special (bottom) bit of the specified CPU so that it + * will take special action (such as flushing its TLB) on the + * next exit from an extended quiescent state. Returns true if + * the bit was successfully set, or false if the CPU was not in + * an extended quiescent state. + */ +bool rcu_eqs_special_set(int cpu) +{ + int old; + int new; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + do { + old = atomic_read(&rdtp->dynticks); + if (old & RCU_DYNTICK_CTRL_CTR) + return false; + new = old | RCU_DYNTICK_CTRL_MASK; + } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); + return true; +} /* * Let the RCU core know that this CPU has gone through the scheduler, @@ -403,44 +443,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * memory barriers to let the RCU core know about it, regardless of what * this CPU might (or might not) do in the near future. 
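
As an aside: the RCU_DYNTICK_CTRL_MASK/RCU_DYNTICK_CTRL_CTR encoding and the rcu_eqs_special_set() cmpxchg loop introduced above can be modelled in ordinary user-space C11, which may make the counter bookkeeping easier to follow. This is a sketch under simplified assumptions (one CPU, no NMIs, invented names), not kernel code; compile with "cc -std=c11".

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CTRL_MASK	0x1		/* "special action requested" bit */
#define CTRL_CTR	(CTRL_MASK + 1)	/* one idle entry/exit tick */

static atomic_int dynticks = CTRL_CTR;	/* start out non-idle, as above */

static bool in_eqs(int snap)
{
	return !(snap & CTRL_CTR);
}

static void eqs_enter(void)		/* cf. rcu_dynticks_eqs_enter() */
{
	int seq = atomic_fetch_add(&dynticks, CTRL_CTR) + CTRL_CTR;

	if ((seq & CTRL_CTR) || (seq & CTRL_MASK))
		printf("bad EQS entry: %#x\n", seq);
}

static void eqs_exit(void)		/* cf. rcu_dynticks_eqs_exit() */
{
	int seq = atomic_fetch_add(&dynticks, CTRL_CTR) + CTRL_CTR;

	if (!(seq & CTRL_CTR))
		printf("bad EQS exit: %#x\n", seq);
	if (seq & CTRL_MASK) {
		atomic_fetch_and(&dynticks, ~CTRL_MASK);
		printf("running deferred action (e.g. a TLB flush)\n");
	}
}

/* Set the special bit, but only if the CPU is currently idle. */
static bool eqs_special_set(void)	/* cf. rcu_eqs_special_set() */
{
	int old = atomic_load(&dynticks);

	do {
		if (!in_eqs(old))
			return false;	/* busy: caller must poke it instead */
	} while (!atomic_compare_exchange_weak(&dynticks, &old,
					       old | CTRL_MASK));
	return true;
}

int main(void)
{
	eqs_enter();					/* CPU goes idle */
	printf("set while idle: %d\n", eqs_special_set());	/* 1 */
	eqs_exit();					/* flag consumed here */
	printf("set while busy: %d\n", eqs_special_set());	/* 0 */
	return 0;
}
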
* - * We inform the RCU core by emulating a zero-duration dyntick-idle - * period, which we in turn do by incrementing the ->dynticks counter - * by two. + * We inform the RCU core by emulating a zero-duration dyntick-idle period. * * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - struct rcu_data *rdp; - int resched_mask; - struct rcu_state *rsp; - - /* - * Yes, we can lose flag-setting operations. This is OK, because - * the flag will be set again after some delay. - */ - resched_mask = raw_cpu_read(rcu_sched_qs_mask); - raw_cpu_write(rcu_sched_qs_mask, 0); - - /* Find the flavor that needs a quiescent state. */ - for_each_rcu_flavor(rsp) { - rdp = raw_cpu_ptr(rsp->rda); - if (!(resched_mask & rsp->flavor_mask)) - continue; - smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ - if (READ_ONCE(rdp->mynode->completed) != - READ_ONCE(rdp->cond_resched_completed)) - continue; - - /* - * Pretend to be momentarily idle for the quiescent state. - * This allows the grace-period kthread to record the - * quiescent state, with no need for this CPU to do anything - * further. - */ - rcu_dynticks_momentary_idle(); - break; - } + raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); + rcu_dynticks_momentary_idle(); } /* @@ -448,14 +458,22 @@ static void rcu_momentary_dyntick_idle(void) * and requires special handling for preemptible RCU. * The caller must have disabled interrupts. */ -void rcu_note_context_switch(void) +void rcu_note_context_switch(bool preempt) { barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) + goto out; + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); + if (!preempt) + rcu_note_voluntary_context_switch_lite(current); +out: trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -478,29 +496,26 @@ void rcu_all_qs(void) { unsigned long flags; + if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) + return; + preempt_disable(); + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { + preempt_enable(); + return; + } + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); } - if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { - /* - * Yes, we just checked a per-CPU variable with preemption - * enabled, so we might be migrated to some other CPU at - * this point. That is OK because in that case, the - * migration will supply the needed quiescent state. - * We might end up needlessly disabling preemption and - * invoking rcu_sched_qs() on the destination CPU, but - * the probability and cost are both quite low, so this - * should not be a problem in practice. 
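
As an aside: the new ->rcu_urgent_qs/->rcu_need_heavy_qs pairing used by rcu_note_context_switch() and rcu_all_qs() is essentially a release/acquire handshake around a cheap per-CPU gate flag. The user-space model below, with invented names and a single requester/responder pair, sketches only that handshake; the real code additionally folds in rcu_qs_ctr, per-CPU accessors, and interrupt disabling.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool urgent_qs;		/* cheap gate, checked on every switch */
static atomic_bool need_heavy_qs;	/* GP is old: want a heavyweight QS */
static unsigned long qs_ctr;		/* lightweight quiescent-state counter */

/* Grace-period side: request a quiescent state from this CPU. */
static void request_qs(bool heavy)
{
	if (heavy)
		atomic_store_explicit(&need_heavy_qs, true,
				      memory_order_relaxed);
	/* Store need_heavy_qs before urgent_qs; pairs with acquire below. */
	atomic_store_explicit(&urgent_qs, true, memory_order_release);
}

/* CPU side: called from the context-switch path. */
static void note_context_switch(void)
{
	/* Plain-load fast path: nothing urgent, do no extra work. */
	if (!atomic_load_explicit(&urgent_qs, memory_order_relaxed))
		return;
	/* Acquire pairs with the release in request_qs(). */
	if (!atomic_load_explicit(&urgent_qs, memory_order_acquire))
		return;
	atomic_store_explicit(&urgent_qs, false, memory_order_relaxed);
	if (atomic_exchange_explicit(&need_heavy_qs, false,
				     memory_order_relaxed))
		printf("emulating a momentary dyntick-idle period\n");
	qs_ctr++;			/* always report the cheap QS */
}

int main(void)
{
	note_context_switch();		/* fast path, nothing to do */
	request_qs(true);
	note_context_switch();		/* heavyweight response plus qs_ctr++ */
	printf("qs_ctr = %lu\n", qs_ctr);
	return 0;
}
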
- */ - preempt_disable(); + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) rcu_sched_qs(); - preempt_enable(); - } - this_cpu_inc(rcu_qs_ctr); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ + preempt_enable(); } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -713,16 +728,6 @@ void rcutorture_record_progress(unsigned long vernum) EXPORT_SYMBOL_GPL(rcutorture_record_progress); /* - * Does the CPU have callbacks ready to be invoked? - */ -static int -cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) -{ - return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && - rdp->nxttail[RCU_NEXT_TAIL] != NULL; -} - -/* * Return the root node of the specified rcu_state structure. */ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) @@ -752,21 +757,17 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - int i; - if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) return true; /* Yes, a no-CBs CPU needs one. */ - if (!rdp->nxttail[RCU_NEXT_TAIL]) + if (!rcu_segcblist_is_enabled(&rdp->cblist)) return false; /* No, this is a no-CBs (or offline) CPU. */ - if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) + if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return true; /* Yes, CPU has newly registered callbacks. */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rdp->nxttail[i - 1] != rdp->nxttail[i] && - ULONG_CMP_LT(READ_ONCE(rsp->completed), - rdp->nxtcompleted[i])) - return true; /* Yes, CBs for future grace period. */ + if (rcu_segcblist_future_gp_needed(&rdp->cblist, + READ_ONCE(rsp->completed))) + return true; /* Yes, CBs for future grace period. */ return false; /* No grace period needed. */ } @@ -1150,6 +1151,24 @@ bool notrace rcu_is_watching(void) } EXPORT_SYMBOL_GPL(rcu_is_watching); +/* + * If a holdout task is actually running, request an urgent quiescent + * state from its CPU. This is unsynchronized, so migrations can cause + * the request to go to the wrong CPU. Which is OK, all that will happen + * is that the CPU's next context switch will be a bit slower and next + * time around this task will generate another request. + */ +void rcu_request_urgent_qs_task(struct task_struct *t) +{ + int cpu; + + barrier(); + cpu = task_cpu(t); + if (!task_curr(t)) + return; /* This task is not running on that CPU. */ + smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true); +} + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* @@ -1235,7 +1254,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { unsigned long jtsq; - int *rcrmp; + bool *rnhqp; + bool *ruqp; unsigned long rjtsc; struct rcu_node *rnp; @@ -1271,11 +1291,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * might not be the case for nohz_full CPUs looping in the kernel. */ rnp = rdp->mynode; + ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && - READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && + READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); return 1; + } else { + /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ + smp_store_release(ruqp, true); } /* Check for the CPU being offline. 
*/ @@ -1292,7 +1316,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * in-kernel CPU-bound tasks cannot advance grace periods. * So if the grace period is old enough, make the CPU pay attention. * Note that the unsynchronized assignments to the per-CPU - * rcu_sched_qs_mask variable are safe. Yes, setting of + * rcu_need_heavy_qs variable are safe. Yes, setting of * bits can be lost, but they will be set again on the next * force-quiescent-state pass. So lost bit sets do not result * in incorrect behavior, merely in a grace period lasting @@ -1306,16 +1330,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * is set too high, we override with half of the RCU CPU stall * warning delay. */ - rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); - if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || - time_after(jiffies, rdp->rsp->jiffies_resched)) { - if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { - WRITE_ONCE(rdp->cond_resched_completed, - READ_ONCE(rdp->mynode->completed)); - smp_mb(); /* ->cond_resched_completed before *rcrmp. */ - WRITE_ONCE(*rcrmp, - READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - } + rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); + if (!READ_ONCE(*rnhqp) && + (time_after(jiffies, rdp->rsp->gp_start + jtsq) || + time_after(jiffies, rdp->rsp->jiffies_resched))) { + WRITE_ONCE(*rnhqp, true); + /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ + smp_store_release(ruqp, true); rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } @@ -1475,7 +1496,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + cpu)->cblist); pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), (long)rsp->gpnum, (long)rsp->completed, totqlen); @@ -1529,7 +1551,8 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + cpu)->cblist); pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); @@ -1632,30 +1655,6 @@ void rcu_cpu_stall_reset(void) } /* - * Initialize the specified rcu_data structure's default callback list - * to empty. The default callback list is the one that is not used by - * no-callbacks CPUs. - */ -static void init_default_callback_list(struct rcu_data *rdp) -{ - int i; - - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; -} - -/* - * Initialize the specified rcu_data structure's callback list to empty. - */ -static void init_callback_list(struct rcu_data *rdp) -{ - if (init_nocb_callback_list(rdp)) - return; - init_default_callback_list(rdp); -} - -/* * Determine the value that ->completed will have at the end of the * next subsequent grace period. This is used to tag callbacks so that * a CPU can invoke callbacks in a timely fashion even if that CPU has @@ -1709,7 +1708,6 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long *c_out) { unsigned long c; - int i; bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); @@ -1755,13 +1753,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Get a new grace-period number. 
If there really is no grace * period in progress, it will be smaller than the one we obtained - * earlier. Adjust callbacks as needed. Note that even no-CBs - * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. + * earlier. Adjust callbacks as needed. */ c = rcu_cbs_completed(rdp->rsp, rnp_root); - for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) - if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) - rdp->nxtcompleted[i] = c; + if (!rcu_is_nocb_cpu(rdp->cpu)) + (void)rcu_segcblist_accelerate(&rdp->cblist, c); /* * If the needed for the required grace period is already @@ -1793,9 +1789,7 @@ out: /* * Clean up any old requests for the just-ended grace period. Also return - * whether any additional grace periods have been requested. Also invoke - * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads - * waiting for this grace period to complete. + * whether any additional grace periods have been requested. */ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { @@ -1841,57 +1835,27 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - unsigned long c; - int i; - bool ret; - - /* If the CPU has no callbacks, nothing to do. */ - if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return false; - - /* - * Starting from the sublist containing the callbacks most - * recently assigned a ->completed number and working down, find the - * first sublist that is not assignable to an upcoming grace period. - * Such a sublist has something in it (first two tests) and has - * a ->completed number assigned that will complete sooner than - * the ->completed number for newly arrived callbacks (last test). - * - * The key point is that any later sublist can be assigned the - * same ->completed number as the newly arrived callbacks, which - * means that the callbacks in any of these later sublist can be - * grouped into a single sublist, whether or not they have already - * been assigned a ->completed number. - */ - c = rcu_cbs_completed(rsp, rnp); - for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) - if (rdp->nxttail[i] != rdp->nxttail[i - 1] && - !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) - break; + bool ret = false; - /* - * If there are no sublist for unassigned callbacks, leave. - * At the same time, advance "i" one sublist, so that "i" will - * index into the sublist where all the remaining callbacks should - * be grouped into. - */ - if (++i >= RCU_NEXT_TAIL) + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; /* - * Assign all subsequent callbacks' ->completed number to the next - * full grace period and group them all in the sublist initially - * indexed by "i". + * Callbacks are often registered with incomplete grace-period + * information. Something about the fact that getting exact + * information requires acquiring a global lock... RCU therefore + * makes a conservative estimate of the grace period number at which + * a given callback will become ready to invoke. The following + * code checks this estimate and improves it when possible, thus + * accelerating callback invocation to an earlier grace-period + * number. */ - for (; i <= RCU_NEXT_TAIL; i++) { - rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxtcompleted[i] = c; - } - /* Record any needed additional grace periods. 
*/ - ret = rcu_start_future_gp(rnp, rdp, NULL); + if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) + ret = rcu_start_future_gp(rnp, rdp, NULL); /* Trace depending on how much we were able to accelerate. */ - if (!*rdp->nxttail[RCU_WAIT_TAIL]) + if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); else trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); @@ -1911,32 +1875,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i, j; - - /* If the CPU has no callbacks, nothing to do. */ - if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; /* * Find all callbacks whose ->completed numbers indicate that they * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { - if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) - break; - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; - } - /* Clean up any sublist tail pointers that were misordered above. */ - for (j = RCU_WAIT_TAIL; j < i; j++) - rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; - - /* Copy down callbacks to fill in empty sublists. */ - for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { - if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) - break; - rdp->nxttail[j] = rdp->nxttail[i]; - rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; - } + rcu_segcblist_advance(&rdp->cblist, rnp->completed); /* Classify any remaining callbacks. */ return rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1981,7 +1928,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); @@ -2579,7 +2526,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -2653,13 +2600,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * because _rcu_barrier() excludes CPU-hotplug operations, so it * cannot be running now. Thus no memory barrier is required. */ - if (rdp->nxtlist != NULL) { - rsp->qlen_lazy += rdp->qlen_lazy; - rsp->qlen += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; - rdp->qlen_lazy = 0; - WRITE_ONCE(rdp->qlen, 0); - } + rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); + rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); /* * Next, move those callbacks still needing a grace period to @@ -2667,31 +2609,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * Some of the callbacks might have gone partway through a grace * period, but that is too bad. They get to start over because we * cannot assume that grace periods are synchronized across CPUs. 
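
As an aside: the rcu_cblist/rcu_segcblist machinery that these hunks switch to is built around singly linked lists carrying a tail pointer-to-pointer, so an entire list (for example, an outgoing CPU's callbacks) can be spliced onto another list in O(1) and later drained under a batch limit, much as rcu_do_batch() does further below. The user-space model that follows uses invented names and omits the lazy counts and per-segment grace-period numbers that the real lists also track; compile with "cc -std=c11".

#include <stdio.h>
#include <stdlib.h>

struct cb {
	struct cb *next;
	int nr;
};

struct cblist {
	struct cb *head;
	struct cb **tail;	/* points at the last ->next (or at ->head) */
	long len;
};

static void cblist_init(struct cblist *cl)
{
	cl->head = NULL;
	cl->tail = &cl->head;
	cl->len = 0;
}

static void cblist_enqueue(struct cblist *cl, struct cb *c)
{
	c->next = NULL;
	*cl->tail = c;
	cl->tail = &c->next;
	cl->len++;
}

/* Move every callback from @from onto the end of @to in O(1). */
static void cblist_splice(struct cblist *to, struct cblist *from)
{
	if (!from->head)
		return;
	*to->tail = from->head;
	to->tail = from->tail;
	to->len += from->len;
	cblist_init(from);
}

int main(void)
{
	struct cblist cpu, orphan;
	struct cb *c;
	long batch_limit = 2, invoked = 0;
	int i;

	cblist_init(&cpu);
	cblist_init(&orphan);
	for (i = 0; i < 5; i++) {
		c = malloc(sizeof(*c));
		if (!c)
			return 1;
		c->nr = i;
		cblist_enqueue(&cpu, c);
	}

	/* "CPU goes offline": hand all of its callbacks to the orphan list. */
	cblist_splice(&orphan, &cpu);

	/* Invoke at most batch_limit callbacks per pass. */
	while (orphan.head && invoked < batch_limit) {
		c = orphan.head;
		orphan.head = c->next;
		if (!orphan.head)
			orphan.tail = &orphan.head;
		orphan.len--;
		printf("invoking callback %d\n", c->nr);
		free(c);
		invoked++;
	}
	printf("%ld callback(s) left for the next pass\n", orphan.len);
	return 0;
}
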
- * We don't bother updating the ->nxttail[] array yet, instead - * we just reset the whole thing later on. */ - if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { - *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; - rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = NULL; - } + rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); /* * Then move the ready-to-invoke callbacks to the orphanage, * where some other CPU will pick them up. These will not be * required to pass though another grace period: They are done. */ - if (rdp->nxtlist != NULL) { - *rsp->orphan_donetail = rdp->nxtlist; - rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; - } + rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - /* - * Finally, initialize the rcu_data structure's list to empty and - * disallow further callbacks on this CPU. - */ - init_callback_list(rdp); - rdp->nxttail[RCU_NEXT_TAIL] = NULL; + /* Finally, disallow further callbacks on this CPU. */ + rcu_segcblist_disable(&rdp->cblist); } /* @@ -2700,7 +2629,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, */ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { - int i; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. */ @@ -2709,13 +2637,11 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) return; /* Do the accounting first. */ - rdp->qlen_lazy += rsp->qlen_lazy; - rdp->qlen += rsp->qlen; - rdp->n_cbs_adopted += rsp->qlen; - if (rsp->qlen_lazy != rsp->qlen) + rdp->n_cbs_adopted += rcu_cblist_n_cbs(&rsp->orphan_done); + if (rcu_cblist_n_lazy_cbs(&rsp->orphan_done) != + rcu_cblist_n_cbs(&rsp->orphan_done)) rcu_idle_count_callbacks_posted(); - rsp->qlen_lazy = 0; - rsp->qlen = 0; + rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); /* * We do not need a memory barrier here because the only way we @@ -2723,24 +2649,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) * we are the task doing the rcu_barrier(). */ - /* First adopt the ready-to-invoke callbacks. */ - if (rsp->orphan_donelist != NULL) { - *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; - for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) - if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[i] = rsp->orphan_donetail; - rsp->orphan_donelist = NULL; - rsp->orphan_donetail = &rsp->orphan_donelist; - } - - /* And then adopt the callbacks that still need a grace period. */ - if (rsp->orphan_nxtlist != NULL) { - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; - rsp->orphan_nxtlist = NULL; - rsp->orphan_nxttail = &rsp->orphan_nxtlist; - } + /* First adopt the ready-to-invoke callbacks, then the done ones. 
*/ + rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); + WARN_ON_ONCE(!rcu_cblist_empty(&rsp->orphan_done)); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); + WARN_ON_ONCE(!rcu_cblist_empty(&rsp->orphan_pend)); + WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != + !rcu_segcblist_n_cbs(&rdp->cblist)); } /* @@ -2748,14 +2663,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - RCU_TRACE(unsigned long mask); - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); - RCU_TRACE(struct rcu_node *rnp = rdp->mynode); + RCU_TRACE(unsigned long mask;) + RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) + RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return; - RCU_TRACE(mask = rdp->grpmask); + RCU_TRACE(mask = rdp->grpmask;) trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), TPS("cpuofl")); @@ -2828,9 +2743,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, - "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", - cpu, rdp->qlen, rdp->nxtlist); + WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || + !rcu_segcblist_empty(&rdp->cblist), + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", + cpu, rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_first_cb(&rdp->cblist)); } /* @@ -2840,14 +2757,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; - struct rcu_head *next, *list, **tail; - long bl, count, count_lazy; - int i; + struct rcu_head *rhp; + struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); + long bl, count; /* If no callbacks are ready, just return. */ - if (!cpu_has_callbacks_ready_to_invoke(rdp)) { - trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); - trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), + if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { + trace_rcu_batch_start(rsp->name, + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), 0); + trace_rcu_batch_end(rsp->name, 0, + !rcu_segcblist_empty(&rdp->cblist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); return; @@ -2855,73 +2775,62 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* * Extract the list of ready callbacks, disabling to prevent - * races with call_rcu() from interrupt handlers. + * races with call_rcu() from interrupt handlers. Leave the + * callback counts, as rcu_barrier() needs to be conservative. */ local_irq_save(flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); - list = rdp->nxtlist; - rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = NULL; - tail = rdp->nxttail[RCU_DONE_TAIL]; - for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) - if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[i] = &rdp->nxtlist; + trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), bl); + rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); local_irq_restore(flags); /* Invoke callbacks. 
*/ - count = count_lazy = 0; - while (list) { - next = list->next; - prefetch(next); - debug_rcu_head_unqueue(list); - if (__rcu_reclaim(rsp->name, list)) - count_lazy++; - list = next; - /* Stop only if limit reached and CPU has something to do. */ - if (++count >= bl && + rhp = rcu_cblist_dequeue(&rcl); + for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { + debug_rcu_head_unqueue(rhp); + if (__rcu_reclaim(rsp->name, rhp)) + rcu_cblist_dequeued_lazy(&rcl); + /* + * Stop only if limit reached and CPU has something to do. + * Note: The rcl structure counts down from zero. + */ + if (-rcu_cblist_n_cbs(&rcl) >= bl && (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; } local_irq_save(flags); - trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), - is_idle_task(current), + count = -rcu_cblist_n_cbs(&rcl); + trace_rcu_batch_end(rsp->name, count, !rcu_cblist_empty(&rcl), + need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); - /* Update count, and requeue any remaining callbacks. */ - if (list != NULL) { - *tail = rdp->nxtlist; - rdp->nxtlist = list; - for (i = 0; i < RCU_NEXT_SIZE; i++) - if (&rdp->nxtlist == rdp->nxttail[i]) - rdp->nxttail[i] = tail; - else - break; - } + /* Update counts and requeue any remaining callbacks. */ + rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); smp_mb(); /* List handling before counting for rcu_barrier(). */ - rdp->qlen_lazy -= count_lazy; - WRITE_ONCE(rdp->qlen, rdp->qlen - count); rdp->n_cbs_invoked += count; + rcu_segcblist_insert_count(&rdp->cblist, &rcl); /* Reinstate batch limit if we have worked down the excess. */ - if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) + count = rcu_segcblist_n_cbs(&rdp->cblist); + if (rdp->blimit == LONG_MAX && count <= qlowmark) rdp->blimit = blimit; /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ - if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + if (count == 0 && rdp->qlen_last_fqs_check != 0) { rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; - } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) - rdp->qlen_last_fqs_check = rdp->qlen; - WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); + } else if (count < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = count; + WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); local_irq_restore(flags); /* Re-invoke RCU core processing if there are callbacks remaining. */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_core(); } @@ -3087,7 +2996,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - WARN_ON_ONCE(rdp->beenonline == 0); + WARN_ON_ONCE(!rdp->beenonline); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); @@ -3105,7 +3014,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) } /* If there are callbacks ready, invoke them. */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_callbacks(rsp, rdp); /* Do any needed deferred wakeups of rcuo kthreads. */ @@ -3177,7 +3086,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, * invoking force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. 
*/ - if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > + rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ note_gp_changes(rsp, rdp); @@ -3195,10 +3105,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) + rcu_segcblist_first_pend_cb(&rdp->cblist) != head) force_quiescent_state(rsp); rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; + rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } } } @@ -3238,7 +3148,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { + if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { int offline; if (cpu != -1) @@ -3257,23 +3167,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, */ BUG_ON(cpu != -1); WARN_ON_ONCE(!rcu_is_watching()); - if (!likely(rdp->nxtlist)) - init_default_callback_list(rdp); + if (rcu_segcblist_empty(&rdp->cblist)) + rcu_segcblist_init(&rdp->cblist); } - WRITE_ONCE(rdp->qlen, rdp->qlen + 1); - if (lazy) - rdp->qlen_lazy++; - else + rcu_segcblist_enqueue(&rdp->cblist, head, lazy); + if (!lazy) rcu_idle_count_callbacks_posted(); - smp_mb(); /* Count before adding callback for rcu_barrier(). */ - *rdp->nxttail[RCU_NEXT_TAIL] = head; - rdp->nxttail[RCU_NEXT_TAIL] = &head->next; if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen_lazy, rdp->qlen); + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist)); else - trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); + trace_rcu_callback(rsp->name, head, + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ __call_rcu_core(rsp, rdp, head, flags); @@ -3519,41 +3427,6 @@ void cond_synchronize_sched(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_sched); -/* Adjust sequence number for start of update-side operation. */ -static void rcu_seq_start(unsigned long *sp) -{ - WRITE_ONCE(*sp, *sp + 1); - smp_mb(); /* Ensure update-side operation after counter increment. */ - WARN_ON_ONCE(!(*sp & 0x1)); -} - -/* Adjust sequence number for end of update-side operation. */ -static void rcu_seq_end(unsigned long *sp) -{ - smp_mb(); /* Ensure update-side operation before counter increment. */ - WRITE_ONCE(*sp, *sp + 1); - WARN_ON_ONCE(*sp & 0x1); -} - -/* Take a snapshot of the update side's sequence number. */ -static unsigned long rcu_seq_snap(unsigned long *sp) -{ - unsigned long s; - - s = (READ_ONCE(*sp) + 3) & ~0x1; - smp_mb(); /* Above access must not bleed into critical section. */ - return s; -} - -/* - * Given a snapshot from rcu_seq_snap(), determine whether or not a - * full update-side operation has occurred. - */ -static bool rcu_seq_done(unsigned long *sp, unsigned long s) -{ - return ULONG_CMP_GE(READ_ONCE(*sp), s); -} - /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -3577,7 +3450,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? 
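
As an aside: the rcu_seq_start()/rcu_seq_end()/rcu_seq_snap()/rcu_seq_done() helpers deleted just above are evidently being moved to shared code rather than dropped, since the new srcutree.c hunks earlier in this patch already call them. Their counter discipline (odd while an update is in flight, even otherwise; a snapshot names the counter value at which a full update is guaranteed to have elapsed) can be exercised in user space as sketched below. The memory barriers and READ_ONCE()/WRITE_ONCE() of the real helpers are deliberately omitted from this single-threaded model.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

static void seq_start(unsigned long *sp)
{
	(*sp)++;			/* odd: update side now active */
}

static void seq_end(unsigned long *sp)
{
	(*sp)++;			/* even: update side finished */
}

/* Counter value that must be reached before a full update has elapsed. */
static unsigned long seq_snap(unsigned long *sp)
{
	return (*sp + 3) & ~0x1UL;
}

static bool seq_done(unsigned long *sp, unsigned long s)
{
	return ULONG_CMP_GE(*sp, s);
}

int main(void)
{
	unsigned long gp_seq = 0;
	unsigned long s = seq_snap(&gp_seq);	/* needs the counter to hit 2 */

	printf("done before any GP?  %d\n", seq_done(&gp_seq, s));	/* 0 */
	seq_start(&gp_seq);
	printf("done mid-GP?         %d\n", seq_done(&gp_seq, s));	/* 0 */
	seq_end(&gp_seq);
	printf("done after one GP?   %d\n", seq_done(&gp_seq, s));	/* 1 */
	return 0;
}
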
*/ if (rcu_scheduler_fully_active && rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { rdp->n_rp_report_qs++; @@ -3585,7 +3458,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Does this CPU have callbacks ready to invoke? */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) { + if (rcu_segcblist_ready_cbs(&rdp->cblist)) { rdp->n_rp_cb_ready++; return 1; } @@ -3649,10 +3522,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (!rdp->nxtlist) + if (rcu_segcblist_empty(&rdp->cblist)) continue; hc = true; - if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { + if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) { al = false; break; } @@ -3761,7 +3634,7 @@ static void _rcu_barrier(struct rcu_state *rsp) __call_rcu(&rdp->barrier_head, rcu_barrier_callback, rsp, cpu, 0); } - } else if (READ_ONCE(rdp->qlen)) { + } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); @@ -3870,8 +3743,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - if (!rdp->nxtlist) - init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ + if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ + !init_nocb_callback_list(rdp)) + rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_sysidle_init_percpu_data(rdp->dynticks); rcu_dynticks_eqs_online(); @@ -3890,12 +3764,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->cpu_no_qs.b.norm = true; - rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); + rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * Invoked early in the CPU-online process, when pretty much all + * services are available. The incoming CPU is not present. + */ int rcutree_prepare_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3909,6 +3787,9 @@ int rcutree_prepare_cpu(unsigned int cpu) return 0; } +/* + * Update RCU priority boot kthread affinity for CPU-hotplug changes. + */ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) { struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); @@ -3916,20 +3797,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); } +/* + * Near the end of the CPU-online process. Pretty much all services + * enabled, and the CPU is now very much alive. + */ int rcutree_online_cpu(unsigned int cpu) { sync_sched_exp_online_cleanup(cpu); rcutree_affinity_setting(cpu, -1); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_online_cpu(cpu); return 0; } +/* + * Near the beginning of the process. The CPU is still very much alive + * with pretty much all services enabled. 
+ */ int rcutree_offline_cpu(unsigned int cpu) { rcutree_affinity_setting(cpu, cpu); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_offline_cpu(cpu); return 0; } - +/* + * Near the end of the offline process. We do only tracing here. + */ int rcutree_dying_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3939,6 +3834,9 @@ int rcutree_dying_cpu(unsigned int cpu) return 0; } +/* + * The outgoing CPU is gone and we are running elsewhere. + */ int rcutree_dead_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3956,6 +3854,10 @@ int rcutree_dead_cpu(unsigned int cpu) * incoming CPUs are not allowed to use RCU read-side critical sections * until this function is called. Failing to observe this restriction * will result in lockdep splats. + * + * Note that this function is special in that it is invoked directly + * from the incoming CPU rather than from the cpuhp_step mechanism. + * This is because this function must be invoked at a precise location. */ void rcu_cpu_starting(unsigned int cpu) { @@ -3981,9 +3883,6 @@ void rcu_cpu_starting(unsigned int cpu) * The CPU is exiting the idle loop into the arch_cpu_idle_dead() * function. We now remove it from the rcu_node tree's ->qsmaskinit * bit masks. - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit - * bit masks. */ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) { @@ -3999,6 +3898,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * The outgoing function has no further need of RCU, so remove it from + * the list of CPUs that RCU must track. + * + * Note that this function is special in that it is invoked directly + * from the outgoing CPU rather than from the cpuhp_step mechanism. + * This is because this function must be invoked at a precise location. + */ void rcu_report_dead(unsigned int cpu) { struct rcu_state *rsp; @@ -4013,6 +3920,10 @@ void rcu_report_dead(unsigned int cpu) } #endif +/* + * On non-huge systems, use expedited RCU grace periods to make suspend + * and hibernation run faster. + */ static int rcu_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -4083,7 +3994,7 @@ early_initcall(rcu_spawn_gp_kthread); * task is booting the system, and such primitives are no-ops). After this * function is called, any synchronous grace-period primitives are run as * expedited, with the requesting task driving the grace period forward. - * A later core_initcall() rcu_exp_runtime_mode() will switch to full + * A later core_initcall() rcu_set_runtime_mode() will switch to full * runtime RCU functionality. */ void rcu_scheduler_starting(void) @@ -4096,31 +4007,6 @@ void rcu_scheduler_starting(void) } /* - * Compute the per-level fanout, either using the exact fanout specified - * or balancing the tree, depending on the rcu_fanout_exact boot parameter. - */ -static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) -{ - int i; - - if (rcu_fanout_exact) { - levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; - for (i = rcu_num_lvls - 2; i >= 0; i--) - levelspread[i] = RCU_FANOUT; - } else { - int ccur; - int cprv; - - cprv = nr_cpu_ids; - for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = levelcnt[i]; - levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; - } - } -} - -/* * Helper function for rcu_init() that initializes one rcu_state structure. 
*/ static void __init rcu_init_one(struct rcu_state *rsp) @@ -4129,9 +4015,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) static const char * const fqs[] = RCU_FQS_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static u8 fl_mask = 0x1; - int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ int cpustride = 1; int i; @@ -4146,20 +4030,16 @@ static void __init rcu_init_one(struct rcu_state *rsp) /* Initialize the level-tracking arrays. */ - for (i = 0; i < rcu_num_lvls; i++) - levelcnt[i] = num_rcu_lvl[i]; for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; - rcu_init_levelspread(levelspread, levelcnt); - rsp->flavor_mask = fl_mask; - fl_mask <<= 1; + rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_init_levelspread(levelspread, num_rcu_lvl); /* Initialize the elements themselves, starting from the leaves. */ for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= levelspread[i]; rnp = rsp->level[i]; - for (j = 0; j < levelcnt[i]; j++, rnp++) { + for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) { raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), &rcu_node_class[i], buf[i]); @@ -4332,6 +4212,8 @@ void __init rcu_init(void) for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); rcu_cpu_starting(cpu); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_online_cpu(cpu); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ec62a05bfdb3..0e598ab08fea 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -30,80 +30,8 @@ #include <linux/seqlock.h> #include <linux/swait.h> #include <linux/stop_machine.h> - -/* - * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and - * CONFIG_RCU_FANOUT_LEAF. - * In theory, it should be possible to add more levels straightforwardly. - * In practice, this did work well going from three levels to four. - * Of course, your mileage may vary. 
- */ - -#ifdef CONFIG_RCU_FANOUT -#define RCU_FANOUT CONFIG_RCU_FANOUT -#else /* #ifdef CONFIG_RCU_FANOUT */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT 64 -# else -# define RCU_FANOUT 32 -# endif -#endif /* #else #ifdef CONFIG_RCU_FANOUT */ - -#ifdef CONFIG_RCU_FANOUT_LEAF -#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF -#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT_LEAF 64 -# else -# define RCU_FANOUT_LEAF 32 -# endif -#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ - -#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) -#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) -#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) -#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT_1 -# define RCU_NUM_LVLS 1 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_NODES NUM_RCU_LVL_0 -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } -# define RCU_NODE_NAME_INIT { "rcu_node_0" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -#elif NR_CPUS <= RCU_FANOUT_2 -# define RCU_NUM_LVLS 2 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -#elif NR_CPUS <= RCU_FANOUT_3 -# define RCU_NUM_LVLS 3 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -#elif NR_CPUS <= RCU_FANOUT_4 -# define RCU_NUM_LVLS 4 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -#else -# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ - -extern int rcu_num_lvls; -extern int rcu_num_nodes; +#include <linux/rcu_segcblist.h> +#include <linux/rcu_node_tree.h> /* * Dynticks per-CPU state. @@ -113,6 +41,9 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ + bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ + unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ + bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; /* irq/process nesting level from idle. */ @@ -262,41 +193,6 @@ struct rcu_node { #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) /* - * Do a full breadth-first scan of the rcu_node structures for the - * specified rcu_state structure. 
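
As an aside: the NUM_RCU_LVL_*/NUM_RCU_NODES macros removed here (replaced just above by an include of <linux/rcu_node_tree.h>) encode a simple geometric series: one root, then DIV_ROUND_UP(NR_CPUS, fanout^k * leaf-fanout) nodes per level, capped at four levels. The same shape can be computed at run time with a few lines of user-space C, which can be handy when sanity-checking a configuration; the names and the 4096/64/16 defaults below are purely illustrative.

#include <stdio.h>
#include <stdlib.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(int argc, char **argv)
{
	long nr_cpus = argc > 1 ? atol(argv[1]) : 4096;
	long fanout = argc > 2 ? atol(argv[2]) : 64;	/* RCU_FANOUT */
	long leaf = argc > 3 ? atol(argv[3]) : 16;	/* RCU_FANOUT_LEAF */
	long cap = leaf;	/* CPUs coverable by a tree of this depth */
	long levels = 1;
	long i, nodes, total = 0;

	if (nr_cpus < 1 || fanout < 2 || leaf < 2)
		return 1;
	while (cap < nr_cpus) {		/* mirrors the NR_CPUS <= RCU_FANOUT_n tests */
		cap *= fanout;
		levels++;
	}
	printf("%ld CPUs, fanout %ld, leaf fanout %ld -> %ld level(s)\n",
	       nr_cpus, fanout, leaf, levels);
	for (i = 0; i < levels; i++) {
		/* Level i holds one node per 'cap' CPUs, as in NUM_RCU_LVL_i. */
		nodes = DIV_ROUND_UP(nr_cpus, cap);
		total += nodes;
		printf("  level %ld: %ld node(s)\n", i, nodes);
		cap /= fanout;
	}
	printf("  NUM_RCU_NODES = %ld\n", total);
	return 0;
}
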
- */ -#define rcu_for_each_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* - * Do a breadth-first scan of the non-leaf rcu_node structures for the - * specified rcu_state structure. Note that if there is a singleton - * rcu_node tree with but one rcu_node structure, this loop is a no-op. - */ -#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) - -/* - * Scan the leaves of the rcu_node hierarchy for the specified rcu_state - * structure. Note that if there is a singleton rcu_node tree with but - * one rcu_node structure, this loop -will- visit the rcu_node structure. - * It is still a leaf node, even if it is also the root node. - */ -#define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* - * Iterate over all possible CPUs in a leaf RCU node. - */ -#define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ - cpu <= rnp->grphi; \ - cpu = cpumask_next((cpu), cpu_possible_mask)) - -/* * Union to allow "aggregate OR" operation on the need for a quiescent * state by the normal and expedited grace periods. */ @@ -336,34 +232,9 @@ struct rcu_data { /* period it is aware of. */ /* 2) batch handling */ - /* - * If nxtlist is not NULL, it is partitioned as follows. - * Any of the partitions might be empty, in which case the - * pointer to that partition will be equal to the pointer for - * the following partition. When the list is empty, all of - * the nxttail elements point to the ->nxtlist pointer itself, - * which in that case is NULL. - * - * [nxtlist, *nxttail[RCU_DONE_TAIL]): - * Entries that batch # <= ->completed - * The grace period for these entries has completed, and - * the other grace-period-completed entries may be moved - * here temporarily in rcu_process_callbacks(). - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * Note that the value of *nxttail[RCU_NEXT_TAIL] will - * always be NULL, as this is the end of the list. - */ - struct rcu_head *nxtlist; - struct rcu_head **nxttail[RCU_NEXT_SIZE]; - unsigned long nxtcompleted[RCU_NEXT_SIZE]; - /* grace periods for sublists. */ - long qlen_lazy; /* # of lazy queued callbacks */ - long qlen; /* # of queued callbacks, incl lazy */ + struct rcu_segcblist cblist; /* Segmented callback list, with */ + /* different callbacks waiting for */ + /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ @@ -482,7 +353,6 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS + 1]; /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ - u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ call_rcu_func_t call; /* call_rcu() flavor. */ int ncpus; /* # CPUs seen so far. */ @@ -502,14 +372,11 @@ struct rcu_state { raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; /* Protect following fields. 
*/ - struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ + struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ /* need a grace period. */ - struct rcu_head **orphan_nxttail; /* Tail of above. */ - struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ + struct rcu_cblist orphan_done; /* Orphaned callbacks that */ /* are ready to invoke. */ - struct rcu_head **orphan_donetail; /* Tail of above. */ - long qlen_lazy; /* Number of lazy callbacks. */ - long qlen; /* Total number of callbacks. */ + /* (Contains counts.) */ /* End of fields guarded by orphan_lock. */ struct mutex barrier_mutex; /* Guards barrier fields. */ @@ -596,6 +463,7 @@ extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ int rcu_dynticks_snap(struct rcu_dynticks *rdtp); +bool rcu_eqs_special_set(int cpu); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -673,6 +541,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); +#ifdef CONFIG_SRCU +void srcu_online_cpu(unsigned int cpu); +void srcu_offline_cpu(unsigned int cpu); +#else /* #ifdef CONFIG_SRCU */ +void srcu_online_cpu(unsigned int cpu) { } +void srcu_offline_cpu(unsigned int cpu) { } +#endif /* #else #ifdef CONFIG_SRCU */ + #endif /* #ifndef RCU_TREE_NONCORE */ #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a7b639ccd46e..e513b4ab1197 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, rnp->grphi, TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(rsp, &rdp->exp_workdone2, s)); return true; @@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data) return; } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); resched_cpu(smp_processor_id()); } @@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) rnp->exp_seq_rq = s; spin_unlock(&rnp->exp_lock); } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + smp_mb(); /* All above changes before wakeup. */ + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]); } trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); mutex_unlock(&rsp->exp_wake_mutex); @@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, /* Wait for expedited grace period to complete. */ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); rnp = rcu_get_root(rsp); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone0, s)); + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], + sync_exp_work_done(rsp, &rdp->exp_workdone0, s)); + smp_mb(); /* Workqueue actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rsp->exp_mutex); @@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void) EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ - -/* - * Switch to run-time mode once Tree RCU has fully initialized. 
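The wait_event() and wake_up_all() hunks above switch from "(s >> 1) & 0x3" to "rcu_seq_ctr(s) & 0x3": the helper abstracts how many low-order state bits the expedited sequence counter carries, so the counter proper is shifted out before selecting one of the four per-node wait queues. A user-space sketch of the idea follows; the RCU_SEQ_CTR_SHIFT value of 2 is an assumption for illustration and may differ from the kernel's actual constant.

#include <stdio.h>

#define RCU_SEQ_CTR_SHIFT	2	/* assumed number of low-order state bits */

static unsigned long rcu_seq_ctr(unsigned long s)
{
	return s >> RCU_SEQ_CTR_SHIFT;	/* grace-period count, state stripped */
}

int main(void)
{
	unsigned long s;

	/* Four wait queues are reused round-robin, one per recent GP. */
	for (s = 0; s < 6UL << RCU_SEQ_CTR_SHIFT; s += 1UL << RCU_SEQ_CTR_SHIFT)
		printf("seq %#lx -> exp_wq[%lu]\n", s, rcu_seq_ctr(s) & 0x3);
	return 0;
}

Indexing with the raw ">> 1" would conflate state bits with the counter once more than one state bit is in use, which is what the helper guards against.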
- */ -static int __init rcu_exp_runtime_mode(void) -{ - rcu_test_sync_prims(); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; - rcu_test_sync_prims(); - return 0; -} -core_initcall(rcu_exp_runtime_mode); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a62a8f1caac..7f1d677a2a25 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1350,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) */ if ((rdp->completed != rnp->completed || unlikely(READ_ONCE(rdp->gpwrap))) && - rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) + rcu_segcblist_pend_cbs(&rdp->cblist)) note_gp_changes(rsp, rdp); - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) cbs_ready = true; } return cbs_ready; @@ -1461,7 +1461,7 @@ static void rcu_prepare_for_idle(void) rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (!*rdp->nxttail[RCU_DONE_TAIL]) + if (rcu_segcblist_pend_cbs(&rdp->cblist)) continue; rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ @@ -1529,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused) for_each_rcu_flavor(rsp) { rdp = raw_cpu_ptr(rsp->rda); - if (rdp->qlen_lazy != 0) { + if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) { atomic_inc(&oom_callback_count); rsp->call(&rdp->oom_head, rcu_oom_callback); } @@ -1709,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { - rcu_nocb_poll = 1; + rcu_nocb_poll = true; return 0; } early_param("rcu_nocb_poll", parse_rcu_nocb_poll); @@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmptyIsDeferred")); } @@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvfIsDeferred")); } @@ -1930,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long flags) { - long ql = rsp->qlen; - long qll = rsp->qlen_lazy; + long ql = rcu_cblist_n_cbs(&rsp->orphan_done); + long qll = rcu_cblist_n_lazy_cbs(&rsp->orphan_done); /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ if (!rcu_is_nocb_cpu(smp_processor_id())) return false; - rsp->qlen = 0; - rsp->qlen_lazy = 0; /* First, enqueue the donelist, if any. This preserves CB ordering. 
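The smp_store_release() calls added above pair the ->rcu_urgent_qs store with a later acquire-side read, so that whoever observes the urgent-quiescent-state request also observes the ->nocb_defer_wakeup (or .exp) state written just before it. Below is a stand-alone C11 sketch of the same release/acquire pairing; the variable names merely mirror the kernel fields, and the kernel uses smp_store_release()/smp_load_acquire() rather than <stdatomic.h>.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int nocb_defer_wakeup;		/* payload written first */
static atomic_bool rcu_urgent_qs;	/* flag published with release semantics */

static void *requester(void *arg)
{
	nocb_defer_wakeup = 1;		/* plain store: the state being published */
	/* Release: orders the payload store before the flag store. */
	atomic_store_explicit(&rcu_urgent_qs, 1, memory_order_release);
	return NULL;
}

static void *responder(void *arg)
{
	/* Acquire: once the flag is observed, the payload store is too. */
	while (!atomic_load_explicit(&rcu_urgent_qs, memory_order_acquire))
		;
	printf("deferred wakeup state = %d\n", nocb_defer_wakeup);	/* prints 1 */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, requester, NULL);
	pthread_create(&b, NULL, responder, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}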
*/ - if (rsp->orphan_donelist != NULL) { - __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, - rsp->orphan_donetail, ql, qll, flags); - ql = qll = 0; - rsp->orphan_donelist = NULL; - rsp->orphan_donetail = &rsp->orphan_donelist; + if (!rcu_cblist_empty(&rsp->orphan_done)) { + __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), + rcu_cblist_tail(&rsp->orphan_done), + ql, qll, flags); } - if (rsp->orphan_nxtlist != NULL) { - __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, - rsp->orphan_nxttail, ql, qll, flags); - ql = qll = 0; - rsp->orphan_nxtlist = NULL; - rsp->orphan_nxttail = &rsp->orphan_nxtlist; + if (!rcu_cblist_empty(&rsp->orphan_pend)) { + __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), + rcu_cblist_tail(&rsp->orphan_pend), + ql, qll, flags); } + rcu_cblist_init(&rsp->orphan_done); + rcu_cblist_init(&rsp->orphan_pend); return true; } @@ -2395,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) return false; /* If there are early-boot callbacks, move them to nocb lists. */ - if (rdp->nxtlist) { - rdp->nocb_head = rdp->nxtlist; - rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; - atomic_long_set(&rdp->nocb_q_count, rdp->qlen); - atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); - rdp->nxtlist = NULL; - rdp->qlen = 0; - rdp->qlen_lazy = 0; + if (!rcu_segcblist_empty(&rdp->cblist)) { + rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); + rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); + atomic_long_set(&rdp->nocb_q_count, + rcu_segcblist_n_cbs(&rdp->cblist)); + atomic_long_set(&rdp->nocb_q_count_lazy, + rcu_segcblist_n_lazy_cbs(&rdp->cblist)); + rcu_segcblist_init(&rdp->cblist); } - rdp->nxttail[RCU_NEXT_TAIL] = NULL; + rcu_segcblist_disable(&rdp->cblist); return true; } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8751a748499a..30c5bf89ee58 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -41,11 +41,11 @@ #include <linux/mutex.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/prefetch.h> #define RCU_TREE_NONCORE #include "tree.h" - -DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +#include "rcu.h" static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) @@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) cpu_is_offline(rdp->cpu) ? '!' 
: ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), rdp->cpu_no_qs.b.norm, - rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", rcu_dynticks_snap(rdp->dynticks), @@ -130,17 +130,15 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); seq_printf(m, " of=%lu", rdp->offline_fqs); rcu_nocb_q_lengths(rdp, &ql, &qll); - qll += rdp->qlen_lazy; - ql += rdp->qlen; + qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist); + ql += rcu_segcblist_n_cbs(&rdp->cblist); seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", qll, ql, - ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != - rdp->nxttail[RCU_NEXT_TAIL]], - ".R"[rdp->nxttail[RCU_WAIT_TAIL] != - rdp->nxttail[RCU_NEXT_READY_TAIL]], - ".W"[rdp->nxttail[RCU_DONE_TAIL] != - rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); + ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)], + ".R"[!rcu_segcblist_segempty(&rdp->cblist, + RCU_NEXT_READY_TAIL)], + ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)], + ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]); #ifdef CONFIG_RCU_BOOST seq_printf(m, " kt=%d/%c ktl=%x", per_cpu(rcu_cpu_has_work, rdp->cpu), @@ -278,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); + READ_ONCE(rsp->n_force_qs_lh), + rcu_cblist_n_lazy_cbs(&rsp->orphan_done), + rcu_cblist_n_cbs(&rsp->orphan_done)); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 55c8530316c7..273e869ca21d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); * non-expedited counterparts? Intended for use within RCU. Note * that if the user specifies both rcu_expedited and rcu_normal, then * rcu_normal wins. (Except during the time period during boot from - * when the first task is spawned until the rcu_exp_runtime_mode() + * when the first task is spawned until the rcu_set_runtime_mode() * core_initcall() is invoked, at which point everything is expedited.) */ bool rcu_gp_is_normal(void) @@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void) #endif /* #ifndef CONFIG_TINY_RCU */ +/* + * Test each non-SRCU synchronous grace-period wait API. This is + * useful just after a change in mode for these primitives, and + * during early boot. + */ +void rcu_test_sync_prims(void) +{ + if (!IS_ENABLED(CONFIG_PROVE_RCU)) + return; + synchronize_rcu(); + synchronize_rcu_bh(); + synchronize_sched(); + synchronize_rcu_expedited(); + synchronize_rcu_bh_expedited(); + synchronize_sched_expedited(); +} + +#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) + +/* + * Switch to run-time mode once RCU has fully initialized. 
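The qs=%c%c%c%c output above leans on the C idiom of indexing a two-character string literal: ".N"[cond] evaluates to '.' when cond is 0 and to 'N' when it is 1, so each callback segment contributes one flag character. A tiny illustration, with made-up values:

#include <stdio.h>

int main(void)
{
	int next_nonempty = 1, ready_nonempty = 0;	/* arbitrary example state */

	/* "xY"[flag] picks 'x' for 0 and 'Y' for 1, the same trick as the
	 * ".N"/".R"/".W"/".D" arguments in print_one_rcu_data(). */
	printf("qs=%c%c\n", ".N"[next_nonempty], ".R"[ready_nonempty]);
	return 0;
}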
+ */ +static int __init rcu_set_runtime_mode(void) +{ + rcu_test_sync_prims(); + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_test_sync_prims(); + return 0; +} +core_initcall(rcu_set_runtime_mode); + +#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ + #ifdef CONFIG_PREEMPT_RCU /* @@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t, put_task_struct(t); return; } + rcu_request_urgent_qs_task(t); if (!needreport) return; if (*firstreport) { @@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ -/* - * Test each non-SRCU synchronous grace-period wait API. This is - * useful just after a change in mode for these primitives, and - * during early boot. - */ -void rcu_test_sync_prims(void) -{ - if (!IS_ENABLED(CONFIG_PROVE_RCU)) - return; - synchronize_rcu(); - synchronize_rcu_bh(); - synchronize_sched(); - synchronize_rcu_expedited(); - synchronize_rcu_bh_expedited(); - synchronize_sched_expedited(); -} - #ifdef CONFIG_PROVE_RCU /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b31fc05a0f1..2adf7b6c04e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3378,7 +3378,7 @@ static void __sched notrace __schedule(bool preempt) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(); + rcu_note_context_switch(preempt); /* * Make sure that signal_pending_state()->signal_pending() below diff --git a/kernel/signal.c b/kernel/signal.c index 7e59ebc2c25e..6df5f72158e4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, } /* * This sighand can be already freed and even reused, but - * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which + * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which * initializes ->siglock: this slab can't go away, it has * the same object type, ->siglock can't be reinitialized. * diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 98b27195e38b..4b20061102f6 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -413,7 +413,7 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, *size += sizeof(struct kasan_alloc_meta); /* Add free meta. 
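The __lock_task_sighand() comment above captures the SLAB_TYPESAFE_BY_RCU contract: an object may be freed and reused while a reader still holds a pointer to it, so anything such a reader relies on (here ->siglock) must be initialized once by the cache constructor rather than on every allocation. A hedged kernel-style sketch of setting up such a cache follows; struct foo, foo_ctor() and foo_cachep are invented for illustration.

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/errno.h>

struct foo {
	spinlock_t lock;	/* readers may take this even across a reuse */
	int key;
};

static struct kmem_cache *foo_cachep;

/* Runs when a slab page is populated, not on every kmem_cache_alloc(). */
static void foo_ctor(void *obj)
{
	struct foo *f = obj;

	spin_lock_init(&f->lock);
}

static int __init foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
				       SLAB_TYPESAFE_BY_RCU, foo_ctor);
	return foo_cachep ? 0 : -ENOMEM;
}

Because the memory stays type-stable across a grace period, allocation paths must not zero the object wholesale (see the kmem_cache_zalloc() warning in the conntrack hunk later in this patch); only fields a racing reader never touches may be reinitialized per allocation.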
*/ - if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || + if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || cache->object_size < sizeof(struct kasan_free_meta)) { cache->kasan_info.free_meta_offset = *size; *size += sizeof(struct kasan_free_meta); @@ -561,7 +561,7 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return; kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); @@ -572,7 +572,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) s8 shadow_byte; /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 5bf191756a4a..2d5959c5f7c5 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -95,7 +95,7 @@ void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) { /* TODO: RCU freeing is unsupported for now; hide false positives. */ - if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) + if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU)) kmemcheck_mark_freed(object, size); } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index a7652acd2ab9..54ca54562928 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -21,7 +21,7 @@ #include <linux/slab.h> /* global SRCU for all MMs */ -static struct srcu_struct srcu; +DEFINE_STATIC_SRCU(srcu); /* * This function allows mmu_notifier::release callback to delay a call to @@ -252,12 +252,6 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); - /* - * Verify that mmu_notifier_init() already run and the global srcu is - * initialized. - */ - BUG_ON(!srcu.per_cpu_ref); - ret = -ENOMEM; mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (unlikely(!mmu_notifier_mm)) @@ -406,9 +400,3 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); - -static int __init mmu_notifier_init(void) -{ - return init_srcu_struct(&srcu); -} -subsys_initcall(mmu_notifier_init); diff --git a/mm/rmap.c b/mm/rmap.c index 49ed681ccc7b..8ffd59df8a3f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -430,7 +430,7 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, + 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, anon_vma_ctor); anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC|SLAB_ACCOUNT); @@ -481,7 +481,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) * If this page is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: - * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() + * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). 
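The mmu_notifier hunk above replaces a run-time-initialized srcu_struct with DEFINE_STATIC_SRCU(), which is why both the BUG_ON(!srcu.per_cpu_ref) check and the mmu_notifier_init() subsys_initcall can be dropped: the structure is usable from the first instruction. A minimal sketch of the same pattern in an arbitrary module follows; my_srcu and the surrounding functions are placeholders.

#include <linux/srcu.h>

DEFINE_STATIC_SRCU(my_srcu);	/* usable immediately, no init_srcu_struct() */

static int my_read_side(void)
{
	int idx, ret;

	idx = srcu_read_lock(&my_srcu);
	ret = 0;		/* ... dereference SRCU-protected state ... */
	srcu_read_unlock(&my_srcu, idx);
	return ret;
}

static void my_update_side(void)
{
	/* ... unlink the old state ... */
	synchronize_srcu(&my_srcu);	/* wait for readers of the old state */
	/* ... free the old state ... */
}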
*/ if (!page_mapped(page)) { diff --git a/mm/slab.c b/mm/slab.c index 807d86c76908..93c827864862 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1728,7 +1728,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) freelist = page->freelist; slab_destroy_debugcheck(cachep, page); - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&page->rcu_head, kmem_rcu_free); else kmem_freepages(cachep, page); @@ -1924,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, cachep->num = 0; - if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) + if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) return false; left = calculate_slab_order(cachep, size, @@ -2030,7 +2030,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2 * sizeof(unsigned long long))) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; - if (!(flags & SLAB_DESTROY_BY_RCU)) + if (!(flags & SLAB_TYPESAFE_BY_RCU)) flags |= SLAB_POISON; #endif #endif diff --git a/mm/slab.h b/mm/slab.h index 65e7c3fcac72..9cfcf099709c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -126,7 +126,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, /* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) #if defined(CONFIG_DEBUG_SLAB) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) @@ -415,7 +415,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) * back there or track user information then we can * only use the space before that information. */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) return s->inuse; /* * Else we can use all the padding etc for the allocation diff --git a/mm/slab_common.c b/mm/slab_common.c index 09d0e849b07f..01a0fe2eb332 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -39,7 +39,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, * Set of flags that will prevent slab merging */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ @@ -500,7 +500,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) struct kmem_cache *s, *s2; /* - * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the + * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the * @slab_caches_to_rcu_destroy list. 
The slab pages are freed * through RCU and and the associated kmem_cache are dereferenced * while freeing the pages, so the kmem_caches should be freed only @@ -537,7 +537,7 @@ static int shutdown_cache(struct kmem_cache *s) memcg_unlink_cache(s); list_del(&s->list); - if (s->flags & SLAB_DESTROY_BY_RCU) { + if (s->flags & SLAB_TYPESAFE_BY_RCU) { list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { diff --git a/mm/slob.c b/mm/slob.c index eac04d4357ec..1bae78d71096 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -126,7 +126,7 @@ static inline void clear_slob_page_free(struct page *sp) /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which - * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free + * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free * the block using call_rcu. */ struct slob_rcu { @@ -524,7 +524,7 @@ EXPORT_SYMBOL(ksize); int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - if (flags & SLAB_DESTROY_BY_RCU) { + if (flags & SLAB_TYPESAFE_BY_RCU) { /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } @@ -598,7 +598,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { kmemleak_free_recursive(b, c->flags); - if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { + if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); slob_rcu->size = c->size; diff --git a/mm/slub.c b/mm/slub.c index 7f4bc7027ed5..57e5156f02be 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1687,7 +1687,7 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct page *page) { - if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { struct rcu_head *head; if (need_reserve_slab_rcu) { @@ -2963,7 +2963,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, * slab_free_freelist_hook() could have put the items into quarantine. * If so, no need to free them. */ - if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) + if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) return; do_slab_free(s, page, head, tail, cnt, addr); } @@ -3433,7 +3433,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the slab may touch the object after free or before allocation * then we should never poison the object itself. 
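The slob and slub hunks above both defer the actual free of an RCU-protected slab through call_rcu(), using an rcu_head embedded in (or appended to) the memory being freed. A hedged sketch of that generic pattern follows; struct blk and its helpers are invented for illustration and are not taken from the patch.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct blk {
	int payload;
	struct rcu_head rcu;	/* reused as free-path bookkeeping */
};

static void blk_rcu_free(struct rcu_head *head)
{
	struct blk *b = container_of(head, struct blk, rcu);

	kfree(b);		/* no reader can still hold a reference now */
}

static void blk_release(struct blk *b)
{
	/* Readers found via RCU may still be examining *b; defer the free
	 * until a grace period has elapsed instead of freeing in place. */
	call_rcu(&b->rcu, blk_rcu_free);
}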
*/ - if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && + if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) && !s->ctor) s->flags |= __OBJECT_POISON; else @@ -3455,7 +3455,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ s->inuse = size; - if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || + if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor)) { /* * Relocate free pointer after the object if it is not @@ -3537,7 +3537,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; - if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) s->reserved = sizeof(struct rcu_head); if (!calculate_sizes(s, -1)) @@ -5042,7 +5042,7 @@ SLAB_ATTR_RO(cache_dma); static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); } SLAB_ATTR_RO(destroy_by_rcu); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 409d0cfd3447..90210a0e3888 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -950,7 +950,7 @@ static struct proto dccp_v4_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp_request_sock_ops, .twsk_prot = &dccp_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 233b57367758..b4019a5e4551 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1012,7 +1012,7 @@ static struct proto dccp_v6_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9a89b8deafae..82c89abeb989 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2398,7 +2398,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 60a5295a7de6..bdbc4327ebee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1919,7 +1919,7 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = &tcp_hashinfo, diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 06186d608a27..d096ca563054 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -142,7 +142,7 @@ static struct proto llc_proto = { .name = "LLC", .owner = THIS_MODULE, .obj_size = sizeof(struct llc_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, }; /** diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 8bc5a1bd2d45..9b02c13d258b 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -506,7 +506,7 @@ static struct sock 
*__llc_lookup_established(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_estab_match(sap, daddr, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || @@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_listener_match(sap, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 5404d0d195cc..63b6ab056370 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_dgram_match(sap, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 071b97fcbefb..fdcdac7916b2 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -914,7 +914,7 @@ static unsigned int early_drop_list(struct net *net, continue; /* kill only if still in same netns -- might have moved due to - * SLAB_DESTROY_BY_RCU rules. + * SLAB_TYPESAFE_BY_RCU rules. * * We steal the timer reference. If that fails timer has * already fired or someone else deleted it. Just drop ref @@ -1069,7 +1069,7 @@ __nf_conntrack_alloc(struct net *net, /* * Do not use kmem_cache_zalloc(), as this cache uses - * SLAB_DESTROY_BY_RCU. + * SLAB_TYPESAFE_BY_RCU. 
*/ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) @@ -1114,7 +1114,7 @@ void nf_conntrack_free(struct nf_conn *ct) struct net *net = nf_ct_net(ct); /* A freed object has refcnt == 0, that's - * the golden rule for SLAB_DESTROY_BY_RCU + * the golden rule for SLAB_TYPESAFE_BY_RCU */ NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); @@ -1878,7 +1878,7 @@ int nf_conntrack_init_start(void) nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), NFCT_INFOMASK + 1, - SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); + SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); if (!nf_conntrack_cachep) goto err_cachep; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 85837ab90e89..d34bbd6d8f38 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -101,7 +101,7 @@ struct proto smc_proto = { .unhash = smc_unhash_sk, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto); diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index ea6e373edc27..93eede4e8fbe 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -170,7 +170,7 @@ qemu_append="`identify_qemu_append "$QEMU"`" # Pull in Kconfig-fragment boot parameters boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" # Generate kernel-version-specific boot parameters -boot_args="`per_version_boot_params "$boot_args" $builddir/.config $seconds`" +boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`" if test -n "$TORTURE_BUILDONLY" then |
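The llc and conntrack lookups above all follow the same discipline for SLAB_TYPESAFE_BY_RCU objects: take a reference only via atomic_inc_not_zero(), then re-check the identity fields, because the slot may have been freed and reused for a different object of the same type in the meantime, and a freed object is recognizable only by its zero refcount. The kernel-style sketch below is illustrative; struct obj, obj_match-style key checking and the plain hlist are invented, and the real lookups above additionally use nulls-marked lists so a restart can be detected when an object migrates to another hash chain.

#include <linux/atomic.h>
#include <linux/rculist.h>

struct obj {
	struct hlist_node node;
	atomic_t refcnt;	/* 0 means "freed": the golden rule above */
	int key;
};

static struct obj *obj_lookup(struct hlist_head *head, int key)
{
	struct obj *o;

	rcu_read_lock();
again:
	hlist_for_each_entry_rcu(o, head, node) {
		if (o->key != key)
			continue;
		if (!atomic_inc_not_zero(&o->refcnt))
			goto again;	/* raced with free; slot may be reused */
		if (o->key != key) {
			/* Reused for another object: drop the reference via
			 * the object's normal put path and retry. */
			atomic_dec(&o->refcnt);
			goto again;
		}
		rcu_read_unlock();
		return o;		/* caller now owns a reference */
	}
	rcu_read_unlock();
	return NULL;
}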