From e72246748ff006ab928bc774e276e6ef5542f9c5 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 21 Jan 2014 15:36:00 -0800 Subject: locking/mutexes/mcs: Restructure the MCS lock defines and locking code into its own file We will need the MCS lock code for doing optimistic spinning for rwsem and queued rwlock. Extracting the MCS code from mutex.c and put into its own file allow us to reuse this code easily. We also inline mcs_spin_lock and mcs_spin_unlock functions for better efficiency. Note that using the smp_load_acquire/smp_store_release pair used in mcs_lock and mcs_unlock is not sufficient to form a full memory barrier across cpus for many architectures (except x86). For applications that absolutely need a full barrier across multiple cpus with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be used after mcs_lock. Reviewed-by: Paul E. McKenney Signed-off-by: Tim Chen Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1390347360.3138.63.camel@schen9-DESK Signed-off-by: Ingo Molnar --- include/linux/mcs_spinlock.h | 77 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mutex.h | 5 +-- 2 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 include/linux/mcs_spinlock.h (limited to 'include/linux') diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h new file mode 100644 index 000000000000..9578ef81940b --- /dev/null +++ b/include/linux/mcs_spinlock.h @@ -0,0 +1,77 @@ +/* + * MCS lock defines + * + * This file contains the main data structure and API definitions of MCS lock. + * + * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock + * with the desirable properties of being fair, and with each cpu trying + * to acquire the lock spinning on a local variable. + * It avoids expensive cache bouncings that common test-and-set spin-lock + * implementations incur. + */ +#ifndef __LINUX_MCS_SPINLOCK_H +#define __LINUX_MCS_SPINLOCK_H + +struct mcs_spinlock { + struct mcs_spinlock *next; + int locked; /* 1 if lock acquired */ +}; + +/* + * Note: the smp_load_acquire/smp_store_release pair is not + * sufficient to form a full memory barrier across + * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. + * For applications that need a full barrier across multiple cpus + * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be + * used after mcs_lock. + */ +static inline +void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* Lock acquired */ + node->locked = 1; + return; + } + ACCESS_ONCE(prev->next) = node; + /* + * Wait until the lock holder passes the lock down. + * Using smp_load_acquire() provides a memory barrier that + * ensures subsequent operations happen after the lock is acquired. + */ + while (!(smp_load_acquire(&node->locked))) + arch_mutex_cpu_relax(); +} + +static inline +void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (cmpxchg(lock, node, NULL) == node) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + /* + * Pass lock to next waiter. 
+ * smp_store_release() provides a memory barrier to ensure + * all operations in the critical section has been completed + * before unlocking. + */ + smp_store_release(&next->locked, 1); +} + +#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/include/linux/mutex.h b/include/linux/mutex.h index d3181936c138..c482e1d2cc49 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -46,6 +46,7 @@ * - detects multi-task circular deadlocks and prints out all affected * locks and tasks (and only those tasks) */ +struct mcs_spinlock; struct mutex { /* 1: unlocked, 0: locked, negative: locked, possible waiters */ atomic_t count; @@ -55,7 +56,7 @@ struct mutex { struct task_struct *owner; #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - void *spin_mlock; /* Spinner MCS lock */ + struct mcs_spinlock *mcs_lock; /* Spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; @@ -179,4 +180,4 @@ extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); # define arch_mutex_cpu_relax() cpu_relax() #endif -#endif +#endif /* __LINUX_MUTEX_H */ -- cgit v1.2.3 From 5faeb8adb956a5ad6579c4e309e8689943ad8294 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 21 Jan 2014 15:36:05 -0800 Subject: locking/mcs: Micro-optimize the MCS code, add extra comments Remove unnecessary operation to assign locked status to 1 if lock is acquired without contention. Lock status will not be checked by lock holder again once it is acquired and any lock contenders will not be looking at the lock holder's lock status. Make the cmpxchg(lock, node, NULL) == node check in mcs_spin_unlock() likely() as it is likely that a race did not occur most of the time. Also add in more comments describing how the local node is used in MCS locks. Reviewed-by: Paul E. McKenney Reviewed-by: Tim Chen Signed-off-by: Jason Low Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1390347365.3138.64.camel@schen9-DESK Signed-off-by: Ingo Molnar --- include/linux/mcs_spinlock.h | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h index 9578ef81940b..143fa428a857 100644 --- a/include/linux/mcs_spinlock.h +++ b/include/linux/mcs_spinlock.h @@ -25,6 +25,17 @@ struct mcs_spinlock { * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be * used after mcs_lock. */ + +/* + * In order to acquire the lock, the caller should declare a local node and + * pass a reference of the node to this function in addition to the lock. + * If the lock has already been acquired, then this will proceed to spin + * on this node->locked until the previous lock holder sets the node->locked + * in mcs_spin_unlock(). + * + * We don't inline mcs_spin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ static inline void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) { @@ -36,8 +47,14 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) prev = xchg(lock, node); if (likely(prev == NULL)) { - /* Lock acquired */ - node->locked = 1; + /* + * Lock acquired, don't need to set node->locked to 1. Threads + * only spin on its own node->locked value for lock acquisition. + * However, since this thread can immediately acquire the lock + * and does not proceed to spin on its own node->locked, this + * value won't be used. 
If a debug mode is needed to + * audit lock status, then set node->locked value here. + */ return; } ACCESS_ONCE(prev->next) = node; @@ -50,6 +67,10 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) arch_mutex_cpu_relax(); } +/* + * Releases the lock. The caller should pass in the corresponding node that + * was used to acquire the lock. + */ static inline void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) { @@ -59,7 +80,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) /* * Release the lock by setting it to NULL */ - if (cmpxchg(lock, node, NULL) == node) + if (likely(cmpxchg(lock, node, NULL) == node)) return; /* Wait until the next pointer is set */ while (!(next = ACCESS_ONCE(node->next))) -- cgit v1.2.3 From e207552e64ea053a33e856828ad7915484911d06 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 21 Jan 2014 15:36:10 -0800 Subject: locking/mcs: Allow architectures to hook in to contended paths When contended, architectures may be able to reduce the polling overhead in ways which aren't expressible using a simple relax() primitive. This patch allows architectures to hook into the mcs_{lock,unlock} functions for the contended cases only. Reviewed-by: Paul E. McKenney Signed-off-by: Will Deacon Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1390347370.3138.65.camel@schen9-DESK Signed-off-by: Ingo Molnar --- include/linux/mcs_spinlock.h | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h index 143fa428a857..e9a4d74c63dc 100644 --- a/include/linux/mcs_spinlock.h +++ b/include/linux/mcs_spinlock.h @@ -17,6 +17,28 @@ struct mcs_spinlock { int locked; /* 1 if lock acquired */ }; +#ifndef arch_mcs_spin_lock_contended +/* + * Using smp_load_acquire() provides a memory barrier that ensures + * subsequent operations happen after the lock is acquired. + */ +#define arch_mcs_spin_lock_contended(l) \ +do { \ + while (!(smp_load_acquire(l))) \ + arch_mutex_cpu_relax(); \ +} while (0) +#endif + +#ifndef arch_mcs_spin_unlock_contended +/* + * smp_store_release() provides a memory barrier to ensure all + * operations in the critical section has been completed before + * unlocking. + */ +#define arch_mcs_spin_unlock_contended(l) \ + smp_store_release((l), 1) +#endif + /* * Note: the smp_load_acquire/smp_store_release pair is not * sufficient to form a full memory barrier across @@ -58,13 +80,9 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) return; } ACCESS_ONCE(prev->next) = node; - /* - * Wait until the lock holder passes the lock down. - * Using smp_load_acquire() provides a memory barrier that - * ensures subsequent operations happen after the lock is acquired. - */ - while (!(smp_load_acquire(&node->locked))) - arch_mutex_cpu_relax(); + + /* Wait until the lock holder passes the lock down. */ + arch_mcs_spin_lock_contended(&node->locked); } /* @@ -86,13 +104,9 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) while (!(next = ACCESS_ONCE(node->next))) arch_mutex_cpu_relax(); } - /* - * Pass lock to next waiter. - * smp_store_release() provides a memory barrier to ensure - * all operations in the critical section has been completed - * before unlocking. - */ - smp_store_release(&next->locked, 1); + + /* Pass lock to next waiter. 
*/ + arch_mcs_spin_unlock_contended(&next->locked); } #endif /* __LINUX_MCS_SPINLOCK_H */ -- cgit v1.2.3 From 52bf84aa206cd2c2516dfa3e03b578edf8a3242f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:40 -0500 Subject: sched/numa, mm: Remove p->numa_migrate_deferred Excessive migration of pages can hurt the performance of workloads that span multiple NUMA nodes. However, it turns out that the p->numa_migrate_deferred knob is a really big hammer, which does reduce migration rates, but does not actually help performance. Now that the second stage of the automatic numa balancing code has stabilized, it is time to replace the simplistic migration deferral code with something smarter. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 10 +-------- include/linux/sched.h | 1 - kernel/sched/fair.c | 8 -------- kernel/sysctl.c | 7 ------- mm/mempolicy.c | 45 ----------------------------------------- 5 files changed, 1 insertion(+), 70 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6d486404200e..760f6e6a2e40 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -386,8 +386,7 @@ feature should be disabled. Otherwise, if the system overhead from the feature is too high then the rate the kernel samples for NUMA hinting faults may be controlled by the numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, -numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and -numa_balancing_migrate_deferred. +numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls. ============================================================== @@ -428,13 +427,6 @@ rate for each task. numa_balancing_scan_size_mb is how many megabytes worth of pages are scanned for a given scan. -numa_balancing_migrate_deferred is how many page migrations get skipped -unconditionally, after a page migration is skipped because a page is shared -with other tasks. This reduces page migration overhead, and determines -how much stronger the "move task near its memory" policy scheduler becomes, -versus the "move memory near its task" memory management policy, for workloads -with shared memory. - ============================================================== osrelease, ostype & version: diff --git a/include/linux/sched.h b/include/linux/sched.h index ffccdad050b5..d572d5ba650f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1457,7 +1457,6 @@ struct task_struct { unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; - int numa_migrate_deferred; unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index efe6457ac5c8..7cdde913b4dc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -819,14 +819,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; -/* - * After skipping a page migration on a shared page, skip N more numa page - * migrations unconditionally. 
This reduces the number of NUMA migrations - * in shared memory workloads, and has the effect of pulling tasks towards - * where their memory lives, over pulling the memory towards the task. - */ -unsigned int sysctl_numa_balancing_migrate_deferred = 16; - static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8da99f905cf..b41d61d95c14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -384,13 +384,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "numa_balancing_migrate_deferred", - .data = &sysctl_numa_balancing_migrate_deferred, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0cd2c4d4e270..68d5c7f7164e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2304,35 +2304,6 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } -#ifdef CONFIG_NUMA_BALANCING -static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - /* Never defer a private fault */ - if (cpupid_match_pid(p, last_cpupid)) - return false; - - if (p->numa_migrate_deferred) { - p->numa_migrate_deferred--; - return true; - } - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ - p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; -} -#else -static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ -} -#endif /* CONFIG_NUMA_BALANCING */ - /** * mpol_misplaced - check whether current page node is valid in policy * @@ -2435,24 +2406,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long */ last_cpupid = page_cpupid_xchg_last(page, this_cpupid); if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { - - /* See sysctl_numa_balancing_migrate_deferred comment */ - if (!cpupid_match_pid(current, last_cpupid)) - defer_numa_migrate(current); - goto out; } - - /* - * The quadratic filter above reduces extraneous migration - * of shared pages somewhat. This code reduces it even more, - * reducing the overhead of page migrations of shared pages. - * This makes workloads with shared pages rely more on - * "move task near its memory", and less on "move memory - * towards its task", which is exactly what we want. - */ - if (numa_migrate_deferred(current, last_cpupid)) - goto out; } if (curnid != polnid) -- cgit v1.2.3 From ff1df896aef8e0ec1556a5c44f424bd45bfa2cbe Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:41 -0500 Subject: sched/numa: Rename p->numa_faults to numa_faults_memory In order to get a more consistent naming scheme, making it clear which fault statistics track memory locality, and which track CPU locality, rename the memory fault statistics. 
Suggested-by: Mel Gorman Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++---- kernel/sched/core.c | 4 ++-- kernel/sched/debug.c | 6 +++--- kernel/sched/fair.c | 56 +++++++++++++++++++++++++-------------------------- 4 files changed, 37 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d572d5ba650f..144d509df053 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1469,15 +1469,15 @@ struct task_struct { * Scheduling placement decisions are made based on the these counts. * The values remain static for the duration of a PTE scan */ - unsigned long *numa_faults; + unsigned long *numa_faults_memory; unsigned long total_numa_faults; /* * numa_faults_buffer records faults per node during the current - * scan window. When the scan completes, the counts in numa_faults - * decay and these values are copied. + * scan window. When the scan completes, the counts in + * numa_faults_memory decay and these values are copied. */ - unsigned long *numa_faults_buffer; + unsigned long *numa_faults_buffer_memory; /* * numa_faults_locality tracks if faults recorded during the last diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 81343d6bd9cb..bc708c53bf03 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1744,8 +1744,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dd52e7ffb10e..31b908daaa1b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults) - nr_faults = p->numa_faults[2*node + i]; + if (p->numa_faults_memory) + nr_faults = p->numa_faults_memory[2*node + i]; cpu_current = !i ? 
(task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); home_node = (p->numa_preferred_nid == node); - SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", + SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", i, node, cpu_current, home_node, nr_faults); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7cdde913b4dc..3e616d704f67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -901,11 +901,11 @@ static inline int task_faults_idx(int nid, int priv) static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; - return p->numa_faults[task_faults_idx(nid, 0)] + - p->numa_faults[task_faults_idx(nid, 1)]; + return p->numa_faults_memory[task_faults_idx(nid, 0)] + + p->numa_faults_memory[task_faults_idx(nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -927,7 +927,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) { unsigned long total_faults; - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; total_faults = p->total_numa_faults; @@ -1255,7 +1255,7 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1371,16 +1371,16 @@ static void task_numa_placement(struct task_struct *p) long diff; i = task_faults_idx(nid, priv); - diff = -p->numa_faults[i]; + diff = -p->numa_faults_memory[i]; /* Decay existing window, copy faults since last scan */ - p->numa_faults[i] >>= 1; - p->numa_faults[i] += p->numa_faults_buffer[i]; - fault_types[priv] += p->numa_faults_buffer[i]; - p->numa_faults_buffer[i] = 0; + p->numa_faults_memory[i] >>= 1; + p->numa_faults_memory[i] += p->numa_faults_buffer_memory[i]; + fault_types[priv] += p->numa_faults_buffer_memory[i]; + p->numa_faults_buffer_memory[i] = 0; - faults += p->numa_faults[i]; - diff += p->numa_faults[i]; + faults += p->numa_faults_memory[i]; + diff += p->numa_faults_memory[i]; p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ @@ -1465,7 +1465,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->gid = p->pid; for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] = p->numa_faults[i]; + grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1523,8 +1523,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock(&my_grp->lock, &grp->lock); for (i = 0; i < 2*nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults[i]; - grp->faults[i] += p->numa_faults[i]; + my_grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] += p->numa_faults_memory[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += p->total_numa_faults; @@ -1550,12 +1550,12 @@ void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; int i; - void *numa_faults = p->numa_faults; + void *numa_faults = p->numa_faults_memory; if (grp) { spin_lock(&grp->lock); for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); @@ -1565,8 +1565,8 @@ void task_numa_free(struct 
task_struct *p) put_numa_group(grp); } - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; kfree(numa_faults); } @@ -1591,16 +1591,16 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) return; /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults)) { - int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; + if (unlikely(!p->numa_faults_memory)) { + int size = sizeof(*p->numa_faults_memory) * 2 * nr_node_ids; /* numa_faults and numa_faults_buffer share the allocation */ - p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults) + p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults_memory) return; - BUG_ON(p->numa_faults_buffer); - p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); + BUG_ON(p->numa_faults_buffer_memory); + p->numa_faults_buffer_memory = p->numa_faults_memory + (2 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1629,7 +1629,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; + p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } @@ -4771,7 +4771,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) { return false; } @@ -4802,7 +4802,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); -- cgit v1.2.3 From 50ec8a401fed6d246ab65e6011d61ac91c34af70 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:42 -0500 Subject: sched/numa: Track from which nodes NUMA faults are triggered Track which nodes NUMA faults are triggered from, in other words the CPUs on which the NUMA faults happened. This uses a similar mechanism to what is used to track the memory involved in numa faults. The next patches use this to build up a bitmap of which nodes a workload is actively running on. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 +++++++-- kernel/sched/fair.c | 30 +++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 144d509df053..5fb0cfb43ecf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1479,6 +1479,13 @@ struct task_struct { */ unsigned long *numa_faults_buffer_memory; + /* + * Track the nodes the process was running on when a NUMA hinting + * fault was incurred. + */ + unsigned long *numa_faults_cpu; + unsigned long *numa_faults_buffer_cpu; + /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local. 
The task scan period is adapted @@ -1582,8 +1589,6 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p); - -extern unsigned int sysctl_numa_balancing_migrate_deferred; #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e616d704f67..4841aaff7394 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -886,6 +886,7 @@ struct numa_group { struct rcu_head rcu; unsigned long total_faults; + unsigned long *faults_cpu; unsigned long faults[0]; }; @@ -1368,10 +1369,11 @@ static void task_numa_placement(struct task_struct *p) int priv, i; for (priv = 0; priv < 2; priv++) { - long diff; + long diff, f_diff; i = task_faults_idx(nid, priv); diff = -p->numa_faults_memory[i]; + f_diff = -p->numa_faults_cpu[i]; /* Decay existing window, copy faults since last scan */ p->numa_faults_memory[i] >>= 1; @@ -1379,12 +1381,18 @@ static void task_numa_placement(struct task_struct *p) fault_types[priv] += p->numa_faults_buffer_memory[i]; p->numa_faults_buffer_memory[i] = 0; + p->numa_faults_cpu[i] >>= 1; + p->numa_faults_cpu[i] += p->numa_faults_buffer_cpu[i]; + p->numa_faults_buffer_cpu[i] = 0; + faults += p->numa_faults_memory[i]; diff += p->numa_faults_memory[i]; + f_diff += p->numa_faults_cpu[i]; p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ p->numa_group->faults[i] += diff; + p->numa_group->faults_cpu[i] += f_diff; p->numa_group->total_faults += diff; group_faults += p->numa_group->faults[i]; } @@ -1453,7 +1461,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!p->numa_group)) { unsigned int size = sizeof(struct numa_group) + - 2*nr_node_ids*sizeof(unsigned long); + 4*nr_node_ids*sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) @@ -1463,8 +1471,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, spin_lock_init(&grp->lock); INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; + /* Second half of the array tracks nids where faults happen */ + grp->faults_cpu = grp->faults + 2 * nr_node_ids; - for (i = 0; i < 2*nr_node_ids; i++) + for (i = 0; i < 4*nr_node_ids; i++) grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1522,7 +1532,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock(&my_grp->lock, &grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) { + for (i = 0; i < 4*nr_node_ids; i++) { my_grp->faults[i] -= p->numa_faults_memory[i]; grp->faults[i] += p->numa_faults_memory[i]; } @@ -1554,7 +1564,7 @@ void task_numa_free(struct task_struct *p) if (grp) { spin_lock(&grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) + for (i = 0; i < 4*nr_node_ids; i++) grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; @@ -1567,6 +1577,8 @@ void task_numa_free(struct task_struct *p) p->numa_faults_memory = NULL; p->numa_faults_buffer_memory = NULL; + p->numa_faults_cpu= NULL; + p->numa_faults_buffer_cpu = NULL; kfree(numa_faults); } @@ -1577,6 +1589,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) { struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; + int this_node = task_node(current); int priv; if (!numabalancing_enabled) @@ -1592,7 
+1605,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults_memory)) { - int size = sizeof(*p->numa_faults_memory) * 2 * nr_node_ids; + int size = sizeof(*p->numa_faults_memory) * 4 * nr_node_ids; /* numa_faults and numa_faults_buffer share the allocation */ p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); @@ -1600,7 +1613,9 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) return; BUG_ON(p->numa_faults_buffer_memory); - p->numa_faults_buffer_memory = p->numa_faults_memory + (2 * nr_node_ids); + p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); + p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); + p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1630,6 +1645,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) p->numa_pages_migrated += pages; p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; + p->numa_faults_buffer_cpu[task_faults_idx(this_node, priv)] += pages; p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } -- cgit v1.2.3 From 10f39042711ba21773763f267b4943a2c66c8bef Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:44 -0500 Subject: sched/numa, mm: Use active_nodes nodemask to limit numa migrations Use the active_nodes nodemask to make smarter decisions on NUMA migrations. In order to maximize performance of workloads that do not fit in one NUMA node, we want to satisfy the following criteria: 1) keep private memory local to each thread 2) avoid excessive NUMA migration of pages 3) distribute shared memory across the active nodes, to maximize memory bandwidth available to the workload This patch accomplishes that by implementing the following policy for NUMA migrations: 1) always migrate on a private fault 2) never migrate to a node that is not in the set of active nodes for the numa_group 3) always migrate from a node outside of the set of active nodes, to a node that is in that set 4) within the set of active nodes in the numa_group, only migrate from a node with more NUMA page faults, to a node with fewer NUMA page faults, with a 25% margin to avoid ping-ponging This results in most pages of a workload ending up on the actively used nodes, with reduced ping-ponging of pages between those nodes. 
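The four rules above map onto the new should_numa_migrate_memory() helper roughly as follows. This is a condensed, illustrative restatement; the complete function, including the two-stage cpupid filter, is in the hunk below:

	/* p = faulting task, ng = p->numa_group, src_nid/dst_nid = nodes */
	if (cpupid_match_pid(p, last_cpupid))		/* 1) private fault */
		return true;
	if (!ng)					/* no numa_group yet */
		return true;
	if (!node_isset(dst_nid, ng->active_nodes))	/* 2) dst not active */
		return false;
	if (!node_isset(src_nid, ng->active_nodes))	/* 3) src not active */
		return true;
	/*
	 * 4) both nodes active: only move towards the node with fewer
	 *    faults, with a 25% margin to avoid ping-ponging.
	 */
	return group_faults(p, dst_nid) < group_faults(p, src_nid) * 3 / 4;
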
Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-6-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 ++++++ kernel/sched/fair.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/mempolicy.c | 29 +----------------------- 3 files changed, 71 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5fb0cfb43ecf..5ab3b89fc33e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1589,6 +1589,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p); +extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, + int src_nid, int dst_cpu); #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) @@ -1604,6 +1606,11 @@ static inline void set_numabalancing_state(bool enabled) static inline void task_numa_free(struct task_struct *p) { } +static inline bool should_numa_migrate_memory(struct task_struct *p, + struct page *page, int src_nid, int dst_cpu) +{ + return true; +} #endif static inline struct pid *task_pid(struct task_struct *task) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1ee921f1ec35..eeabb33f349e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -954,6 +954,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid) return 1000 * group_faults(p, nid) / p->numa_group->total_faults; } +bool should_numa_migrate_memory(struct task_struct *p, struct page * page, + int src_nid, int dst_cpu) +{ + struct numa_group *ng = p->numa_group; + int dst_nid = cpu_to_node(dst_cpu); + int last_cpupid, this_cpupid; + + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + + /* + * Multi-stage node selection is used in conjunction with a periodic + * migration fault to build a temporal task<->page relation. By using + * a two-stage filter we remove short/unlikely relations. + * + * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate + * a task's usage of a particular page (n_p) per total usage of this + * page (n_t) (in a given time-span) to a probability. + * + * Our periodic faults will sample this probability and getting the + * same result twice in a row, given these samples are fully + * independent, is then given by P(n)^2, provided our sample period + * is sufficiently short compared to the usage pattern. + * + * This quadric squishes small probabilities, making it less likely we + * act on an unlikely task<->page relation. + */ + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && + cpupid_to_nid(last_cpupid) != dst_nid) + return false; + + /* Always allow migrate on private faults */ + if (cpupid_match_pid(p, last_cpupid)) + return true; + + /* A shared fault, but p->numa_group has not been set up yet. */ + if (!ng) + return true; + + /* + * Do not migrate if the destination is not a node that + * is actively used by this numa group. + */ + if (!node_isset(dst_nid, ng->active_nodes)) + return false; + + /* + * Source is a node that is not actively used by this + * numa group, while the destination is. Migrate. 
+ */ + if (!node_isset(src_nid, ng->active_nodes)) + return true; + + /* + * Both source and destination are nodes in active + * use by this numa group. Maximize memory bandwidth + * by migrating from more heavily used groups, to less + * heavily used ones, spreading the load around. + * Use a 1/4 hysteresis to avoid spurious page movement. + */ + return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); +} + static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 68d5c7f7164e..784c11ef7719 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2377,37 +2377,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_cpupid; - int this_cpupid; - polnid = thisnid; - this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); - /* - * Multi-stage node selection is used in conjunction - * with a periodic migration fault to build a temporal - * task<->page relation. By using a two-stage filter we - * remove short/unlikely relations. - * - * Using P(p) ~ n_p / n_t as per frequentist - * probability, we can equate a task's usage of a - * particular page (n_p) per total usage of this - * page (n_t) (in a given time-span) to a probability. - * - * Our periodic faults will sample this probability and - * getting the same result twice in a row, given these - * samples are fully independent, is then given by - * P(n)^2, provided our sample period is sufficiently - * short compared to the usage pattern. - * - * This quadric squishes small probabilities, making - * it less likely we act on an unlikely task<->page - * relation. - */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); - if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { + if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) goto out; - } } if (curnid != polnid) -- cgit v1.2.3 From 7e2703e6099609adc93679c4d45cd6247f565971 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:45 -0500 Subject: sched/numa: Normalize faults_cpu stats and weigh by CPU use Tracing the code that decides the active nodes has made it abundantly clear that the naive implementation of the faults_from code has issues. Specifically, the garbage collector in some workloads will access orders of magnitudes more memory than the threads that do all the active work. This resulted in the node with the garbage collector being marked the only active node in the group. This issue is avoided if we weigh the statistics by CPU use of each task in the numa group, instead of by how many faults each thread has occurred. To achieve this, we normalize the number of faults to the fraction of faults that occurred on each node, and then multiply that fraction by the fraction of CPU time the task has used since the last time task_numa_placement was invoked. This way the nodes in the active node mask will be the ones where the tasks from the numa group are most actively running, and the influence of eg. the garbage collector and other do-little threads is properly minimized. 
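Concretely, the per-node update in task_numa_placement() amounts to the following, shown here in simplified form (the patch below uses div64_u64() for the 64-bit division and operates on p->numa_faults_*):

	/* fraction of the NUMA placement period this task actually ran,
	 * in <<16 fixed point; +1 guards against division by zero */
	f_weight = (runtime << 16) / (period + 1);
	/* this node's share of the task's CPU faults, scaled by that fraction */
	f_weight = (f_weight * numa_faults_buffer_cpu[i]) / (total_faults + 1);
	/* decay the old statistic and add the weighted contribution */
	numa_faults_cpu[i] = (numa_faults_cpu[i] >> 1) + f_weight;

A thread that faults heavily but consumes little CPU over the interval therefore contributes little to faults_cpu, which is what keeps a garbage collector from dominating the active node mask.
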
On a 4 node system, using CPU use statistics calculated over a longer interval results in about 1% fewer page migrations with two 32-warehouse specjbb runs on a 4 node system, and about 5% fewer page migrations, as well as 1% better throughput, with two 8-warehouse specjbb runs, as compared with the shorter term statistics kept by the scheduler. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-7-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 55 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5ab3b89fc33e..ef92953764f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1459,6 +1459,8 @@ struct task_struct { int numa_preferred_nid; unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ + u64 last_task_numa_placement; + u64 last_sum_exec_runtime; struct callback_head numa_work; struct list_head numa_entry; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc708c53bf03..a561c9e8e382 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1746,6 +1746,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_work.next = &p->numa_work; p->numa_faults_memory = NULL; p->numa_faults_buffer_memory = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eeabb33f349e..8fc3a8234817 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -887,6 +887,11 @@ struct numa_group { struct rcu_head rcu; nodemask_t active_nodes; unsigned long total_faults; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ unsigned long *faults_cpu; unsigned long faults[0]; }; @@ -1446,11 +1451,41 @@ static void update_task_scan_period(struct task_struct *p, memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } +/* + * Get the fraction of time the task has been running since the last + * NUMA placement cycle. The scheduler keeps similar statistics, but + * decays those on a 32ms period, which is orders of magnitude off + * from the dozens-of-seconds NUMA balancing period. Use the scheduler + * stats only if the task is so new there are no NUMA statistics yet. + */ +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) +{ + u64 runtime, delta, now; + /* Use the start of this time slice to avoid calculations. 
*/ + now = p->se.exec_start; + runtime = p->se.sum_exec_runtime; + + if (p->last_task_numa_placement) { + delta = runtime - p->last_sum_exec_runtime; + *period = now - p->last_task_numa_placement; + } else { + delta = p->se.avg.runnable_avg_sum; + *period = p->se.avg.runnable_avg_period; + } + + p->last_sum_exec_runtime = runtime; + p->last_task_numa_placement = now; + + return delta; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; unsigned long max_faults = 0, max_group_faults = 0; unsigned long fault_types[2] = { 0, 0 }; + unsigned long total_faults; + u64 runtime, period; spinlock_t *group_lock = NULL; seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -1459,6 +1494,10 @@ static void task_numa_placement(struct task_struct *p) p->numa_scan_seq = seq; p->numa_scan_period_max = task_scan_max(p); + total_faults = p->numa_faults_locality[0] + + p->numa_faults_locality[1]; + runtime = numa_get_avg_runtime(p, &period); + /* If the task is part of a group prevent parallel updates to group stats */ if (p->numa_group) { group_lock = &p->numa_group->lock; @@ -1471,7 +1510,7 @@ static void task_numa_placement(struct task_struct *p) int priv, i; for (priv = 0; priv < 2; priv++) { - long diff, f_diff; + long diff, f_diff, f_weight; i = task_faults_idx(nid, priv); diff = -p->numa_faults_memory[i]; @@ -1483,8 +1522,18 @@ static void task_numa_placement(struct task_struct *p) fault_types[priv] += p->numa_faults_buffer_memory[i]; p->numa_faults_buffer_memory[i] = 0; + /* + * Normalize the faults_from, so all tasks in a group + * count according to CPU use, instead of by the raw + * number of faults. Tasks with little runtime have + * little over-all impact on throughput, and thus their + * faults are less important. + */ + f_weight = div64_u64(runtime << 16, period + 1); + f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + (total_faults + 1); p->numa_faults_cpu[i] >>= 1; - p->numa_faults_cpu[i] += p->numa_faults_buffer_cpu[i]; + p->numa_faults_cpu[i] += f_weight; p->numa_faults_buffer_cpu[i] = 0; faults += p->numa_faults_memory[i]; -- cgit v1.2.3 From da7e6f45c34d39186b72328bacc4dd86bff60e0a Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Fri, 7 Feb 2014 13:36:06 +0530 Subject: time: Change the return type of clockevents_notify() to integer The broadcast framework can potentially be made use of by archs which do not have an external clock device as well. Then, it is required that one of the CPUs need to handle the broadcasting of wakeup IPIs to the CPUs in deep idle. As a result its local timers should remain functional all the time. For such a CPU, the BROADCAST_ENTER notification has to fail indicating that its clock device cannot be shutdown. To make way for this support, change the return type of tick_broadcast_oneshot_control() and hence clockevents_notify() to indicate such scenarios. 
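A CPU entering deep idle is then expected to check the return value before shutting down its local timer, along the lines of the sketch below. This is illustrative only; the concrete call site and the fallback idle state are left to the architecture/cpuidle code and are not part of this patch (the two enter_*_idle_state() helpers are placeholders):

	int cpu = smp_processor_id();

	if (clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu)) {
		/*
		 * This CPU has been chosen to broadcast wakeups, so its
		 * local timer must keep running: pick a shallower idle
		 * state instead of the deep one.
		 */
		enter_shallow_idle_state();	/* placeholder */
	} else {
		enter_deep_idle_state();	/* placeholder */
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
	}
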
Signed-off-by: Preeti U Murthy Cc: deepthi@linux.vnet.ibm.com Cc: paulmck@linux.vnet.ibm.com Cc: fweisbec@gmail.com Cc: paulus@samba.org Cc: srivatsa.bhat@linux.vnet.ibm.com Cc: svaidy@linux.vnet.ibm.com Cc: peterz@infradead.org Cc: benh@kernel.crashing.org Cc: rafael.j.wysocki@intel.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20140207080606.17187.78306.stgit@preeti.in.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 6 +++--- kernel/time/clockevents.c | 8 +++++--- kernel/time/tick-broadcast.c | 6 ++++-- kernel/time/tick-internal.h | 6 +++--- 4 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 493aa021c7a9..e0c5a6c418de 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -186,9 +186,9 @@ static inline int tick_check_broadcast_expired(void) { return 0; } #endif #ifdef CONFIG_GENERIC_CLOCKEVENTS -extern void clockevents_notify(unsigned long reason, void *arg); +extern int clockevents_notify(unsigned long reason, void *arg); #else -static inline void clockevents_notify(unsigned long reason, void *arg) {} +static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; } #endif #else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */ @@ -196,7 +196,7 @@ static inline void clockevents_notify(unsigned long reason, void *arg) {} static inline void clockevents_suspend(void) {} static inline void clockevents_resume(void) {} -static inline void clockevents_notify(unsigned long reason, void *arg) {} +static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; } static inline int tick_check_broadcast_expired(void) { return 0; } #endif diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f85e5fda9c66..ad362c260ef4 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -542,12 +542,13 @@ void clockevents_resume(void) #ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events + * Returns 0 on success, any other value on error */ -void clockevents_notify(unsigned long reason, void *arg) +int clockevents_notify(unsigned long reason, void *arg) { struct clock_event_device *dev, *tmp; unsigned long flags; - int cpu; + int cpu, ret = 0; raw_spin_lock_irqsave(&clockevents_lock, flags); @@ -560,7 +561,7 @@ void clockevents_notify(unsigned long reason, void *arg) case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); + ret = tick_broadcast_oneshot_control(reason); break; case CLOCK_EVT_NOTIFY_CPU_DYING: @@ -603,6 +604,7 @@ void clockevents_notify(unsigned long reason, void *arg) break; } raw_spin_unlock_irqrestore(&clockevents_lock, flags); + return ret; } EXPORT_SYMBOL_GPL(clockevents_notify); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 003e6c3663b1..84c8fd91d744 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -646,14 +646,15 @@ again: /* * Powerstate information: The system enters/leaves a state, where * affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. 
*/ -void tick_broadcast_oneshot_control(unsigned long reason) +int tick_broadcast_oneshot_control(unsigned long reason) { struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags; ktime_t now; - int cpu; + int cpu, ret = 0; /* * Periodic mode does not care about the enter/exit of power @@ -759,6 +760,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) } out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 26f1c0ba9d81..0756c62c219a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -46,7 +46,7 @@ extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); extern void tick_resume_oneshot(void); # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); -extern void tick_broadcast_oneshot_control(unsigned long reason); +extern int tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); @@ -58,7 +58,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } @@ -87,7 +87,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { -- cgit v1.2.3 From 5d1638acb9f62fa7eb0c07cb85318bbe1f13b227 Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Fri, 7 Feb 2014 13:36:32 +0530 Subject: tick: Introduce hrtimer based broadcast On some architectures, in certain CPU deep idle states the local timers stop. An external clock device is used to wakeup these CPUs. The kernel support for the wakeup of these CPUs is provided by the tick broadcast framework by using the external clock device as the wakeup source. However not all implementations of architectures provide such an external clock device. This patch includes support in the broadcast framework to handle the wakeup of the CPUs in deep idle states on such systems by queuing a hrtimer on one of the CPUs, which is meant to handle the wakeup of CPUs in deep idle states. This patchset introduces a pseudo clock device which can be registered by the archs as tick_broadcast_device in the absence of a real external clock device. Once registered, the broadcast framework will work as is for these architectures as long as the archs take care of the BROADCAST_ENTER notification failing for one of the CPUs. This CPU is made the stand by CPU to handle wakeup of the CPUs in deep idle and it *must not enter deep idle states*. The CPU with the earliest wakeup is chosen to be this CPU. 
Hence this way the stand by CPU dynamically moves around and so does the hrtimer which is queued to trigger at the next earliest wakeup time. This is consistent with the case where an external clock device is present. The smp affinity of this clock device is set to the CPU with the earliest wakeup. This patchset handles the hotplug of the stand by CPU as well by moving the hrtimer on to the CPU handling the CPU_DEAD notification. Originally-from: Thomas Gleixner Signed-off-by: Preeti U Murthy Cc: deepthi@linux.vnet.ibm.com Cc: paulmck@linux.vnet.ibm.com Cc: fweisbec@gmail.com Cc: paulus@samba.org Cc: srivatsa.bhat@linux.vnet.ibm.com Cc: svaidy@linux.vnet.ibm.com Cc: peterz@infradead.org Cc: benh@kernel.crashing.org Cc: rafael.j.wysocki@intel.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20140207080632.17187.80532.stgit@preeti.in.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 9 +++ kernel/time/Makefile | 2 +- kernel/time/tick-broadcast-hrtimer.c | 106 +++++++++++++++++++++++++++++++++++ kernel/time/tick-broadcast.c | 54 +++++++++++++++++- 4 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 kernel/time/tick-broadcast-hrtimer.c (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index e0c5a6c418de..dbe9e1457168 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -62,6 +62,11 @@ enum clock_event_mode { #define CLOCK_EVT_FEAT_DYNIRQ 0x000020 #define CLOCK_EVT_FEAT_PERCPU 0x000040 +/* + * Clockevent device is based on a hrtimer for broadcast + */ +#define CLOCK_EVT_FEAT_HRTIMER 0x000080 + /** * struct clock_event_device - clock event device descriptor * @event_handler: Assigned by the framework to be called by the low @@ -83,6 +88,7 @@ enum clock_event_mode { * @name: ptr to clock event name * @rating: variable to rate clock event devices * @irq: IRQ number (only for non CPU local devices) + * @bound_on: Bound on CPU * @cpumask: cpumask to indicate for which CPUs this device works * @list: list head for the management code * @owner: module reference @@ -113,6 +119,7 @@ struct clock_event_device { const char *name; int rating; int irq; + int bound_on; const struct cpumask *cpumask; struct list_head list; struct module *owner; @@ -180,9 +187,11 @@ extern int tick_receive_broadcast(void); #endif #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) +extern void tick_setup_hrtimer_broadcast(void); extern int tick_check_broadcast_expired(void); #else static inline int tick_check_broadcast_expired(void) { return 0; } +static void tick_setup_hrtimer_broadcast(void) {}; #endif #ifdef CONFIG_GENERIC_CLOCKEVENTS diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 9250130646f5..06151ef4a744 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -3,7 +3,7 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o tick-broadcast-hrtimer.o obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c new file mode 100644 index 000000000000..92425279312b --- /dev/null +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -0,0 +1,106 @@ +/* + * 
linux/kernel/time/tick-broadcast-hrtimer.c + * This file emulates a local clock event device + * via a pseudo clock device. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tick-internal.h" + +static struct hrtimer bctimer; + +static void bc_set_mode(enum clock_event_mode mode, + struct clock_event_device *bc) +{ + switch (mode) { + case CLOCK_EVT_MODE_SHUTDOWN: + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + break; + default: + break; + } +} + +/* + * This is called from the guts of the broadcast code when the cpu + * which is about to enter idle has the earliest broadcast timer event. + */ +static int bc_set_next(ktime_t expires, struct clock_event_device *bc) +{ + /* + * We try to cancel the timer first. If the callback is on + * flight on some other cpu then we let it handle it. If we + * were able to cancel the timer nothing can rearm it as we + * own broadcast_lock. + * + * However we can also be called from the event handler of + * ce_broadcast_hrtimer itself when it expires. We cannot + * restart the timer because we are in the callback, but we + * can set the expiry time and let the callback return + * HRTIMER_RESTART. + */ + if (hrtimer_try_to_cancel(&bctimer) >= 0) { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + /* Bind the "device" to the cpu */ + bc->bound_on = smp_processor_id(); + } else if (bc->bound_on == smp_processor_id()) { + hrtimer_set_expires(&bctimer, expires); + } + return 0; +} + +static struct clock_event_device ce_broadcast_hrtimer = { + .set_mode = bc_set_mode, + .set_next_ktime = bc_set_next, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_KTIME | + CLOCK_EVT_FEAT_HRTIMER, + .rating = 0, + .bound_on = -1, + .min_delta_ns = 1, + .max_delta_ns = KTIME_MAX, + .min_delta_ticks = 1, + .max_delta_ticks = KTIME_MAX, + .mult = 1, + .shift = 0, + .cpumask = cpu_all_mask, +}; + +static enum hrtimer_restart bc_handler(struct hrtimer *t) +{ + ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); + + if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) + return HRTIMER_NORESTART; + + return HRTIMER_RESTART; +} + +void tick_setup_hrtimer_broadcast(void) +{ + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + bctimer.function = bc_handler; + clockevents_register_device(&ce_broadcast_hrtimer); +} diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 84c8fd91d744..63c7b2d9ed8e 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -643,6 +643,42 @@ again: raw_spin_unlock(&tick_broadcast_lock); } +static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) +{ + if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return 0; + if (bc->next_event.tv64 == KTIME_MAX) + return 0; + return bc->bound_on == cpu ? -EBUSY : 0; +} + +static void broadcast_shutdown_local(struct clock_event_device *bc, + struct clock_event_device *dev) +{ + /* + * For hrtimer based broadcasting we cannot shutdown the cpu + * local device if our own event is the first one to expire or + * if we own the broadcast timer. 
+ */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { + if (broadcast_needs_cpu(bc, smp_processor_id())) + return; + if (dev->next_event.tv64 < bc->next_event.tv64) + return; + } + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); +} + +static void broadcast_move_bc(int deadcpu) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || !broadcast_needs_cpu(bc, deadcpu)) + return; + /* This moves the broadcast assignment to this cpu */ + clockevents_program_event(bc, bc->next_event, 1); +} + /* * Powerstate information: The system enters/leaves a state, where * affected devices might stop @@ -661,7 +697,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) * states */ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - return; + return 0; /* * We are called with preemtion disabled from the depth of the @@ -672,7 +708,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return; + return 0; bc = tick_broadcast_device.evtdev; @@ -680,7 +716,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + broadcast_shutdown_local(bc, dev); /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and @@ -693,6 +729,16 @@ int tick_broadcast_oneshot_control(unsigned long reason) dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we remove the + * CPU from the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); @@ -866,6 +912,8 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); + broadcast_move_bc(cpu); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.2.3 From f1689bb7abec8e2e670d8ad11eaa86d54bad8cfd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 16:00:46 +0100 Subject: time: Fixup fallout from recent clockevent/tick changes Make the stub function static inline instead of static and move the clockevents related function into the proper ifdeffed section. 
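For illustration only (not part of the patch): a plain static function defined in a header triggers a "defined but not used" warning in every file that includes the header without calling it, whereas a static inline stub is silently discarded when unused. A minimal sketch of the corrected stub:

/*
 * Stub for the !GENERIC_CLOCKEVENTS_BROADCAST / !TICK_ONESHOT case.
 * 'static inline' lets the compiler drop the empty body in translation
 * units that never call it, so no -Wunused-function warnings show up.
 */
static inline void tick_setup_hrtimer_broadcast(void) { }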
Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner Cc: Soren Brinkmann Cc: Preeti U Murthy --- include/linux/clockchips.h | 2 +- kernel/time/tick-internal.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index dbe9e1457168..20a7183f2831 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -191,7 +191,7 @@ extern void tick_setup_hrtimer_broadcast(void); extern int tick_check_broadcast_expired(void); #else static inline int tick_check_broadcast_expired(void) { return 0; } -static void tick_setup_hrtimer_broadcast(void) {}; +static inline void tick_setup_hrtimer_broadcast(void) {}; #endif #ifdef CONFIG_GENERIC_CLOCKEVENTS diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 0756c62c219a..7ab92b19965a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -155,8 +155,9 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); + #endif -int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -- cgit v1.2.3 From 980f88e414418bf65569a3b62b08b07e6fc2f4c6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 6 Feb 2014 17:28:41 +0100 Subject: sched/wait: Suppress Sparse 'variable shadowing' warning This warning seems to show up a lot now, since ___wait_event() is (indirectly) used inside wait_event_timeout(), which also has a variable called __ret. Rename the one in ___wait_event() to ___ret (another leading underscore) to suppress the warning. Signed-off-by: Johannes Berg Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Andrew Morton Link: http://lkml.kernel.org/r/1391704121.12789.20.camel@jlt4.sipsolutions.net Signed-off-by: Ingo Molnar --- include/linux/wait.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 559044c79232..c55ea5c24404 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -195,7 +195,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); ({ \ __label__ __out; \ wait_queue_t __wait; \ - long __ret = ret; \ + long ___ret = ret; \ \ INIT_LIST_HEAD(&__wait.task_list); \ if (exclusive) \ @@ -210,7 +210,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); break; \ \ if (___wait_is_interruptible(state) && __int) { \ - __ret = __int; \ + ___ret = __int; \ if (exclusive) { \ abort_exclusive_wait(&wq, &__wait, \ state, NULL); \ @@ -222,7 +222,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); cmd; \ } \ finish_wait(&wq, &__wait); \ -__out: __ret; \ +__out: ___ret; \ }) #define __wait_event(wq, condition) \ -- cgit v1.2.3 From 6a02ad66b2c44155d529f430d4fa5c6c66321077 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 18:11:08 +0100 Subject: perf/x86: Push the duration-logging printk() to IRQ context Calling printk() from NMI context is bad (TM), so move it to IRQ context. This also avoids the problem where the printk() time is measured by the generic NMI duration goo and triggers a second warning. 
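The deferral pattern, as a minimal self-contained sketch (the irq_work API is real and DEFINE_IRQ_WORK is the macro this patch adds; the helper names report_slow_sample, slow_sample_work and sample_took_too_long are made up): the NMI-time code only queues an irq_work item, and the actual printk() then runs from hard-IRQ context where it is safe.

#include <linux/irq_work.h>
#include <linux/printk.h>

static void report_slow_sample(struct irq_work *w)
{
	/* Hard-IRQ context: printk() is fine here. */
	pr_warn_ratelimited("perf sample handler took too long\n");
}

static DEFINE_IRQ_WORK(slow_sample_work, report_slow_sample);

static void sample_took_too_long(void)
{
	/* NMI context: never printk() directly, just queue the warning. */
	irq_work_queue(&slow_sample_work);
}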
Signed-off-by: Peter Zijlstra Cc: Don Zickus Cc: Dave Hansen Link: http://lkml.kernel.org/n/tip-75dv35xf6dhhmeb7nq6fua31@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/irq_work.h | 2 ++ kernel/events/core.c | 28 +++++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 66017028dcb3..add13c8624b7 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -30,6 +30,8 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) work->func = func; } +#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), } + void irq_work_queue(struct irq_work *work); void irq_work_run(void); void irq_work_sync(struct irq_work *work); diff --git a/kernel/events/core.c b/kernel/events/core.c index 56003c6edfd3..2067cbb378eb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -231,11 +231,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); -void perf_sample_event_took(u64 sample_len_ns) +static void perf_duration_warn(struct irq_work *w) { + u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); u64 avg_local_sample_len; u64 local_samples_len; + + local_samples_len = __get_cpu_var(running_sample_length); + avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; + + printk_ratelimited(KERN_WARNING + "perf interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, allowed_ns, + sysctl_perf_event_sample_rate); +} + +static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); + +void perf_sample_event_took(u64 sample_len_ns) +{ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); + u64 avg_local_sample_len; + u64 local_samples_len; if (allowed_ns == 0) return; @@ -263,13 +281,9 @@ void perf_sample_event_took(u64 sample_len_ns) sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; - printk_ratelimited(KERN_WARNING - "perf samples too long (%lld > %lld), lowering " - "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns, - sysctl_perf_event_sample_rate); - update_perf_cpu_limits(); + + irq_work_queue(&perf_duration_work); } static atomic64_t perf_event_id; -- cgit v1.2.3 From 5c228079ce8a9bb043a423069a6674dfb9268037 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 27 Jan 2014 17:15:37 -0500 Subject: sched: Move the priority specific bits into a new header file Some bits about priority are defined in linux/sched/rt.h, but some of them are not only for rt scheduler, such as MAX_PRIO. This patch move them all into a new header file, linux/sched/prio.h. 
Signed-off-by: Dongsheng Yang Cc: clark.williams@gmail.com Cc: rostedt@goodmis.org Cc: raistlin@linux.it Cc: juri.lelli@gmail.com Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/f7549508a1588da2c613d601748ca9de30fa5dcf.1390859827.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ include/linux/sched/prio.h | 23 +++++++++++++++++++++++ include/linux/sched/rt.h | 19 +------------------ 3 files changed, 26 insertions(+), 18 deletions(-) create mode 100644 include/linux/sched/prio.h (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ed867797fe5a..d97d0a8e87dc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3,6 +3,8 @@ #include +#include + struct sched_param { int sched_priority; diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h new file mode 100644 index 000000000000..9382ba84d5d0 --- /dev/null +++ b/include/linux/sched/prio.h @@ -0,0 +1,23 @@ +#ifndef _SCHED_PRIO_H +#define _SCHED_PRIO_H + +/* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority + * values are inverted: lower p->prio value means higher priority. + * + * The MAX_USER_RT_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. + */ + +#define MAX_USER_RT_PRIO 100 +#define MAX_RT_PRIO MAX_USER_RT_PRIO + +#define MAX_PRIO (MAX_RT_PRIO + 40) +#define DEFAULT_PRIO (MAX_RT_PRIO + 20) + +#endif /* _SCHED_PRIO_H */ diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 34e4ebea8fce..f7453d4c5613 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -1,24 +1,7 @@ #ifndef _SCHED_RT_H #define _SCHED_RT_H -/* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH - * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority - * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. - */ - -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO - -#define MAX_PRIO (MAX_RT_PRIO + 40) -#define DEFAULT_PRIO (MAX_RT_PRIO + 20) +#include static inline int rt_prio(int prio) { -- cgit v1.2.3 From 6b6350f155afdfdf888e18c7bf26950a6d10b0c2 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 27 Jan 2014 17:15:38 -0500 Subject: sched: Expose some macros related to priority Some macros in kernel/sched/sched.h about priority are private to kernel/sched. But they are useful to other parts of the core kernel. This patch moves these macros from kernel/sched/sched.h to include/linux/sched/prio.h so that they are available to other subsystems. 
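As a quick sanity check on the arithmetic being moved (a throwaway sketch assuming the usual MAX_RT_PRIO of 100; prio_macro_examples() is not a real kernel function):

#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/sched/prio.h>

static inline void __maybe_unused prio_macro_examples(void)
{
	BUILD_BUG_ON(NICE_TO_PRIO(-20) != 100);		/* best non-RT priority */
	BUILD_BUG_ON(NICE_TO_PRIO(0) != DEFAULT_PRIO);	/* 120 */
	BUILD_BUG_ON(NICE_TO_PRIO(19) != MAX_PRIO - 1);	/* 139 */
	BUILD_BUG_ON(PRIO_TO_NICE(DEFAULT_PRIO) != 0);
	BUILD_BUG_ON(MAX_USER_PRIO != 40);		/* USER_PRIO() maps 100..139 to 0..39 */
}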
Signed-off-by: Dongsheng Yang Cc: raistlin@linux.it Cc: juri.lelli@gmail.com Cc: clark.williams@gmail.com Cc: rostedt@goodmis.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/2b022810905b52d13238466807f4b2a691577180.1390859827.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- include/linux/sched/prio.h | 18 ++++++++++++++++++ kernel/sched/sched.h | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index 9382ba84d5d0..13216f16762e 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -20,4 +20,22 @@ #define MAX_PRIO (MAX_RT_PRIO + 40) #define DEFAULT_PRIO (MAX_RT_PRIO + 20) +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + #endif /* _SCHED_PRIO_H */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2119fd20f8b..b44720d38ae9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -23,24 +23,6 @@ extern atomic_long_t calc_load_tasks; extern long calc_load_fold_active(struct rq *this_rq); extern void update_cpu_load_active(struct rq *this_rq); -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - /* * Helpers for converting nanosecond timing to jiffy resolution */ -- cgit v1.2.3 From 849401b66d305f3feb75b6db7459b95ad190552a Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Sun, 9 Feb 2014 11:32:22 +0530 Subject: tick: Fixup more fallout from hrtimer broadcast mode The hrtimer mode of broadcast is supported only when GENERIC_CLOCKEVENTS_BROADCAST and TICK_ONESHOT config options are enabled. Hence compile in the functions for hrtimer mode of broadcast only when these options are selected. Also fix max_delta_ticks value for the pseudo clock device. 
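One aside on the second hunk (hedged; field types as in struct clock_event_device at the time): min_delta_ticks/max_delta_ticks are unsigned long tick counts, while the *_ns limits are 64-bit nanosecond values, so the type-matching "no real limit" sentinel for the pseudo device is ULONG_MAX rather than KTIME_MAX. Illustrative fragment only (example_hrtimer_bc is a placeholder name):

#include <linux/clockchips.h>
#include <linux/kernel.h>
#include <linux/ktime.h>

static struct clock_event_device example_hrtimer_bc = {
	.min_delta_ns	 = 1,
	.max_delta_ns	 = KTIME_MAX,	/* ns limits are 64-bit */
	.min_delta_ticks = 1,
	.max_delta_ticks = ULONG_MAX,	/* tick limits are unsigned long */
};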
Reported-by: Fengguang Wu Reported-by: Ingo Molnar Signed-off-by: Preeti U Murthy Link: http://lkml.kernel.org/r/52F719EE.9010304@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 1 + kernel/time/Makefile | 5 ++++- kernel/time/tick-broadcast-hrtimer.c | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 20a7183f2831..2e4cb67f6e56 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -207,6 +207,7 @@ static inline void clockevents_resume(void) {} static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; } static inline int tick_check_broadcast_expired(void) { return 0; } +static inline void tick_setup_hrtimer_broadcast(void) {}; #endif diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 06151ef4a744..57a413fd0ebf 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -3,7 +3,10 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o tick-broadcast-hrtimer.o +ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) + obj-y += tick-broadcast.o + obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o +endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 92425279312b..eb682d5c697c 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -82,7 +82,7 @@ static struct clock_event_device ce_broadcast_hrtimer = { .min_delta_ns = 1, .max_delta_ns = KTIME_MAX, .min_delta_ticks = 1, - .max_delta_ticks = KTIME_MAX, + .max_delta_ticks = ULONG_MAX, .mult = 1, .shift = 0, .cpumask = cpu_all_mask, -- cgit v1.2.3 From d0ea026808ad81de2af14938448419a95211b938 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 27 Jan 2014 22:00:45 -0500 Subject: sched: Implement task_nice() as static inline function As patch "sched: Move the priority specific bits into a new header file" exposes the priority related macros in linux/sched/prio.h, we don't have to implement task_nice() in kernel/sched/core.c any more. This patch implements it in linux/sched/sched.h as static inline function, saving the kernel stack and enhancing performance a bit. 
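A caller-side view (hypothetical snippet; account_as_nice() is made up, the conversion mirrors the ones later in this patch): each former task_nice()/TASK_NICE() user now gets the computation inlined at the call site, with no function call and no exported symbol.

#include <linux/sched.h>

static int account_as_nice(struct task_struct *p)
{
	/* Expands to PRIO_TO_NICE(p->static_prio) > 0 right at the call site. */
	return task_nice(p) > 0;
}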
Signed-off-by: Dongsheng Yang Cc: clark.williams@gmail.com Cc: rostedt@goodmis.org Cc: raistlin@linux.it Cc: juri.lelli@gmail.com Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1390878045-7096-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 ++++++++++- include/linux/sched/prio.h | 1 - kernel/sched/core.c | 26 +++++++------------------- kernel/sched/cputime.c | 4 ++-- 4 files changed, 19 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d97d0a8e87dc..e3d556427b2e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2094,7 +2094,16 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { } extern bool yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); -extern int task_nice(const struct task_struct *p); +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + * + * Return: The nice value [ -20 ... 0 ... 19 ]. + */ +static inline int task_nice(const struct task_struct *p) +{ + return PRIO_TO_NICE((p)->static_prio); +} extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index 13216f16762e..410ccb74c9e6 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -27,7 +27,6 @@ */ #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) /* * 'User priority' is the nice value converted to something we diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 210a12acf2cd..104c8164e04f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3000,7 +3000,7 @@ void set_user_nice(struct task_struct *p, long nice) unsigned long flags; struct rq *rq; - if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + if (task_nice(p) == nice || nice < -20 || nice > 19) return; /* * We have to be careful, if called from sys_setpriority(), @@ -3078,7 +3078,7 @@ SYSCALL_DEFINE1(nice, int, increment) if (increment > 40) increment = 40; - nice = TASK_NICE(current) + increment; + nice = task_nice(current) + increment; if (nice < -20) nice = -20; if (nice > 19) @@ -3110,18 +3110,6 @@ int task_prio(const struct task_struct *p) return p->prio - MAX_RT_PRIO; } -/** - * task_nice - return the nice value of a given task. - * @p: the task in question. - * - * Return: The nice value [ -20 ... 0 ... 19 ]. - */ -int task_nice(const struct task_struct *p) -{ - return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - /** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. @@ -3321,7 +3309,7 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (fair_policy(policy)) { - if (attr->sched_nice < TASK_NICE(p) && + if (attr->sched_nice < task_nice(p) && !can_nice(p, attr->sched_nice)) return -EPERM; } @@ -3345,7 +3333,7 @@ recheck: * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 
*/ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { - if (!can_nice(p, TASK_NICE(p))) + if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -3385,7 +3373,7 @@ recheck: * If not changing anything there's no need to proceed further: */ if (unlikely(policy == p->policy)) { - if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) + if (fair_policy(policy) && attr->sched_nice != task_nice(p)) goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; @@ -3837,7 +3825,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, else if (task_has_rt_policy(p)) attr.sched_priority = p->rt_priority; else - attr.sched_nice = TASK_NICE(p); + attr.sched_nice = task_nice(p); rcu_read_unlock(); @@ -7010,7 +6998,7 @@ void normalize_rt_tasks(void) * Renice negative nice level userspace * tasks back to 0: */ - if (TASK_NICE(p) < 0 && p->mm) + if (task_nice(p) < 0 && p->mm) set_user_nice(p, 0); continue; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 99947919e30b..58624a65f124 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; /* Add user time to cpustat. */ task_group_account_field(p, index, (__force u64) cputime); @@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime += cputime; /* Add guest time to cpustat. */ - if (TASK_NICE(p) > 0) { + if (task_nice(p) > 0) { cpustat[CPUTIME_NICE] += (__force u64) cputime; cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; } else { -- cgit v1.2.3 From ddf1d169c0a489d498c1799a7043904a43b0c159 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 21 Jan 2014 15:36:22 -0800 Subject: locking/mcs: Allow architecture specific asm files to be used for contended case This patch allows each architecture to add its specific assembly optimized arch_mcs_spin_lock_contended and arch_mcs_spinlock_uncontended for MCS lock and unlock functions. Signed-off-by: Tim Chen Cc: Scott J Norton Cc: Raghavendra K T Cc: AswinChandramouleeswaran Cc: George Spelvin Cc: Rik vanRiel Cc: Andrea Arcangeli Cc: MichelLespinasse Cc: Peter Hurley Cc: Andi Kleen Cc: Alex Shi Cc: Dave Hansen Cc: Tim Chen Cc: Arnd Bergmann Cc: "Figo.zhang" Cc: "Paul E.McKenney" Cc: "H. 
Peter Anvin" Cc: Davidlohr Bueso Cc: Waiman Long Cc: Ingo Molnar Cc: Will Deacon Cc: Andrew Morton Cc: Linus Torvalds Cc: Matthew R Wilcox Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1390347382.3138.67.camel@schen9-DESK Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/Kbuild | 1 + arch/arc/include/asm/Kbuild | 1 + arch/arm/include/asm/Kbuild | 1 + arch/arm64/include/asm/Kbuild | 1 + arch/avr32/include/asm/Kbuild | 1 + arch/blackfin/include/asm/Kbuild | 1 + arch/c6x/include/asm/Kbuild | 1 + arch/cris/include/asm/Kbuild | 1 + arch/frv/include/asm/Kbuild | 1 + arch/hexagon/include/asm/Kbuild | 1 + arch/ia64/include/asm/Kbuild | 1 + arch/m32r/include/asm/Kbuild | 1 + arch/m68k/include/asm/Kbuild | 1 + arch/metag/include/asm/Kbuild | 1 + arch/microblaze/include/asm/Kbuild | 1 + arch/mips/include/asm/Kbuild | 1 + arch/mn10300/include/asm/Kbuild | 1 + arch/openrisc/include/asm/Kbuild | 1 + arch/parisc/include/asm/Kbuild | 1 + arch/powerpc/include/asm/Kbuild | 1 + arch/s390/include/asm/Kbuild | 1 + arch/score/include/asm/Kbuild | 1 + arch/sh/include/asm/Kbuild | 1 + arch/sparc/include/asm/Kbuild | 1 + arch/tile/include/asm/Kbuild | 1 + arch/um/include/asm/Kbuild | 1 + arch/unicore32/include/asm/Kbuild | 1 + arch/x86/include/asm/Kbuild | 1 + arch/xtensa/include/asm/Kbuild | 1 + include/asm-generic/mcs_spinlock.h | 13 +++++++++++++ include/linux/mcs_spinlock.h | 2 ++ 31 files changed, 44 insertions(+) create mode 100644 include/asm-generic/mcs_spinlock.h (limited to 'include/linux') diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 55a2b0395d48..7736f426ff3b 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -3,5 +3,6 @@ generic-y += clkdev.h generic-y += exec.h generic-y += hash.h +generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += trace_clock.h diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index 54ceb91c7ce0..e76fd79f32b0 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += kmap_types.h generic-y += kvm_para.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += mman.h generic-y += msgbuf.h generic-y += param.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 3b2092049b51..23e728ecf8ab 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -14,6 +14,7 @@ generic-y += irq_regs.h generic-y += kdebug.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += msgbuf.h generic-y += param.h generic-y += parport.h diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 54038befa0a4..3bdfdda70567 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -23,6 +23,7 @@ generic-y += kmap_types.h generic-y += kvm_para.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += mman.h generic-y += msgbuf.h generic-y += mutex.h diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild index 064ca3c058ff..8b398ff96974 100644 --- a/arch/avr32/include/asm/Kbuild +++ b/arch/avr32/include/asm/Kbuild @@ -11,6 +11,7 @@ generic-y += hash.h generic-y += irq_regs.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += param.h generic-y += percpu.h generic-y += preempt.h diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index ca30a3dc47e7..0d93b9a79ca9 100644 --- 
a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild @@ -20,6 +20,7 @@ generic-y += kmap_types.h generic-y += kvm_para.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += mman.h generic-y += msgbuf.h generic-y += mutex.h diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index dea0671524a1..8dbdce8421b0 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -25,6 +25,7 @@ generic-y += irq_regs.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += local.h +generic-y += mcs_spinlock.h generic-y += mman.h generic-y += mmu.h generic-y += mmu_context.h diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index 7f8d1b174a34..056027f38351 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -9,6 +9,7 @@ generic-y += exec.h generic-y += hash.h generic-y += kvm_para.h generic-y += linkage.h +generic-y += mcs_spinlock.h generic-y += module.h generic-y += preempt.h generic-y += trace_clock.h diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild index 93428d035cfb..babb9338ebf8 100644 --- a/arch/frv/include/asm/Kbuild +++ b/arch/frv/include/asm/Kbuild @@ -2,5 +2,6 @@ generic-y += clkdev.h generic-y += exec.h generic-y += hash.h +generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += trace_clock.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 9f0eda48643a..eadcc118f950 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -27,6 +27,7 @@ generic-y += kdebug.h generic-y += kmap_types.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += mman.h generic-y += msgbuf.h generic-y += pci.h diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index 81ebef7f1a87..0da4aa2602ae 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += clkdev.h generic-y += exec.h generic-y += hash.h generic-y += kvm_para.h +generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += trace_clock.h generic-y += vtime.h diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index 87042215b543..5825a35b2c56 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += clkdev.h generic-y += exec.h generic-y += hash.h +generic-y += mcs_spinlock.h generic-y += module.h generic-y += preempt.h generic-y += trace_clock.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index f44ac8f9536a..86b4c178cc75 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += kmap_types.h generic-y += kvm_para.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += mman.h generic-y += mutex.h generic-y += percpu.h diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild index ac22144e92a3..c29ead89a317 100644 --- a/arch/metag/include/asm/Kbuild +++ b/arch/metag/include/asm/Kbuild @@ -24,6 +24,7 @@ generic-y += kmap_types.h generic-y += kvm_para.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += msgbuf.h generic-y += mutex.h generic-y += param.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 4b5f12259d81..1f590ab8f323 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += barrier.h 
generic-y += clkdev.h generic-y += exec.h generic-y += hash.h +generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += syscalls.h generic-y += trace_clock.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 82ee17a68ae0..05439187891d 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -4,6 +4,7 @@ generic-y += current.h generic-y += emergency-restart.h generic-y += hash.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += mutex.h generic-y += parport.h generic-y += percpu.h diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index da3b2f50c07e..cbc6b9bf45da 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -3,5 +3,6 @@ generic-y += barrier.h generic-y += clkdev.h generic-y += exec.h generic-y += hash.h +generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += trace_clock.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index f62a67a15055..480af0d9c2f5 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -35,6 +35,7 @@ generic-y += kdebug.h generic-y += kmap_types.h generic-y += kvm_para.h generic-y += local.h +generic-y += mcs_spinlock.h generic-y += mman.h generic-y += module.h generic-y += msgbuf.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 958b79aac24f..ecf25e6678ad 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -14,6 +14,7 @@ generic-y += kdebug.h generic-y += kvm_para.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += mutex.h generic-y += param.h generic-y += percpu.h diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index b618c416d3d9..3fb1bc432f4f 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -1,6 +1,7 @@ generic-y += clkdev.h generic-y += hash.h +generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += rwsem.h generic-y += trace_clock.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 482f492f0834..57892a8a9055 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -2,5 +2,6 @@ generic-y += clkdev.h generic-y += hash.h +generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += trace_clock.h diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index e92319fb146d..4630cf217b5b 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild @@ -5,6 +5,7 @@ header-y += generic-y += barrier.h generic-y += clkdev.h generic-y += hash.h +generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += trace_clock.h generic-y += xor.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 85c214e456f9..c19e47dacb31 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -15,6 +15,7 @@ generic-y += irq_regs.h generic-y += kvm_para.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += mman.h generic-y += msgbuf.h generic-y += param.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 82150287bfa4..a45821818003 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -11,6 +11,7 @@ generic-y += irq_regs.h generic-y += linkage.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += module.h generic-y += mutex.h generic-y += preempt.h diff 
--git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild index e0b1ed967737..0aa5675e7025 100644 --- a/arch/tile/include/asm/Kbuild +++ b/arch/tile/include/asm/Kbuild @@ -19,6 +19,7 @@ generic-y += ipcbuf.h generic-y += irq_regs.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += msgbuf.h generic-y += mutex.h generic-y += param.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 215e05d8d4b8..a5e4b6068213 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -15,6 +15,7 @@ generic-y += hw_irq.h generic-y += io.h generic-y += irq_regs.h generic-y += kdebug.h +generic-y += mcs_spinlock.h generic-y += mutex.h generic-y += param.h generic-y += pci.h diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index ad2c737a9cab..1e5fb872a4aa 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -25,6 +25,7 @@ generic-y += irq_regs.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += local.h +generic-y += mcs_spinlock.h generic-y += mman.h generic-y += module.h generic-y += msgbuf.h diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 7f669853317a..a8fee078b92f 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -5,3 +5,4 @@ genhdr-y += unistd_64.h genhdr-y += unistd_x32.h generic-y += clkdev.h +generic-y += mcs_spinlock.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 223280d8e611..c3d20ba6eb86 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -18,6 +18,7 @@ generic-y += kvm_para.h generic-y += linkage.h generic-y += local.h generic-y += local64.h +generic-y += mcs_spinlock.h generic-y += percpu.h generic-y += preempt.h generic-y += resource.h diff --git a/include/asm-generic/mcs_spinlock.h b/include/asm-generic/mcs_spinlock.h new file mode 100644 index 000000000000..10cd4ffc6ba2 --- /dev/null +++ b/include/asm-generic/mcs_spinlock.h @@ -0,0 +1,13 @@ +#ifndef __ASM_MCS_SPINLOCK_H +#define __ASM_MCS_SPINLOCK_H + +/* + * Architectures can define their own: + * + * arch_mcs_spin_lock_contended(l) + * arch_mcs_spin_unlock_contended(l) + * + * See kernel/locking/mcs_spinlock.c. + */ + +#endif /* __ASM_MCS_SPINLOCK_H */ diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h index e9a4d74c63dc..f2a5c6360083 100644 --- a/include/linux/mcs_spinlock.h +++ b/include/linux/mcs_spinlock.h @@ -12,6 +12,8 @@ #ifndef __LINUX_MCS_SPINLOCK_H #define __LINUX_MCS_SPINLOCK_H +#include + struct mcs_spinlock { struct mcs_spinlock *next; int locked; /* 1 if lock acquired */ -- cgit v1.2.3 From fb9edbe98493fcd9df66de926ae9157cbe0e4dcd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jan 2014 19:20:06 +0100 Subject: lockdep: Make held_lock->check and "int check" argument bool The "int check" argument of lock_acquire() and held_lock->check are misleading. This is actually a boolean: 2 means "true", everything else is "false". And there is no need to pass 1 or 0 to lock_acquire() depending on CONFIG_PROVE_LOCKING, __lock_acquire() checks prove_locking at the start and clears "check" if !CONFIG_PROVE_LOCKING. Note: probably we can simply kill this member/arg. The only explicit user of check => 0 is rcu_lock_acquire(), perhaps we can change it to use lock_acquire(trylock =>, read => 2). __lockdep_no_validate means check => 0 implicitly, but we can change validate_chain() to check hlock->instance->key instead. 
Not to mention it would be nice to get rid of lockdep_set_novalidate_class(). Signed-off-by: Oleg Nesterov Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul McKenney Cc: Steven Rostedt Cc: Alan Stern Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140120182006.GA26495@redhat.com Signed-off-by: Ingo Molnar --- drivers/tty/tty_ldsem.c | 15 ++++----------- include/linux/lockdep.h | 25 +++++++++---------------- include/linux/rcupdate.h | 2 +- kernel/locking/lockdep.c | 11 ++++------- 4 files changed, 18 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c index d8a55e87877f..0ffb0cbe2823 100644 --- a/drivers/tty/tty_ldsem.c +++ b/drivers/tty/tty_ldsem.c @@ -39,17 +39,10 @@ lock_acquire(&(l)->dep_map, s, t, r, c, n, i) # define __rel(l, n, i) \ lock_release(&(l)->dep_map, n, i) -# ifdef CONFIG_PROVE_LOCKING -# define lockdep_acquire(l, s, t, i) __acq(l, s, t, 0, 2, NULL, i) -# define lockdep_acquire_nest(l, s, t, n, i) __acq(l, s, t, 0, 2, n, i) -# define lockdep_acquire_read(l, s, t, i) __acq(l, s, t, 1, 2, NULL, i) -# define lockdep_release(l, n, i) __rel(l, n, i) -# else -# define lockdep_acquire(l, s, t, i) __acq(l, s, t, 0, 1, NULL, i) -# define lockdep_acquire_nest(l, s, t, n, i) __acq(l, s, t, 0, 1, n, i) -# define lockdep_acquire_read(l, s, t, i) __acq(l, s, t, 1, 1, NULL, i) -# define lockdep_release(l, n, i) __rel(l, n, i) -# endif +#define lockdep_acquire(l, s, t, i) __acq(l, s, t, 0, 1, NULL, i) +#define lockdep_acquire_nest(l, s, t, n, i) __acq(l, s, t, 0, 1, n, i) +#define lockdep_acquire_read(l, s, t, i) __acq(l, s, t, 1, 1, NULL, i) +#define lockdep_release(l, n, i) __rel(l, n, i) #else # define lockdep_acquire(l, s, t, i) do { } while (0) # define lockdep_acquire_nest(l, s, t, n, i) do { } while (0) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 92b1bfc5da60..1626047c1f26 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -252,9 +252,9 @@ struct held_lock { unsigned int trylock:1; /* 16 bits */ unsigned int read:2; /* see lock_acquire() comment */ - unsigned int check:2; /* see lock_acquire() comment */ + unsigned int check:1; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; - unsigned int references:11; /* 32 bits */ + unsigned int references:12; /* 32 bits */ }; /* @@ -326,9 +326,8 @@ static inline int lockdep_match_key(struct lockdep_map *lock, * * Values for check: * - * 0: disabled - * 1: simple checks (freeing, held-at-exit-time, etc.) - * 2: full validation + * 0: simple checks (freeing, held-at-exit-time, etc.) 
+ * 1: full validation */ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, @@ -479,15 +478,9 @@ static inline void print_irqtrace_events(struct task_struct *curr) * on the per lock-class debug mode: */ -#ifdef CONFIG_PROVE_LOCKING - #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) - #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 2, n, i) - #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 2, n, i) -#else - #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) - #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) - #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) -#endif +#define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) +#define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) +#define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) @@ -518,13 +511,13 @@ static inline void print_irqtrace_events(struct task_struct *curr) # define might_lock(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ - lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_); \ + lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ } while (0) # define might_lock_read(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ - lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_); \ + lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ } while (0) #else diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 72bf3a01a4ee..adff3c99dcaa 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -314,7 +314,7 @@ static inline bool rcu_lockdep_current_cpu_online(void) static inline void rcu_lock_acquire(struct lockdep_map *map) { - lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_); + lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } static inline void rcu_lock_release(struct lockdep_map *map) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index eb8a54783fa0..8c85a0da5a38 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * (If lookup_chain_cache() returns with 1 it acquires * graph_lock for us) */ - if (!hlock->trylock && (hlock->check == 2) && + if (!hlock->trylock && hlock->check && lookup_chain_cache(curr, hlock, chain_key)) { /* * Check whether last held lock: @@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int class_idx; u64 chain_key; - if (!prove_locking) - check = 1; - if (unlikely(!debug_locks)) return 0; @@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (lock->key == &__lockdep_no_validate__) - check = 1; + if (!prove_locking || lock->key == &__lockdep_no_validate__) + check = 0; if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->holdtime_stamp = lockstat_clock(); 
#endif - if (check == 2 && !mark_irqflags(curr, hlock)) + if (check && !mark_irqflags(curr, hlock)) return 0; /* mark it as used: */ -- cgit v1.2.3 From 47be1c1a0e188232b5e5962917b21750053cd3f8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jan 2014 19:20:16 +0100 Subject: lockdep: Change lockdep_set_novalidate_class() to use _and_name Cosmetic. This doesn't really matter because a) device->mutex is the only user of __lockdep_no_validate__ and b) this class should be never reported as the source of problem, but if something goes wrong "&dev->mutex" looks better than "&__lockdep_no_validate__" as the name of the lock. Signed-off-by: Oleg Nesterov Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul McKenney Cc: Steven Rostedt Cc: Alan Stern Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140120182016.GA26512@redhat.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 1626047c1f26..060e5137fd80 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -303,7 +303,7 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name, (lock)->dep_map.key, sub) #define lockdep_set_novalidate_class(lock) \ - lockdep_set_class(lock, &__lockdep_no_validate__) + lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) /* * Compare locking classes */ -- cgit v1.2.3 From fed14d45f945042a15b09de48d7d3d58d9455fc4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 11 Feb 2012 06:05:00 +0100 Subject: sched/fair: Track cgroup depth Track depth in cgroup tree, this is useful for things like find_matching_se() where you need to get to a common parent of two sched entities. Keeping the depth avoids having to calculate it on the spot, which saves a number of possible cache-misses. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/fair.c | 47 +++++++++++++++++++++-------------------------- 2 files changed, 22 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e3d556427b2e..555e27d717c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1078,6 +1078,7 @@ struct sched_entity { #endif #ifdef CONFIG_FAIR_GROUP_SCHED + int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04fea7744a9f..748a7ac3388f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? 
*/ -static inline int +static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { if (se->cfs_rq == pse->cfs_rq) - return 1; + return se->cfs_rq; - return 0; + return NULL; } static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - static void find_matching_se(struct sched_entity **se, struct sched_entity **pse) { @@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) */ /* First walk up until both entities are at same depth */ - se_depth = depth_se(*se); - pse_depth = depth_se(*pse); + se_depth = (*se)->depth; + pse_depth = (*pse)->depth; while (se_depth > pse_depth) { se_depth--; @@ -426,10 +415,10 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int +static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { - return 1; + return cfs_rq_of(se); /* always the same rq */ } static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -7262,7 +7251,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; + /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -7288,23 +7279,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) + if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) on_rq = 1; if (!on_rq) - p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; if (!on_rq) { - cfs_rq = cfs_rq_of(&p->se); - p->se.vruntime += cfs_rq->min_vruntime; + cfs_rq = cfs_rq_of(se); + se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP /* * migrate_task_rq_fair() will have removed our previous * contribution, but we must synchronize for ongoing future * decay. */ - p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; #endif } } @@ -7400,10 +7392,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, if (!se) return; - if (!parent) + if (!parent) { se->cfs_rq = &rq->cfs; - else + se->depth = 0; + } else { se->cfs_rq = parent->my_q; + se->depth = parent->depth + 1; + } se->my_q = cfs_rq; /* guarantee group entities always have weight */ -- cgit v1.2.3 From d47d5c8194579bce1d62f88e26fea91d7c553e42 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:51:58 +0100 Subject: asmlinkage: Make __iowrite32_copy visible This is a assembler function on x86, so it should be visible. 
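The underlying rule, sketched with made-up names (only the __visible annotation itself is real): a symbol that is defined in, or called from, assembly is outside the compiler's whole-program view, so under LTO it must be marked externally visible or it may be dropped or localized.

#include <linux/compiler.h>

/* Prototype for a routine implemented in a .S file (hypothetical name). */
__visible void my_asm_copy_helper(void *to, const void *from, unsigned long count);

/* C function that assembly code calls directly (hypothetical name). */
__visible void my_asm_entry(unsigned long arg)
{
	/* ... real work would go here ... */
}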
Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-2-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- include/linux/io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index f4f42faec686..8a18e75600cc 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -24,7 +24,7 @@ struct device; -void __iowrite32_copy(void __iomem *to, const void *from, size_t count); +__visible void __iowrite32_copy(void __iomem *to, const void *from, size_t count); void __iowrite64_copy(void __iomem *to, const void *from, size_t count); #ifdef CONFIG_MMU -- cgit v1.2.3 From 63f9a7fde715352e0769302527670542a664b981 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:01 +0100 Subject: asmlinkage: Make lockdep_sys_exit asmlinkage lockdep_sys_exit can be called from assembler code, so make it asmlinkage. Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-5-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- include/linux/lockdep.h | 2 +- kernel/locking/lockdep.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 92b1bfc5da60..7df9aa6902c0 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -265,7 +265,7 @@ extern void lockdep_info(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); -extern void lockdep_sys_exit(void); +extern asmlinkage void lockdep_sys_exit(void); extern void lockdep_off(void); extern void lockdep_on(void); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index eb8a54783fa0..c8b6753c5bb1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4191,7 +4191,7 @@ void debug_show_held_locks(struct task_struct *task) } EXPORT_SYMBOL_GPL(debug_show_held_locks); -void lockdep_sys_exit(void) +asmlinkage void lockdep_sys_exit(void) { struct task_struct *curr = current; -- cgit v1.2.3 From 128ea04a9885af9629059e631ddf0cab4815b589 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 09:01:07 +0100 Subject: lto: Make asmlinkage __visible Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391846481-31491-3-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- include/linux/linkage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index a6a42dd02466..34a513a2727b 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -12,9 +12,9 @@ #endif #ifdef __cplusplus -#define CPP_ASMLINKAGE extern "C" +#define CPP_ASMLINKAGE extern "C" __visible #else -#define CPP_ASMLINKAGE +#define CPP_ASMLINKAGE __visible #endif #ifndef asmlinkage -- cgit v1.2.3 From ef1b893c29d0dba778f67ad97b554b37f9108dcc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 09:01:08 +0100 Subject: lto, workaround: Add workaround for initcall reordering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Work around a LTO gcc problem: when there is no reference to a variable in a module it will be moved to the end of the program. This causes reordering of initcalls which the kernel does not like. Add a dummy reference function to avoid this. 
The function is deleted by the linker. This replaces a previous much slower workaround. Thanks to Jan "Honza" Hubička for suggesting this technique. Suggested-by: Jan Hubička Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391846481-31491-4-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- include/linux/init.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index e1688802964f..a3ba27076342 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -163,6 +163,23 @@ extern bool initcall_debug; #ifndef __ASSEMBLY__ +#ifdef CONFIG_LTO +/* Work around a LTO gcc problem: when there is no reference to a variable + * in a module it will be moved to the end of the program. This causes + * reordering of initcalls which the kernel does not like. + * Add a dummy reference function to avoid this. The function is + * deleted by the linker. + */ +#define LTO_REFERENCE_INITCALL(x) \ + ; /* yes this is needed */ \ + static __used __exit void *reference_##x(void) \ + { \ + return &x; \ + } +#else +#define LTO_REFERENCE_INITCALL(x) +#endif + /* initcalls are now grouped by functionality into separate * subsections. Ordering inside the subsections is determined * by link order. @@ -175,7 +192,8 @@ extern bool initcall_debug; #define __define_initcall(fn, id) \ static initcall_t __initcall_##fn##id __used \ - __attribute__((__section__(".initcall" #id ".init"))) = fn + __attribute__((__section__(".initcall" #id ".init"))) = fn; \ + LTO_REFERENCE_INITCALL(__initcall_##fn##id) /* * Early initcalls run before initializing SMP. -- cgit v1.2.3 From 478b360a47b71f3b5030eacd3aae6acb1a32c2b6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 15 Feb 2014 23:48:45 +0100 Subject: netfilter: nf_tables: fix nf_trace always-on with XT_TRACE=n When using nftables with CONFIG_NETFILTER_XT_TARGET_TRACE=n, we get lots of "TRACE: filter:output:policy:1 IN=..." warnings as several places will leave skb->nf_trace uninitialised. Unlike iptables tracing functionality is not conditional in nftables, so always copy/zero nf_trace setting when nftables is enabled. Move this into __nf_copy() helper. 
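Sketch of the failure mode and the centralized fix (nf_trace_copy_example() is a made-up name; the guard and the field come from the patch): when a copied or forwarded skb does not get nf_trace carried over or zeroed, nftables reads an uninitialised value and logs "TRACE: ..." lines even though no rule asked for tracing.

#include <linux/skbuff.h>

static inline void nf_trace_copy_example(struct sk_buff *dst,
					 const struct sk_buff *src)
{
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
	/* After this patch, done once in __nf_copy() instead of per call site. */
	dst->nf_trace = src->nf_trace;
#endif
}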
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 5 ++++- net/core/skbuff.c | 3 --- net/ipv4/ip_output.c | 3 --- net/ipv6/ip6_output.c | 3 --- 4 files changed, 4 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f589c9af8cbf..d40d40b2915b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2725,7 +2725,7 @@ static inline void nf_reset(struct sk_buff *skb) static inline void nf_reset_trace(struct sk_buff *skb) { -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) skb->nf_trace = 0; #endif } @@ -2742,6 +2742,9 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) dst->nf_bridge = src->nf_bridge; nf_bridge_get(src->nf_bridge); #endif +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) + dst->nf_trace = src->nf_trace; +#endif } static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5976ef0846bd..5d6236d9fdce 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -707,9 +707,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->mark = old->mark; new->skb_iif = old->skb_iif; __nf_copy(new, old); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) - new->nf_trace = old->nf_trace; -#endif #ifdef CONFIG_NET_SCHED new->tc_index = old->tc_index; #ifdef CONFIG_NET_CLS_ACT diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8971780aec7c..73c6b63bba74 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -422,9 +422,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) - to->nf_trace = from->nf_trace; -#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) to->ipvs_property = from->ipvs_property; #endif diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ef02b26ccf81..4cfbe0f3793a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -517,9 +517,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) - to->nf_trace = from->nf_trace; -#endif skb_copy_secmark(to, from); } -- cgit v1.2.3 From 87de1cfdc55b16b794e245b07322340725149d62 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Dec 2013 10:02:52 -0800 Subject: rcu: Stop tracking FSF's postal address All of the RCU source files have the usual GPL header, which contains a long-obsolete postal address for FSF. To avoid the need to track the FSF office's movements, this commit substitutes the URL where GPL may be found. Reported-by: Greg KH Reported-by: Steven Rostedt Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 4 ++-- include/linux/rcutiny.h | 4 ++-- include/linux/rcutree.h | 4 ++-- include/linux/srcu.h | 4 ++-- kernel/rcu/rcu.h | 4 ++-- kernel/rcu/srcu.c | 4 ++-- kernel/rcu/tiny.c | 4 ++-- kernel/rcu/tiny_plugin.h | 4 ++-- kernel/rcu/torture.c | 4 ++-- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree.h | 4 ++-- kernel/rcu/tree_plugin.h | 4 ++-- kernel/rcu/tree_trace.c | 4 ++-- kernel/rcu/update.c | 4 ++-- 14 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 72bf3a01a4ee..b200756ea4c0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2001 * diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6f01771b571c..c364e9148de2 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 72137ee8c603..08b084068967 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 9b058eecd403..a2783cb5d275 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 79c3877e9c5b..1bd787fddcb2 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
* * Copyright IBM Corporation, 2011 * diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 3318d8284384..5db7e9272d37 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 1254f312d024..53b95bbf4abb 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 280d06cae352..431528520562 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -14,8 +14,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (c) 2010 Linaro * diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c index 732f8ae3086a..ab7dd192a50b 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/torture.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2005, 2006 * diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e64157798624..321feef0f5c0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8c19873f1ac9..75dc3c39a02a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -13,8 +13,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
* * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6e2ef4b2b920..f9b9cdd36c8d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -14,8 +14,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright Red Hat, 2009 * Copyright IBM Corporation, 2009 diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index d1f1e64a6d72..5cdc62e1beeb 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c54609faf233..fd0d5b5b8e7c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2001 * -- cgit v1.2.3 From 41f4abd92a34f9c5110bbb870c04f8854604e28d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 5 Dec 2013 15:10:23 -0800 Subject: rcu: Glue ASCII strings together Split strings make it difficult to find the code that resulted in a given console message, so this commit glues split strings back together despite the resulting long lines. Signed-off-by: Joe Perches Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b200756ea4c0..d946d3660633 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -479,11 +479,9 @@ static inline void rcu_preempt_sleep_check(void) do { \ rcu_preempt_sleep_check(); \ rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), \ - "Illegal context switch in RCU-bh" \ - " read-side critical section"); \ + "Illegal context switch in RCU-bh read-side critical section"); \ rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), \ - "Illegal context switch in RCU-sched"\ - " read-side critical section"); \ + "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) #else /* #ifdef CONFIG_PROVE_RCU */ @@ -518,16 +516,14 @@ static inline void rcu_preempt_sleep_check(void) #define __rcu_dereference_check(p, c, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ - rcu_lockdep_assert(c, "suspicious rcu_dereference_check()" \ - " usage"); \ + rcu_lockdep_assert(c, "suspicious rcu_dereference_check() usage"); \ rcu_dereference_sparse(p, space); \ smp_read_barrier_depends(); \ ((typeof(*p) __force __kernel *)(_________p1)); \ }) #define __rcu_dereference_protected(p, c, space) \ ({ \ - rcu_lockdep_assert(c, "suspicious rcu_dereference_protected()" \ - " usage"); \ + rcu_lockdep_assert(c, "suspicious rcu_dereference_protected() usage"); \ rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) @@ -542,8 +538,7 @@ static inline void rcu_preempt_sleep_check(void) ({ \ typeof(p) _________p1 = ACCESS_ONCE(p); \ rcu_lockdep_assert(c, \ - "suspicious rcu_dereference_index_check()" \ - " usage"); \ + "suspicious rcu_dereference_index_check() usage"); \ smp_read_barrier_depends(); \ (_________p1); \ }) -- cgit v1.2.3 From 0adab9b9aa18d7e90337d43567f1eec3d5401b81 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 5 Dec 2013 16:19:15 -0800 Subject: rcu: Indentation and spacing fixes. This commit outdents expression-statement macros, thus repairing a few line-length complaints. Also fix some spacing errors called out by checkpatch.pl. Signed-off-by: Joe Perches Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rculist.h | 17 +++++++------- include/linux/rcupdate.h | 58 ++++++++++++++++++++++++------------------------ 2 files changed, 38 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index dbaf99084112..8183b46fbaa2 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -247,9 +247,10 @@ static inline void list_splice_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - ({typeof (*ptr) __rcu *__ptr = (typeof (*ptr) __rcu __force *)ptr; \ - container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \ - }) +({ \ + typeof(*ptr) __rcu *__ptr = (typeof(*ptr) __rcu __force *)ptr; \ + container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \ +}) /** * Where are list_empty_rcu() and list_first_entry_rcu()? @@ -285,11 +286,11 @@ static inline void list_splice_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). 
*/ #define list_first_or_null_rcu(ptr, type, member) \ - ({struct list_head *__ptr = (ptr); \ - struct list_head *__next = ACCESS_ONCE(__ptr->next); \ - likely(__ptr != __next) ? \ - list_entry_rcu(__next, type, member) : NULL; \ - }) +({ \ + struct list_head *__ptr = (ptr); \ + struct list_head *__next = ACCESS_ONCE(__ptr->next); \ + likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \ +}) /** * list_for_each_entry_rcu - iterate over rcu list of given type diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d946d3660633..278a9da69ec4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -508,40 +508,40 @@ static inline void rcu_preempt_sleep_check(void) #endif /* #else #ifdef __CHECKER__ */ #define __rcu_access_pointer(p, space) \ - ({ \ - typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ - rcu_dereference_sparse(p, space); \ - ((typeof(*p) __force __kernel *)(_________p1)); \ - }) +({ \ + typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \ + rcu_dereference_sparse(p, space); \ + ((typeof(*p) __force __kernel *)(_________p1)); \ +}) #define __rcu_dereference_check(p, c, space) \ - ({ \ - typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ - rcu_lockdep_assert(c, "suspicious rcu_dereference_check() usage"); \ - rcu_dereference_sparse(p, space); \ - smp_read_barrier_depends(); \ - ((typeof(*p) __force __kernel *)(_________p1)); \ - }) +({ \ + typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \ + rcu_lockdep_assert(c, "suspicious rcu_dereference_check() usage"); \ + rcu_dereference_sparse(p, space); \ + smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ + ((typeof(*p) __force __kernel *)(_________p1)); \ +}) #define __rcu_dereference_protected(p, c, space) \ - ({ \ - rcu_lockdep_assert(c, "suspicious rcu_dereference_protected() usage"); \ - rcu_dereference_sparse(p, space); \ - ((typeof(*p) __force __kernel *)(p)); \ - }) +({ \ + rcu_lockdep_assert(c, "suspicious rcu_dereference_protected() usage"); \ + rcu_dereference_sparse(p, space); \ + ((typeof(*p) __force __kernel *)(p)); \ +}) #define __rcu_access_index(p, space) \ - ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ - rcu_dereference_sparse(p, space); \ - (_________p1); \ - }) +({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + rcu_dereference_sparse(p, space); \ + (_________p1); \ +}) #define __rcu_dereference_index_check(p, c) \ - ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ - rcu_lockdep_assert(c, \ - "suspicious rcu_dereference_index_check() usage"); \ - smp_read_barrier_depends(); \ - (_________p1); \ - }) +({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + rcu_lockdep_assert(c, \ + "suspicious rcu_dereference_index_check() usage"); \ + smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ + (_________p1); \ +}) /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable -- cgit v1.2.3 From 88c1863066ccfa456797e12c5d8b4631aa1ad0d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2013 13:24:32 -0800 Subject: rcu: Define rcu_assign_pointer() in terms of smp_store_release() The new smp_store_release() function provides better guarantees than did rcu_assign_pointer(), and potentially less overhead on some architectures. The guarantee that smp_store_release() provides that rcu_assign_pointer() does that is obscure, but its lack could cause considerable confusion. 
This guarantee is illustrated by the following code fragment: struct foo { int a; int b; int c; struct foo *next; }; struct foo foo1; struct foo foo2; struct foo __rcu *foop; ... foo2.a = 1; foo2.b = 2; BUG_ON(foo2.c); rcu_assign_pointer(foop, &foo); ... fp = rcu_dereference(foop); fp.c = 3; The current rcu_assign_pointer() semantics permit the BUG_ON() to trigger because rcu_assign_pointer()'s smp_wmb() is not guaranteed to order prior reads against later writes. This commit therefore upgrades rcu_assign_pointer() from smp_wmb() to smp_store_release() to avoid this counter-intuitive outcome. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 278a9da69ec4..32decf1a9c6c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -44,6 +44,7 @@ #include #include #include +#include #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; /* for sysctl */ @@ -580,12 +581,7 @@ static inline void rcu_preempt_sleep_check(void) * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ -#define rcu_assign_pointer(p, v) \ - do { \ - smp_wmb(); \ - ACCESS_ONCE(p) = RCU_INITIALIZER(v); \ - } while (0) - +#define rcu_assign_pointer(p, v) smp_store_release(&p, RCU_INITIALIZER(v)) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing -- cgit v1.2.3 From 2f33b512a5460578f6cf11d7b7867bed53157c7c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Nov 2013 18:25:48 -0800 Subject: rcu: Optimize rcu_is_nocb_cpu() for RCU_NOCB_CPU_ALL If CONFIG_RCU_NOCB_CPU_ALL=y, then rcu_is_nocb_cpu() will always return true, however, the current version nevertheless checks rcu_nocb_mask. This commit therefore creates a static inline implementation of rcu_is_nocb_cpu() that unconditionally returns true when CONFIG_RCU_NOCB_CPU_ALL=y. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 6 ++++-- kernel/rcu/tree_plugin.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 72bf3a01a4ee..281c90f8989e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1015,11 +1015,13 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu(ptr, rcu_head) \ __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) -#ifdef CONFIG_RCU_NOCB_CPU +#if defined(CONFIG_RCU_NOCB_CPU_ALL) +static inline bool rcu_is_nocb_cpu(int cpu) { return true; } +#elif defined(CONFIG_RCU_NOCB_CPU) bool rcu_is_nocb_cpu(int cpu); #else static inline bool rcu_is_nocb_cpu(int cpu) { return false; } -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +#endif /* Only for use by adaptive-ticks code. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6e2ef4b2b920..39a50b918bff 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2101,6 +2101,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_waitqueue_head(&rnp->nocb_gp_wq[1]); } +#ifndef CONFIG_RCU_NOCB_CPU_ALL /* Is the specified CPU a no-CPUs CPU? 
*/ bool rcu_is_nocb_cpu(int cpu) { @@ -2108,6 +2109,7 @@ bool rcu_is_nocb_cpu(int cpu) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Enqueue the specified string of rcu_head structures onto the specified -- cgit v1.2.3 From ffa83fb565fbc397cbafb4b71fd1cce276d4c3b6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Nov 2013 19:27:16 -0800 Subject: rcu: Optimize rcu_needs_cpu() for RCU_NOCB_CPU_ALL If CONFIG_RCU_NOCB_CPU_ALL=y, then rcu_needs_cpu() will always return false, however, the current version nevertheless checks for RCU callbacks. This commit therefore creates a static inline implementation of rcu_needs_cpu() that unconditionally returns false when CONFIG_RCU_NOCB_CPU_ALL=y. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 8 ++++++++ include/linux/rcutiny.h | 6 ------ include/linux/rcutree.h | 2 ++ kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 4 ++++ 5 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 281c90f8989e..5f7d5f410d50 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1015,6 +1015,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu(ptr, rcu_head) \ __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) +#if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) +static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +{ + *delta_jiffies = ULONG_MAX; + return 0; +} +#endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */ + #if defined(CONFIG_RCU_NOCB_CPU_ALL) static inline bool rcu_is_nocb_cpu(int cpu) { return true; } #elif defined(CONFIG_RCU_NOCB_CPU) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6f01771b571c..9524903487d0 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -68,12 +68,6 @@ static inline void kfree_call_rcu(struct rcu_head *head, call_rcu(head, func); } -static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) -{ - *delta_jiffies = ULONG_MAX; - return 0; -} - static inline void rcu_note_context_switch(int cpu) { rcu_sched_qs(cpu); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 72137ee8c603..81198c84e268 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -31,7 +31,9 @@ #define __LINUX_RCUTREE_H void rcu_note_context_switch(int cpu); +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies); +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ void rcu_cpu_stall_reset(void); /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3d116cd072d..c2c8234a0291 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2880,7 +2880,7 @@ static int rcu_pending(int cpu) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) +static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) { bool al = true; bool hc = false; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 39a50b918bff..820b06aefbee 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1586,11 +1586,13 @@ static void rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. 
*/ +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu, NULL); } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up @@ -1696,6 +1698,7 @@ static bool rcu_try_advance_all_cbs(void) * * The caller must have disabled interrupts. */ +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *dj) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); @@ -1726,6 +1729,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) } return 0; } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Prepare a CPU for idle from an RCU perspective. The first major task -- cgit v1.2.3 From 45a22f4c11fef4ecd5c61c0a299cd3f23d77be8e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Feb 2014 13:09:50 +0100 Subject: inotify: Fix reporting of cookies for inotify events My rework of handling of notification events (namely commit 7053aee26a35 "fsnotify: do not share events between notification groups") broke sending of cookies with inotify events. We didn't propagate the value passed to fsnotify() properly and passed 4 uninitialized bytes to userspace instead (so it is also an information leak). Sadly I didn't notice this during my testing because inotify cookies aren't used very much and LTP inotify tests ignore them. Fix the problem by passing the cookie value properly. Fixes: 7053aee26a3548ebaba046ae2e52396ccf56ac6c Reported-by: Vegard Nossum Signed-off-by: Jan Kara --- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fanotify/fanotify.c | 2 +- fs/notify/fsnotify.c | 2 +- fs/notify/inotify/inotify.h | 2 +- fs/notify/inotify/inotify_fsnotify.c | 3 ++- fs/notify/inotify/inotify_user.c | 2 +- include/linux/fsnotify_backend.h | 2 +- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 2 +- 9 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 0b9ff4395e6a..abc8cbcfe90e 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -86,7 +86,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 0e792f5e3147..205dc2163822 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -147,7 +147,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { int ret = 0; struct fanotify_event_info *event; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 1d4e1ea2f37c..9d3e9c50066a 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -179,7 +179,7 @@ static int send_to_group(struct inode *to_tell, return group->ops->handle_event(group, to_tell, inode_mark, vfsmount_mark, mask, data, data_is, - file_name); + file_name, cookie); } /* diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 485eef3f4407..ed855ef6f077 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -27,6 +27,6 @@ extern int inotify_handle_event(struct 
fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name); + const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index d5ee56348bb8..43ab1e1a07a2 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -67,7 +67,7 @@ int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; @@ -103,6 +103,7 @@ int inotify_handle_event(struct fsnotify_group *group, fsn_event = &event->fse; fsnotify_init_event(fsn_event, inode, mask); event->wd = i_mark->wd; + event->sync_cookie = cookie; event->name_len = len; if (len) strcpy(event->name, file_name); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 497395c8274b..6528b5a54ca0 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -495,7 +495,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* Queue ignore event for the watch */ inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, - NULL, FSNOTIFY_EVENT_NONE, NULL); + NULL, FSNOTIFY_EVENT_NONE, NULL, 0); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 3d286ff49ab0..c84bc7c2bfc8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -99,7 +99,7 @@ struct fsnotify_ops { struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name); + const unsigned char *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_event *event); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 67ccf0e7cca9..135944a7b28a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -916,7 +916,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { return 0; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 2596fac5dcb4..70b4554d2fbe 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -471,7 +471,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *dname) + const unsigned char *dname, u32 cookie) { struct inode *inode; struct audit_parent *parent; -- cgit v1.2.3 From 90d88bd75424dff51e2072fd2f8fa85ee893aa17 Mon Sep 17 00:00:00 2001 From: Tan Xiaojun Date: Sat, 15 Feb 2014 13:19:51 +0800 Subject: workqueue: Remove deprecated __cancel_delayed_work() __cancel_delayed_work() was deprecated by 136b5721d75a ("workqueue: deprecate __cancel_delayed_work()") as cancel_delayed_work() was updated so that it could be used from all 
contexts. Enough time has passed since the deprecation. Let's remove it. tj: description update Signed-off-by: Tan Xiaojun Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 594521ba0d43..edc941049d79 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -605,21 +605,6 @@ static inline bool keventd_up(void) return system_wq != NULL; } -/* - * Like above, but uses del_timer() instead of del_timer_sync(). This means, - * if it returns 0 the timer function may be running and the queueing is in - * progress. - */ -static inline bool __deprecated __cancel_delayed_work(struct delayed_work *work) -{ - bool ret; - - ret = del_timer(&work->timer); - if (ret) - work_clear_pending(&work->work); - return ret; -} - /* used to be different but now identical to flush_work(), deprecated */ static inline bool __deprecated flush_work_sync(struct work_struct *work) { -- cgit v1.2.3 From 18258f7239a61d8929b8e0c7b6d46c446459074c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 15 Feb 2014 00:55:18 +0000 Subject: genirq: Provide synchronize_hardirq() synchronize_irq() waits for hard irq and threaded handlers to complete before returning. For some special cases we only need to make sure that the hard interrupt part of the irq line is not in progress when we disabled the - possibly shared - interrupt at the device level. A proper use case for this was provided by Russell. The sdhci driver requires some irq triggered functions to be run in thread context. The current implementation of the thread context is a sdio private kthread construct, which has quite some shortcomings. These can be avoided when the thread is directly associated to the device interrupt via the generic threaded irq infrastructure. Though there is a corner case related to run time power management where one side disables the device interrupts at the device level and needs to make sure, that an already running hard interrupt handler has completed before proceeding further. Though that hard interrupt handler might wake the associated thread, which in turn can request the runtime PM to reenable the device. Using synchronize_irq() leads to an immediate deadlock of the irq thread waiting for the PM lock and the synchronize_irq() waiting for the irq thread to complete. Due to the fact that it is sufficient for this case to ensure that no hard irq handler is executing a new function which avoids the check for the thread is required. Add a function, which just monitors the hard irq parts and ignores the threaded handlers. 
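A minimal usage sketch of the distinction described above (hypothetical driver; struct my_host and MY_INT_ENABLE are made up): the runtime-suspend path masks interrupts at the device level and then waits only for the hard handler, because waiting for the threaded handler via synchronize_irq() could deadlock against that thread blocked on the PM lock.

#include <linux/interrupt.h>
#include <linux/io.h>

#define MY_INT_ENABLE   0x34            /* made-up register offset */

struct my_host {                        /* made-up host structure */
        void __iomem    *ioaddr;
        unsigned int    irq;
};

static int my_host_runtime_suspend(struct my_host *host)
{
        writel(0, host->ioaddr + MY_INT_ENABLE);        /* device-level mask */
        synchronize_hardirq(host->irq);                 /* hard handler has finished */
        return 0;                       /* the irq thread may still run later */
}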
Signed-off-by: Thomas Gleixner Tested-by: Russell King Cc: Chris Ball Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140215003823.653236081@linutronix.de --- include/linux/hardirq.h | 1 + kernel/irq/manage.c | 70 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 51 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 12d5f972f23f..cba442ec3c66 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -9,6 +9,7 @@ extern void synchronize_irq(unsigned int irq); +extern void synchronize_hardirq(unsigned int irq); #if defined(CONFIG_TINY_RCU) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c43b17..274ba9238fb7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -32,24 +32,10 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif -/** - * synchronize_irq - wait for pending IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for - * - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. - */ -void synchronize_irq(unsigned int irq) +static void __synchronize_hardirq(struct irq_desc *desc) { - struct irq_desc *desc = irq_to_desc(irq); bool inprogress; - if (!desc) - return; - do { unsigned long flags; @@ -67,12 +53,56 @@ void synchronize_irq(unsigned int irq) /* Oops, that failed? */ } while (inprogress); +} - /* - * We made sure that no hardirq handler is running. Now verify - * that no threaded handlers are active. - */ - wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); +/** + * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for + * + * This function waits for any pending hard IRQ handlers for this + * interrupt to complete before returning. If you use this + * function while holding a resource the IRQ handler may need you + * will deadlock. It does not take associated threaded handlers + * into account. + * + * Do not use this for shutdown scenarios where you must be sure + * that all parts (hardirq and threaded handler) have completed. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_hardirq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) + __synchronize_hardirq(desc); +} +EXPORT_SYMBOL(synchronize_hardirq); + +/** + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for + * + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + __synchronize_hardirq(desc); + /* + * We made sure that no hardirq handler is + * running. Now verify that no threaded handlers are + * active. 
+ */ + wait_event(desc->wait_for_threads, + !atomic_read(&desc->threads_active)); + } } EXPORT_SYMBOL(synchronize_irq); -- cgit v1.2.3 From a92444c6b2225a9115d661c950cb48a22aeace20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 15 Feb 2014 00:55:19 +0000 Subject: genirq: Provide irq_wake_thread() In course of the sdhci/sdio discussion with Russell about killing the sdio kthread hackery we discovered the need to be able to wake an interrupt thread from software. The rationale for this is, that sdio hardware can lack proper interrupt support for certain features. So the driver needs to poll the status registers, but at the same time it needs to be woken up by an hardware interrupt. To be able to get rid of the home brewn kthread construct of sdio we need a way to wake an irq thread independent of an actual hardware interrupt. Provide an irq_wake_thread() function which wakes up the thread which is associated to a given dev_id. This allows sdio to invoke the irq thread from the hardware irq handler via the IRQ_WAKE_THREAD return value and provides a possibility to wake it via a timer for the polling scenarios. That allows to simplify the sdio logic significantly. Signed-off-by: Thomas Gleixner Cc: Russell King Cc: Chris Ball Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140215003823.772565780@linutronix.de --- include/linux/interrupt.h | 1 + kernel/irq/handle.c | 4 ++-- kernel/irq/internals.h | 1 + kernel/irq/manage.c | 27 +++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a2678d35b5a2..c7bfac1c4a7b 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -188,6 +188,7 @@ extern void disable_irq(unsigned int irq); extern void disable_percpu_irq(unsigned int irq); extern void enable_irq(unsigned int irq); extern void enable_percpu_irq(unsigned int irq, unsigned int type); +extern void irq_wake_thread(unsigned int irq, void *dev_id); /* The following three functions are for the core kernel use only. 
*/ extern void suspend_device_irqs(void); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 131ca176b497..bfec453557b4 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,7 +51,7 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) { /* * In case the thread crashed and was killed we just pretend that @@ -157,7 +157,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) break; } - irq_wake_thread(desc, action); + __irq_wake_thread(desc, action); /* Fall through to add to randomness */ case IRQ_HANDLED: diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 001fa5bab490..d61ac29e32d0 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -82,6 +82,7 @@ irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); bool irq_wait_for_poll(struct irq_desc *desc); +void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 274ba9238fb7..54eb5c99351b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -911,6 +911,33 @@ static int irq_thread(void *data) return 0; } +/** + * irq_wake_thread - wake the irq thread for the action identified by dev_id + * @irq: Interrupt line + * @dev_id: Device identity for which the thread should be woken + * + */ +void irq_wake_thread(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + unsigned long flags; + + if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) + return; + + raw_spin_lock_irqsave(&desc->lock, flags); + for (action = desc->action; action; action = action->next) { + if (action->dev_id == dev_id) { + if (action->thread) + __irq_wake_thread(desc, action); + break; + } + } + raw_spin_unlock_irqrestore(&desc->lock, flags); +} +EXPORT_SYMBOL_GPL(irq_wake_thread); + static void irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) -- cgit v1.2.3 From 994c41ee0ac875797b4dfef509ac7753e2649b4d Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Thu, 30 Jan 2014 13:17:20 +0200 Subject: ARM: OMAP2+: clock: fix clkoutx2 with CLK_SET_RATE_PARENT If CLK_SET_RATE_PARENT is set for a clkoutx2 clock, calling clk_set_rate() on the clock "skips" the x2 multiplier as there are no set_rate and round_rate functions defined for the clkoutx2. This results in getting double the requested clock rates, breaking the display on omap3430 based devices. This got broken when d0f58bd3bba3877fb1af4664c4e33273d36f00e4 and related patches were merged for v3.14, as omapdss driver now relies more on the clk-framework and CLK_SET_RATE_PARENT. This patch implements set_rate and round_rate for clkoutx2. Tested on OMAP3430, OMAP3630, OMAP4460. 
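An illustration of the consumer-visible effect (hypothetical helper, not part of the patch): with round_rate/set_rate wired up and CLK_SET_RATE_PARENT set, asking the x2 output for a rate rounds the parent DPLL at half that rate, so the caller gets what it asked for instead of double it.

#include <linux/clk.h>
#include <linux/errno.h>

/* Hypothetical display-driver helper; set_pixel_clock() is illustrative only. */
static int set_pixel_clock(struct clk *clkoutx2, unsigned long rate)
{
        long r = clk_round_rate(clkoutx2, rate);        /* parent DPLL rounded at rate / 2 */

        if (r <= 0)
                return -EINVAL;
        return clk_set_rate(clkoutx2, r);               /* no longer 2x the request */
}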
Signed-off-by: Tomi Valkeinen Acked-by: Tero Kristo Signed-off-by: Paul Walmsley --- arch/arm/mach-omap2/cclock3xxx_data.c | 2 + arch/arm/mach-omap2/dpll3xxx.c | 92 +++++++++++++++++++++++++++++------ include/linux/clk/ti.h | 4 ++ 3 files changed, 83 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/cclock3xxx_data.c b/arch/arm/mach-omap2/cclock3xxx_data.c index 3b05aea56d1f..11ed9152e665 100644 --- a/arch/arm/mach-omap2/cclock3xxx_data.c +++ b/arch/arm/mach-omap2/cclock3xxx_data.c @@ -433,7 +433,9 @@ static const struct clk_ops dpll4_m5x2_ck_ops = { .enable = &omap2_dflt_clk_enable, .disable = &omap2_dflt_clk_disable, .is_enabled = &omap2_dflt_clk_is_enabled, + .set_rate = &omap3_clkoutx2_set_rate, .recalc_rate = &omap3_clkoutx2_recalc, + .round_rate = &omap3_clkoutx2_round_rate, }; static const struct clk_ops dpll4_m5x2_ck_3630_ops = { diff --git a/arch/arm/mach-omap2/dpll3xxx.c b/arch/arm/mach-omap2/dpll3xxx.c index 3185ced807c9..3c418ea54bbe 100644 --- a/arch/arm/mach-omap2/dpll3xxx.c +++ b/arch/arm/mach-omap2/dpll3xxx.c @@ -623,6 +623,32 @@ void omap3_dpll_deny_idle(struct clk_hw_omap *clk) /* Clock control for DPLL outputs */ +/* Find the parent DPLL for the given clkoutx2 clock */ +static struct clk_hw_omap *omap3_find_clkoutx2_dpll(struct clk_hw *hw) +{ + struct clk_hw_omap *pclk = NULL; + struct clk *parent; + + /* Walk up the parents of clk, looking for a DPLL */ + do { + do { + parent = __clk_get_parent(hw->clk); + hw = __clk_get_hw(parent); + } while (hw && (__clk_get_flags(hw->clk) & CLK_IS_BASIC)); + if (!hw) + break; + pclk = to_clk_hw_omap(hw); + } while (pclk && !pclk->dpll_data); + + /* clk does not have a DPLL as a parent? error in the clock data */ + if (!pclk) { + WARN_ON(1); + return NULL; + } + + return pclk; +} + /** * omap3_clkoutx2_recalc - recalculate DPLL X2 output virtual clock rate * @clk: DPLL output struct clk @@ -637,27 +663,14 @@ unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw, unsigned long rate; u32 v; struct clk_hw_omap *pclk = NULL; - struct clk *parent; if (!parent_rate) return 0; - /* Walk up the parents of clk, looking for a DPLL */ - do { - do { - parent = __clk_get_parent(hw->clk); - hw = __clk_get_hw(parent); - } while (hw && (__clk_get_flags(hw->clk) & CLK_IS_BASIC)); - if (!hw) - break; - pclk = to_clk_hw_omap(hw); - } while (pclk && !pclk->dpll_data); + pclk = omap3_find_clkoutx2_dpll(hw); - /* clk does not have a DPLL as a parent? 
error in the clock data */ - if (!pclk) { - WARN_ON(1); + if (!pclk) return 0; - } dd = pclk->dpll_data; @@ -672,6 +685,55 @@ unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw, return rate; } +int omap3_clkoutx2_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + return 0; +} + +long omap3_clkoutx2_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *prate) +{ + const struct dpll_data *dd; + u32 v; + struct clk_hw_omap *pclk = NULL; + + if (!*prate) + return 0; + + pclk = omap3_find_clkoutx2_dpll(hw); + + if (!pclk) + return 0; + + dd = pclk->dpll_data; + + /* TYPE J does not have a clkoutx2 */ + if (dd->flags & DPLL_J_TYPE) { + *prate = __clk_round_rate(__clk_get_parent(pclk->hw.clk), rate); + return *prate; + } + + WARN_ON(!dd->enable_mask); + + v = omap2_clk_readl(pclk, dd->control_reg) & dd->enable_mask; + v >>= __ffs(dd->enable_mask); + + /* If in bypass, the rate is fixed to the bypass rate*/ + if (v != OMAP3XXX_EN_DPLL_LOCKED) + return *prate; + + if (__clk_get_flags(hw->clk) & CLK_SET_RATE_PARENT) { + unsigned long best_parent; + + best_parent = (rate / 2); + *prate = __clk_round_rate(__clk_get_parent(hw->clk), + best_parent); + } + + return *prate * 2; +} + /* OMAP3/4 non-CORE DPLL clkops */ const struct clk_hw_omap_ops clkhwops_omap3_dpll = { .allow_idle = omap3_dpll_allow_idle, diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 092b64168d7f..4a21a872dbbd 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -245,6 +245,10 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, void omap2_init_clk_clkdm(struct clk_hw *clk); unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw, unsigned long parent_rate); +int omap3_clkoutx2_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate); +long omap3_clkoutx2_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *prate); int omap2_clkops_enable_clkdm(struct clk_hw *hw); void omap2_clkops_disable_clkdm(struct clk_hw *hw); int omap2_clk_disable_autoidle_all(void); -- cgit v1.2.3 From 634391ace193d00c59a691e9fc227b0f8942bad7 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 13 Feb 2014 13:53:33 +0100 Subject: mm: mask bits from pmd in pmd_lockptr/pmd_huge_pte The pmd pointer passed to pmd_lockptr/pmd_huge_pte can point to any entry in a pmd table. With USE_SPLIT_PMD_PTLOCKS==1 the code uses virt_to_page to get a struct page for the pmd table. The virt_to_page function automatically masks the lower PAGE_SHIFT bits from the address. But if the size of a pmd table is larger than PAGE_SIZE the additional bits are not removed from the pmd address and the wrong page struct is used. Fix this by explicitely masking the offset in the pmd table from the pmd pointer. 
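A small worked example of the masking (illustrative numbers only; assumes 8-byte pmd entries and 2048 entries per table, i.e. a 16KB table spanning four 4K pages, as on configurations like s390):

/* Not kernel code, just the arithmetic: why the low bits of the pmd pointer
 * must be masked off before virt_to_page() when the table is larger than a page.
 */
static void pmd_mask_example(void)
{
        unsigned long mask = ~(2048UL * 8 - 1);                 /* ~0x3fff */
        unsigned long pmd_entry = 0xffff880000103ff8UL;         /* entry in page 2 of the table */
        unsigned long table_start = pmd_entry & mask;           /* 0xffff880000100000 */

        /* virt_to_page() alone strips only the low 12 bits, yielding the page
         * at ...103000 instead of the table's first page at ...100000, which
         * is where the split pmd lock actually lives.
         */
        (void)table_start;
}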
Signed-off-by: Martin Schwidefsky --- include/linux/mm.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f28f46eade6a..d354a72e6127 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1477,9 +1477,15 @@ static inline void pgtable_page_dtor(struct page *page) #if USE_SPLIT_PMD_PTLOCKS +static struct page *pmd_to_page(pmd_t *pmd) +{ + unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); + return virt_to_page((void *)((unsigned long) pmd & mask)); +} + static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(virt_to_page(pmd)); + return ptlock_ptr(pmd_to_page(pmd)); } static inline bool pgtable_pmd_page_ctor(struct page *page) @@ -1498,7 +1504,7 @@ static inline void pgtable_pmd_page_dtor(struct page *page) ptlock_free(page); } -#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) +#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte) #else -- cgit v1.2.3 From feb71dae1f9e0aeb056f7f639a21e620d327fc66 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Feb 2014 15:32:37 -0800 Subject: blk-mq: merge blk_mq_insert_request and blk_mq_run_request It's almost identical to blk_mq_insert_request, so fold the two into one slightly more generic function by making the flush special case a bit smarted. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-exec.c | 2 +- block/blk-flush.c | 4 ++-- block/blk-mq.c | 53 ++++++++++---------------------------------------- block/blk-mq.h | 1 - include/linux/blk-mq.h | 3 +-- 5 files changed, 14 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/block/blk-exec.c b/block/blk-exec.c index c68613bb4c79..dbf4502b1d67 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -65,7 +65,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be resued after dying flag is set */ if (q->mq_ops) { - blk_mq_insert_request(q, rq, at_head, true); + blk_mq_insert_request(rq, at_head, true, false); return; } diff --git a/block/blk-flush.c b/block/blk-flush.c index 66e2b697f5db..f598f794c3c6 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -137,7 +137,7 @@ static void mq_flush_run(struct work_struct *work) rq = container_of(work, struct request, mq_flush_work); memset(&rq->csd, 0, sizeof(rq->csd)); - blk_mq_run_request(rq, true, false); + blk_mq_insert_request(rq, false, true, false); } static bool blk_flush_queue_rq(struct request *rq) @@ -411,7 +411,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) { - blk_mq_run_request(rq, false, true); + blk_mq_insert_request(rq, false, false, true); } else list_add_tail(&rq->queuelist, &q->queue_head); return; diff --git a/block/blk-mq.c b/block/blk-mq.c index 1e585e3444c2..2af840594dc1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -724,61 +724,28 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, blk_mq_add_timer(rq); } -void blk_mq_insert_request(struct request_queue *q, struct request *rq, - bool at_head, bool run_queue) +void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, + bool async) { + struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx, *current_ctx; + struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx; + + current_ctx = blk_mq_get_ctx(q); + if (!cpu_online(ctx->cpu)) + rq->mq_ctx = ctx = 
current_ctx; - ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); - if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) { + if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) && + !(rq->cmd_flags & (REQ_FLUSH_SEQ))) { blk_insert_flush(rq); } else { - current_ctx = blk_mq_get_ctx(q); - - if (!cpu_online(ctx->cpu)) { - ctx = current_ctx; - hctx = q->mq_ops->map_queue(q, ctx->cpu); - rq->mq_ctx = ctx; - } spin_lock(&ctx->lock); __blk_mq_insert_request(hctx, rq, at_head); spin_unlock(&ctx->lock); - - blk_mq_put_ctx(current_ctx); } - if (run_queue) - __blk_mq_run_hw_queue(hctx); -} -EXPORT_SYMBOL(blk_mq_insert_request); - -/* - * This is a special version of blk_mq_insert_request to bypass FLUSH request - * check. Should only be used internally. - */ -void blk_mq_run_request(struct request *rq, bool run_queue, bool async) -{ - struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx, *current_ctx; - - current_ctx = blk_mq_get_ctx(q); - - ctx = rq->mq_ctx; - if (!cpu_online(ctx->cpu)) { - ctx = current_ctx; - rq->mq_ctx = ctx; - } - hctx = q->mq_ops->map_queue(q, ctx->cpu); - - /* ctx->cpu might be offline */ - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, false); - spin_unlock(&ctx->lock); - blk_mq_put_ctx(current_ctx); if (run_queue) diff --git a/block/blk-mq.h b/block/blk-mq.h index ed0035cd458e..72beba1f9d55 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -23,7 +23,6 @@ struct blk_mq_ctx { }; void __blk_mq_complete_request(struct request *rq); -void blk_mq_run_request(struct request *rq, bool run_queue, bool async); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_init_flush(struct request_queue *q); void blk_mq_drain_queue(struct request_queue *q); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 18ba8a627f46..ff28fe37ddda 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -121,8 +121,7 @@ void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struc void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -void blk_mq_insert_request(struct request_queue *, struct request *, - bool, bool); +void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -- cgit v1.2.3 From d6a25b31315327eef7785b895c354cc45c3f3742 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Feb 2014 15:32:38 -0800 Subject: blk-mq: support partial I/O completions Add a new blk_mq_end_io_partial function to partially complete requests as needed by the SCSI layer. We do this by reusing blk_update_request to advance the bio instead of having a simplified version of it in the blk-mq code. 
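A usage sketch of the new entry point (hypothetical driver completion path; my_complete_rq() and done_bytes are illustrative): report only the bytes that actually finished and let the return value say whether the request is still outstanding.

#include <linux/blk-mq.h>

/* Hypothetical completion handler; done_bytes is whatever the hardware
 * reported as transferred for this request.
 */
static void my_complete_rq(struct request *rq, int error, unsigned int done_bytes)
{
        if (blk_mq_end_io_partial(rq, error, done_bytes))
                return;         /* partial completion, remainder still pending */
        /* fully completed: the request was freed or its ->end_io callback ran */
}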
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 37 +++++-------------------------------- include/linux/blk-mq.h | 8 +++++++- 2 files changed, 12 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 2af840594dc1..1b8b50df3655 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -283,38 +283,10 @@ void blk_mq_free_request(struct request *rq) __blk_mq_free_request(hctx, ctx, rq); } -static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) +bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - - if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); - - /* don't actually finish bio if it's part of flush sequence */ - if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) - bio_endio(bio, error); -} - -void blk_mq_end_io(struct request *rq, int error) -{ - struct bio *bio = rq->bio; - unsigned int bytes = 0; - - trace_block_rq_complete(rq->q, rq); - - while (bio) { - struct bio *next = bio->bi_next; - - bio->bi_next = NULL; - bytes += bio->bi_iter.bi_size; - blk_mq_bio_endio(rq, bio, error); - bio = next; - } - - blk_account_io_completion(rq, bytes); + if (blk_update_request(rq, error, blk_rq_bytes(rq))) + return true; blk_account_io_done(rq); @@ -322,8 +294,9 @@ void blk_mq_end_io(struct request *rq, int error) rq->end_io(rq, error); else blk_mq_free_request(rq); + return false; } -EXPORT_SYMBOL(blk_mq_end_io); +EXPORT_SYMBOL(blk_mq_end_io_partial); static void __blk_mq_complete_request_remote(void *data) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ff28fe37ddda..2ff2e8d982be 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -133,7 +133,13 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_ind struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int); void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); -void blk_mq_end_io(struct request *rq, int error); +bool blk_mq_end_io_partial(struct request *rq, int error, + unsigned int nr_bytes); +static inline void blk_mq_end_io(struct request *rq, int error) +{ + bool done = !blk_mq_end_io_partial(rq, error, blk_rq_bytes(rq)); + BUG_ON(!done); +} void blk_mq_complete_request(struct request *rq); -- cgit v1.2.3 From cd578abb24aa67ce468c427d3356c08ea32cf768 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Feb 2014 16:01:16 +0100 Subject: perf/x86: Warn to early_printk() in case irq_work is too slow On Mon, Feb 10, 2014 at 08:45:16AM -0800, Dave Hansen wrote: > The reason I coded this up was that NMIs were firing off so fast that > nothing else was getting a chance to run. With this patch, at least the > printk() would come out and I'd have some idea what was going on. It will start spewing to early_printk() (which is a lot nicer to use from NMI context too) when it fails to queue the IRQ-work because its already enqueued. It does have the false-positive for when two CPUs trigger the warn concurrently, but that should be rare and some extra clutter on the early printk shouldn't be a problem. 
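The resulting calling convention, roughly (sketch only; the perf hunk below is the real user): an NMI-context caller can now see that the work was already pending and fall back to a direct, best-effort report.

#include <linux/irq_work.h>
#include <linux/printk.h>

/* Hypothetical NMI-context caller of the now bool-returning irq_work_queue(). */
static void report_overload(struct irq_work *w)
{
        if (!irq_work_queue(w))         /* already pending, nothing new queued */
                early_printk("overload: irq_work already queued, printing directly\n");
}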
Cc: hpa@zytor.com Cc: tglx@linutronix.de Cc: dzickus@redhat.com Cc: Dave Hansen Cc: mingo@kernel.org Fixes: 6a02ad66b2c4 ("perf/x86: Push the duration-logging printk() to IRQ context") Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140211150116.GO27965@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- include/linux/irq_work.h | 2 +- kernel/events/core.c | 9 +++++++-- kernel/irq_work.c | 6 ++++-- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index add13c8624b7..19ae05d4b8ec 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -32,7 +32,7 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), } -void irq_work_queue(struct irq_work *work); +bool irq_work_queue(struct irq_work *work); void irq_work_run(void); void irq_work_sync(struct irq_work *work); diff --git a/kernel/events/core.c b/kernel/events/core.c index 2067cbb378eb..45e5543e2a1e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -243,7 +243,7 @@ static void perf_duration_warn(struct irq_work *w) printk_ratelimited(KERN_WARNING "perf interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns, + avg_local_sample_len, allowed_ns >> 1, sysctl_perf_event_sample_rate); } @@ -283,7 +283,12 @@ void perf_sample_event_took(u64 sample_len_ns) update_perf_cpu_limits(); - irq_work_queue(&perf_duration_work); + if (!irq_work_queue(&perf_duration_work)) { + early_printk("perf interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, allowed_ns >> 1, + sysctl_perf_event_sample_rate); + } } static atomic64_t perf_event_id; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 55fcce6065cf..a82170e2fa78 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void) * * Can be re-enqueued while the callback is still in progress. */ -void irq_work_queue(struct irq_work *work) +bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ if (!irq_work_claim(work)) - return; + return false; /* Queue the entry and raise the IPI if needed. */ preempt_disable(); @@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work) } preempt_enable(); + + return true; } EXPORT_SYMBOL_GPL(irq_work_queue); -- cgit v1.2.3 From ae58faf90050ea72c289f62c4bc9a5bb8ae7f0bd Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 11 Feb 2014 16:20:09 +0100 Subject: perf/x86/uncore: add PCI ids for SNB/IVB/HSW IMC This patch adds the PCI ids for the Intel SandyBridge, IvyBridge, Haswell Client memory controller (IMC). 
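For context, a hypothetical id table using the new defines (illustrative only; the uncore driver changes that presumably consume them are in the other patches of this series):

#include <linux/pci.h>

static const struct pci_device_id client_imc_ids_example[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC) },      /* SandyBridge */
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC) },      /* IvyBridge */
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC) },      /* Haswell */
        { }     /* terminating entry */
};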
Cc: mingo@elte.hu Cc: acme@redhat.com Cc: ak@linux.intel.com Cc: zheng.z.yan@intel.com Cc: peterz@infradead.org Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392132015-14521-4-git-send-email-eranian@google.com Signed-off-by: Thomas Gleixner --- include/linux/pci_ids.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 97fbecdd7a40..7399e6a3e9a0 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2531,6 +2531,9 @@ #define PCI_VENDOR_ID_INTEL 0x8086 #define PCI_DEVICE_ID_INTEL_EESSC 0x0008 +#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 +#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 +#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320 #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321 #define PCI_DEVICE_ID_INTEL_PXH_0 0x0329 -- cgit v1.2.3 From 0dc83bd30b0bf5410c0933cfbbf8853248eff0a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 21 Feb 2014 11:19:04 +0100 Subject: Revert "writeback: do not sync data dirtied after sync start" This reverts commit c4a391b53a72d2df4ee97f96f78c1d5971b47489. Dave Chinner has reported the commit may cause some inodes to be left out from sync(2). This is because we can call redirty_tail() for some inode (which sets i_dirtied_when to current time) after sync(2) has started or similarly requeue_inode() can set i_dirtied_when to current time if writeback had to skip some pages. The real problem is in the functions clobbering i_dirtied_when but fixing that isn't trivial so revert is a safer choice for now. CC: stable@vger.kernel.org # >= 3.13 Signed-off-by: Jan Kara --- fs/fs-writeback.c | 33 +++++++++++---------------------- fs/sync.c | 15 ++++++--------- fs/xfs/xfs_super.c | 2 +- include/linux/writeback.h | 2 +- include/trace/events/writeback.h | 6 +++--- 5 files changed, 22 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e0259a163f98..d754e3cf99a8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -40,18 +40,13 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; - /* - * Write only inodes dirtied before this time. Don't forget to set - * older_than_this_is_set when you set this. - */ - unsigned long older_than_this; + unsigned long *older_than_this; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ - unsigned int older_than_this_is_set:1; enum wb_reason reason; /* why was writeback initiated? 
*/ struct list_head list; /* pending work list */ @@ -252,10 +247,10 @@ static int move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; int moved = 0; - WARN_ON_ONCE(!work->older_than_this_is_set); while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (inode_dirtied_after(inode, work->older_than_this)) + if (work->older_than_this && + inode_dirtied_after(inode, *work->older_than_this)) break; list_move(&inode->i_wb_list, &tmp); moved++; @@ -742,8 +737,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = reason, - .older_than_this = jiffies, - .older_than_this_is_set = 1, }; spin_lock(&wb->list_lock); @@ -802,13 +795,12 @@ static long wb_writeback(struct bdi_writeback *wb, { unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; + unsigned long oldest_jif; struct inode *inode; long progress; - if (!work->older_than_this_is_set) { - work->older_than_this = jiffies; - work->older_than_this_is_set = 1; - } + oldest_jif = jiffies; + work->older_than_this = &oldest_jif; spin_lock(&wb->list_lock); for (;;) { @@ -842,10 +834,10 @@ static long wb_writeback(struct bdi_writeback *wb, * safe. */ if (work->for_kupdate) { - work->older_than_this = jiffies - + oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } else if (work->for_background) - work->older_than_this = jiffies; + oldest_jif = jiffies; trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) @@ -1357,21 +1349,18 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb); /** * sync_inodes_sb - sync sb inode pages - * @sb: the superblock - * @older_than_this: timestamp + * @sb: the superblock * * This function writes and waits on any dirty inode belonging to this - * superblock that has been dirtied before given timestamp. + * super_block. */ -void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this) +void sync_inodes_sb(struct super_block *sb) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, - .older_than_this = older_than_this, - .older_than_this_is_set = 1, .range_cyclic = 0, .done = &done, .reason = WB_REASON_SYNC, diff --git a/fs/sync.c b/fs/sync.c index e8ba024a055b..b28d1dd10e8b 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -27,11 +27,10 @@ * wait == 1 case since in that case write_inode() functions do * sync_dirty_buffer() and thus effectively write one block at a time. 
*/ -static int __sync_filesystem(struct super_block *sb, int wait, - unsigned long start) +static int __sync_filesystem(struct super_block *sb, int wait) { if (wait) - sync_inodes_sb(sb, start); + sync_inodes_sb(sb); else writeback_inodes_sb(sb, WB_REASON_SYNC); @@ -48,7 +47,6 @@ static int __sync_filesystem(struct super_block *sb, int wait, int sync_filesystem(struct super_block *sb) { int ret; - unsigned long start = jiffies; /* * We need to be protected against the filesystem going from @@ -62,17 +60,17 @@ int sync_filesystem(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return 0; - ret = __sync_filesystem(sb, 0, start); + ret = __sync_filesystem(sb, 0); if (ret < 0) return ret; - return __sync_filesystem(sb, 1, start); + return __sync_filesystem(sb, 1); } EXPORT_SYMBOL_GPL(sync_filesystem); static void sync_inodes_one_sb(struct super_block *sb, void *arg) { if (!(sb->s_flags & MS_RDONLY)) - sync_inodes_sb(sb, *((unsigned long *)arg)); + sync_inodes_sb(sb); } static void sync_fs_one_sb(struct super_block *sb, void *arg) @@ -104,10 +102,9 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg) SYSCALL_DEFINE0(sync) { int nowait = 0, wait = 1; - unsigned long start = jiffies; wakeup_flusher_threads(0, WB_REASON_SYNC); - iterate_supers(sync_inodes_one_sb, &start); + iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); iterate_bdevs(fdatawrite_one_bdev, NULL); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f317488263dd..d971f4932b5d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -913,7 +913,7 @@ xfs_flush_inodes( struct super_block *sb = mp->m_super; if (down_read_trylock(&sb->s_umount)) { - sync_inodes_sb(sb, jiffies); + sync_inodes_sb(sb); up_read(&sb->s_umount); } } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fc0e4320aa6d..021b8a319b9e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -97,7 +97,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); -void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this); +void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index c7bbbe794e65..464ea82e10db 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -287,11 +287,11 @@ TRACE_EVENT(writeback_queue_io, __field(int, reason) ), TP_fast_assign( - unsigned long older_than_this = work->older_than_this; + unsigned long *older_than_this = work->older_than_this; strncpy(__entry->name, dev_name(wb->bdi->dev), 32); - __entry->older = older_than_this; + __entry->older = older_than_this ? *older_than_this : 0; __entry->age = older_than_this ? - (jiffies - older_than_this) * 1000 / HZ : -1; + (jiffies - *older_than_this) * 1000 / HZ : -1; __entry->moved = moved; __entry->reason = work->reason; ), -- cgit v1.2.3 From 8f47b1871b8aac98f1a9d93bc3467fb97b65199a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:39 +0100 Subject: sched: Add better debug output for might_sleep() might_sleep() can tell us where interrupts have been disabled, but we have no idea what disabled preemption. 
Add some debug infrastructure. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-4-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched/core.c | 23 +++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c49a2585ff7d..825ed838d4b9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1463,6 +1463,9 @@ struct task_struct { struct mutex perf_event_mutex; struct list_head perf_event_list; #endif +#ifdef CONFIG_DEBUG_PREEMPT + unsigned long preempt_disable_ip; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a01fe6cfdb9b..c94e851dc981 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2501,8 +2501,13 @@ void __kprobes preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == val) { + unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } } EXPORT_SYMBOL(preempt_count_add); @@ -2545,6 +2550,13 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -6946,6 +6958,13 @@ void __might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); } EXPORT_SYMBOL(__might_sleep); -- cgit v1.2.3 From c365c292d05908c6ea6f32708f331e21033fe71d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:42 +0100 Subject: sched: Consider pi boosting in setscheduler() If a PI boosted task policy/priority is modified by a setscheduler() call we unconditionally dequeue and requeue the task if it is on the runqueue even if the new priority is lower than the current effective boosted priority. This can result in undesired reordering of the priority bucket list. If the new priority is less or equal than the current effective we just store the new parameters in the task struct and leave the scheduler class and the runqueue untouched. This is handled when the task deboosts itself. Only if the new priority is higher than the effective boosted priority we apply the change immediately. Signed-off-by: Thomas Gleixner [ Rebase ontop of v3.14-rc1. 
] Signed-off-by: Sebastian Andrzej Siewior Cc: Dario Faggioli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-7-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/sched/rt.h | 7 +++++++ kernel/locking/rtmutex.c | 12 ++++++++++++ kernel/sched/core.c | 41 ++++++++++++++++++++++++++++++----------- 3 files changed, 49 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index f7453d4c5613..6341f5be6e24 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -18,6 +18,7 @@ static inline int rt_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); +extern int rt_mutex_check_prio(struct task_struct *task, int newprio); extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) @@ -29,6 +30,12 @@ static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } + +static inline int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + return 0; +} + static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2e960a2bab81..aa4dff04b594 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -212,6 +212,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task) return task_top_pi_waiter(task)->task; } +/* + * Called by sched_setscheduler() to check whether the priority change + * is overruled by a possible priority boosting. + */ +int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + if (!task_has_pi_waiters(task)) + return 0; + + return task_top_pi_waiter(task)->task->prio <= newprio; +} + /* * Adjust the priority of a task, after its pi_waiters got modified. * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9c2fcbf9a266..003263b3b05c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2902,7 +2902,8 @@ EXPORT_SYMBOL(sleep_on_timeout); * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. */ void rt_mutex_setprio(struct task_struct *p, int prio) { @@ -3171,9 +3172,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_new = 1; } -/* Actually do priority change: must hold pi & rq lock. */ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr) +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) { int policy = attr->sched_policy; @@ -3193,9 +3193,14 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, * getparam()/getattr() don't report silly values for !rt tasks. */ p->rt_priority = attr->sched_priority; + set_load_weight(p); +} - p->normal_prio = normal_prio(p); - p->prio = rt_mutex_getprio(p); +/* Actually do priority change: must hold pi & rq lock. 
*/ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr) +{ + __setscheduler_params(p, attr); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -3203,8 +3208,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; - - set_load_weight(p); } static void @@ -3257,6 +3260,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) { + int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; int policy = attr->sched_policy; unsigned long flags; @@ -3427,6 +3431,24 @@ change: return -EBUSY; } + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + /* + * Special case for priority boosted tasks. + * + * If the new priority is lower or equal (user space view) + * than the current (boosted) priority, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + if (rt_mutex_check_prio(p, newprio)) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3434,9 +3456,6 @@ change: if (running) p->sched_class->put_prev_task(rq, p); - p->sched_reset_on_fork = reset_on_fork; - - oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, attr); -- cgit v1.2.3 From 7e298d60f717257dc8365c975f45ff9c37165362 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Feb 2014 15:34:45 +0800 Subject: sched/prio: Use DEFAULT_PRIO to define NICE_TO_PRIO() and PRIO_TO_NICE() There is already a macro named DEFAULT_PRIO in prio.h, we can use it to define NICE_TO_PRIO and PRIO_TO_NICE rather than use hard coding of (MAX_RT_PRIO + 20). Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4e28ec36fb49e8906027cbbdd900ab26a149905e.1392103744.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched/prio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index 410ccb74c9e6..1ceaaa1da3e4 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -25,8 +25,8 @@ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO) +#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO) /* * 'User priority' is the nice value converted to something we -- cgit v1.2.3 From 3ee237dddcd885d4e525791299d62de33b2ca117 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Feb 2014 15:34:46 +0800 Subject: sched/prio: Add 3 macros of MAX_NICE, MIN_NICE and NICE_WIDTH in prio.h Currently there is lots of hard coding to 19 and -20, to represent maximum and minimum of nice values. This patch add three macros in prio.h for maximum, minimum and width of nice value, and uses it to remove hardcoded values in prio.h. Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/3994e89327b2b15f992277cdf9f409c516f87d1b.1392103744.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner [ Collapsed two small patches. 
] Signed-off-by: Ingo Molnar --- include/linux/sched/prio.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index 1ceaaa1da3e4..ac322583c820 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -1,6 +1,10 @@ #ifndef _SCHED_PRIO_H #define _SCHED_PRIO_H +#define MAX_NICE 19 +#define MIN_NICE -20 +#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1) + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -17,8 +21,8 @@ #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) -#define DEFAULT_PRIO (MAX_RT_PRIO + 20) +#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) +#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) /* * Convert user-nice values [ -20 ... 0 ... 19 ] -- cgit v1.2.3 From 156c5887948cd191417f18026aab9ce26e5a95da Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 22 Feb 2014 16:53:31 +0100 Subject: ahci-platform: Add support for devices with more then 1 clock The allwinner-sun4i AHCI controller needs 2 clocks to be enabled and the imx AHCI controller needs 3 clocks to be enabled. tj: Minor comment formatting updates. Signed-off-by: Hans de Goede Signed-off-by: Tejun Heo --- .../devicetree/bindings/ata/ahci-platform.txt | 1 + drivers/ata/ahci.h | 3 +- drivers/ata/ahci_platform.c | 113 ++++++++++++++++----- include/linux/ahci_platform.h | 4 + 4 files changed, 93 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/ata/ahci-platform.txt b/Documentation/devicetree/bindings/ata/ahci-platform.txt index 89de1564950c..3ced07dc8f56 100644 --- a/Documentation/devicetree/bindings/ata/ahci-platform.txt +++ b/Documentation/devicetree/bindings/ata/ahci-platform.txt @@ -10,6 +10,7 @@ Required properties: Optional properties: - dma-coherent : Present if dma operations are coherent +- clocks : a list of phandle + clock specifier pairs Example: sata@ffe08000 { diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 64d1a99de5e4..c12862bd48ee 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -51,6 +51,7 @@ enum { AHCI_MAX_PORTS = 32, + AHCI_MAX_CLKS = 3, AHCI_MAX_SG = 168, /* hardware max is 64K */ AHCI_DMA_BOUNDARY = 0xffffffff, AHCI_MAX_CMDS = 32, @@ -321,7 +322,7 @@ struct ahci_host_priv { u32 em_loc; /* enclosure management location */ u32 em_buf_sz; /* EM buffer size in byte */ u32 em_msg_type; /* EM message type */ - struct clk *clk; /* Only for platforms supporting clk */ + struct clk *clks[AHCI_MAX_CLKS]; /* Optional */ void *plat_data; /* Other platform data */ /* * Optional ahci_start_engine override, if not set this gets set to the diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c index 9302d8143393..8d5a9a7ce958 100644 --- a/drivers/ata/ahci_platform.c +++ b/drivers/ata/ahci_platform.c @@ -86,6 +86,60 @@ static struct scsi_host_template ahci_platform_sht = { AHCI_SHT("ahci_platform"), }; +/** + * ahci_platform_enable_clks - Enable platform clocks + * @hpriv: host private area to store config values + * + * This function enables all the clks found in hpriv->clks, starting at + * index 0. If any clk fails to enable it disables all the clks already + * enabled in reverse order, and then returns an error. 
+ * + * RETURNS: + * 0 on success otherwise a negative error code + */ +int ahci_platform_enable_clks(struct ahci_host_priv *hpriv) +{ + int c, rc; + + for (c = 0; c < AHCI_MAX_CLKS && hpriv->clks[c]; c++) { + rc = clk_prepare_enable(hpriv->clks[c]); + if (rc) + goto disable_unprepare_clk; + } + return 0; + +disable_unprepare_clk: + while (--c >= 0) + clk_disable_unprepare(hpriv->clks[c]); + return rc; +} +EXPORT_SYMBOL_GPL(ahci_platform_enable_clks); + +/** + * ahci_platform_disable_clks - Disable platform clocks + * @hpriv: host private area to store config values + * + * This function disables all the clks found in hpriv->clks, in reverse + * order of ahci_platform_enable_clks (starting at the end of the array). + */ +void ahci_platform_disable_clks(struct ahci_host_priv *hpriv) +{ + int c; + + for (c = AHCI_MAX_CLKS - 1; c >= 0; c--) + if (hpriv->clks[c]) + clk_disable_unprepare(hpriv->clks[c]); +} +EXPORT_SYMBOL_GPL(ahci_platform_disable_clks); + +static void ahci_put_clks(struct ahci_host_priv *hpriv) +{ + int c; + + for (c = 0; c < AHCI_MAX_CLKS && hpriv->clks[c]; c++) + clk_put(hpriv->clks[c]); +} + static int ahci_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -96,6 +150,7 @@ static int ahci_probe(struct platform_device *pdev) struct ahci_host_priv *hpriv; struct ata_host *host; struct resource *mem; + struct clk *clk; int irq; int n_ports; int i; @@ -130,17 +185,31 @@ static int ahci_probe(struct platform_device *pdev) return -ENOMEM; } - hpriv->clk = clk_get(dev, NULL); - if (IS_ERR(hpriv->clk)) { - dev_err(dev, "can't get clock\n"); - } else { - rc = clk_prepare_enable(hpriv->clk); - if (rc) { - dev_err(dev, "clock prepare enable failed"); - goto free_clk; + for (i = 0; i < AHCI_MAX_CLKS; i++) { + /* + * For now we must use clk_get(dev, NULL) for the first clock, + * because some platforms (da850, spear13xx) are not yet + * converted to use devicetree for clocks. For new platforms + * this is equivalent to of_clk_get(dev->of_node, 0). + */ + if (i == 0) + clk = clk_get(dev, NULL); + else + clk = of_clk_get(dev->of_node, i); + + if (IS_ERR(clk)) { + rc = PTR_ERR(clk); + if (rc == -EPROBE_DEFER) + goto free_clk; + break; } + hpriv->clks[i] = clk; } + rc = ahci_enable_clks(dev, hpriv); + if (rc) + goto free_clk; + /* * Some platforms might need to prepare for mmio region access, * which could be done in the following init call. 
So, the mmio @@ -221,11 +290,9 @@ pdata_exit: if (pdata && pdata->exit) pdata->exit(dev); disable_unprepare_clk: - if (!IS_ERR(hpriv->clk)) - clk_disable_unprepare(hpriv->clk); + ahci_disable_clks(hpriv); free_clk: - if (!IS_ERR(hpriv->clk)) - clk_put(hpriv->clk); + ahci_put_clks(hpriv); return rc; } @@ -238,10 +305,8 @@ static void ahci_host_stop(struct ata_host *host) if (pdata && pdata->exit) pdata->exit(dev); - if (!IS_ERR(hpriv->clk)) { - clk_disable_unprepare(hpriv->clk); - clk_put(hpriv->clk); - } + ahci_disable_clks(hpriv); + ahci_put_clks(hpriv); } #ifdef CONFIG_PM_SLEEP @@ -276,8 +341,7 @@ static int ahci_suspend(struct device *dev) if (pdata && pdata->suspend) return pdata->suspend(dev); - if (!IS_ERR(hpriv->clk)) - clk_disable_unprepare(hpriv->clk); + ahci_disable_clks(hpriv); return 0; } @@ -289,13 +353,9 @@ static int ahci_resume(struct device *dev) struct ahci_host_priv *hpriv = host->private_data; int rc; - if (!IS_ERR(hpriv->clk)) { - rc = clk_prepare_enable(hpriv->clk); - if (rc) { - dev_err(dev, "clock prepare enable failed"); - return rc; - } - } + rc = ahci_enable_clks(dev, hpriv); + if (rc) + return rc; if (pdata && pdata->resume) { rc = pdata->resume(dev); @@ -316,8 +376,7 @@ static int ahci_resume(struct device *dev) return 0; disable_unprepare_clk: - if (!IS_ERR(hpriv->clk)) - clk_disable_unprepare(hpriv->clk); + ahci_disable_clks(hpriv); return rc; } diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 73a25005d88a..769d06525320 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -19,6 +19,7 @@ struct device; struct ata_port_info; +struct ahci_host_priv; struct ahci_platform_data { int (*init)(struct device *dev, void __iomem *addr); @@ -30,4 +31,7 @@ struct ahci_platform_data { unsigned int mask_port_map; }; +int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); +void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); + #endif /* _AHCI_PLATFORM_H */ -- cgit v1.2.3 From 96a01ba52c60fdd74dd6e8cf06645d06515b1396 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 22 Feb 2014 16:53:33 +0100 Subject: ahci-platform: Add enable_ / disable_resources helper functions tj: Minor comment formatting updates. Signed-off-by: Hans de Goede Signed-off-by: Tejun Heo --- drivers/ata/ahci_platform.c | 106 +++++++++++++++++++++++++++--------------- include/linux/ahci_platform.h | 2 + 2 files changed, 71 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c index 4befd78133ad..a32df31013cb 100644 --- a/drivers/ata/ahci_platform.c +++ b/drivers/ata/ahci_platform.c @@ -132,6 +132,62 @@ void ahci_platform_disable_clks(struct ahci_host_priv *hpriv) } EXPORT_SYMBOL_GPL(ahci_platform_disable_clks); +/** + * ahci_platform_enable_resources - Enable platform resources + * @hpriv: host private area to store config values + * + * This function enables all ahci_platform managed resources in the + * following order: + * 1) Regulator + * 2) Clocks (through ahci_platform_enable_clks) + * + * If resource enabling fails at any point the previous enabled resources + * are disabled in reverse order. 
+ * + * RETURNS: + * 0 on success otherwise a negative error code + */ +int ahci_platform_enable_resources(struct ahci_host_priv *hpriv) +{ + int rc; + + if (hpriv->target_pwr) { + rc = regulator_enable(hpriv->target_pwr); + if (rc) + return rc; + } + + rc = ahci_platform_enable_clks(hpriv); + if (rc) + goto disable_regulator; + + return 0; + +disable_regulator: + if (hpriv->target_pwr) + regulator_disable(hpriv->target_pwr); + return rc; +} +EXPORT_SYMBOL_GPL(ahci_platform_enable_resources); + +/** + * ahci_platform_disable_resources - Disable platform resources + * @hpriv: host private area to store config values + * + * This function disables all ahci_platform managed resources in the + * following order: + * 1) Clocks (through ahci_platform_disable_clks) + * 2) Regulator + */ +void ahci_platform_disable_resources(struct ahci_host_priv *hpriv) +{ + ahci_platform_disable_clks(hpriv); + + if (hpriv->target_pwr) + regulator_disable(hpriv->target_pwr); +} +EXPORT_SYMBOL_GPL(ahci_platform_disable_resources); + static void ahci_put_clks(struct ahci_host_priv *hpriv) { int c; @@ -214,15 +270,9 @@ static int ahci_probe(struct platform_device *pdev) hpriv->clks[i] = clk; } - if (hpriv->target_pwr) { - rc = regulator_enable(hpriv->target_pwr); - if (rc) - goto free_clk; - } - - rc = ahci_enable_clks(dev, hpriv); + rc = ahci_platform_enable_resources(hpriv); if (rc) - goto disable_regulator; + goto free_clk; /* * Some platforms might need to prepare for mmio region access, @@ -233,7 +283,7 @@ static int ahci_probe(struct platform_device *pdev) if (pdata && pdata->init) { rc = pdata->init(dev, hpriv->mmio); if (rc) - goto disable_unprepare_clk; + goto disable_resources; } ahci_save_initial_config(dev, hpriv, @@ -303,11 +353,8 @@ static int ahci_probe(struct platform_device *pdev) pdata_exit: if (pdata && pdata->exit) pdata->exit(dev); -disable_unprepare_clk: - ahci_disable_clks(hpriv); -disable_regulator: - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); +disable_resources: + ahci_platform_disable_resources(hpriv); free_clk: ahci_put_clks(hpriv); return rc; @@ -322,11 +369,8 @@ static void ahci_host_stop(struct ata_host *host) if (pdata && pdata->exit) pdata->exit(dev); - ahci_disable_clks(hpriv); + ahci_platform_disable_resources(hpriv); ahci_put_clks(hpriv); - - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); } #ifdef CONFIG_PM_SLEEP @@ -361,10 +405,7 @@ static int ahci_suspend(struct device *dev) if (pdata && pdata->suspend) return pdata->suspend(dev); - ahci_disable_clks(hpriv); - - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); + ahci_platform_disable_resources(hpriv); return 0; } @@ -376,26 +417,20 @@ static int ahci_resume(struct device *dev) struct ahci_host_priv *hpriv = host->private_data; int rc; - if (hpriv->target_pwr) { - rc = regulator_enable(hpriv->target_pwr); - if (rc) - return rc; - } - - rc = ahci_enable_clks(dev, hpriv); + rc = ahci_platform_enable_resources(hpriv); if (rc) - goto disable_regulator; + return rc; if (pdata && pdata->resume) { rc = pdata->resume(dev); if (rc) - goto disable_unprepare_clk; + goto disable_resources; } if (dev->power.power_state.event == PM_EVENT_SUSPEND) { rc = ahci_reset_controller(host); if (rc) - goto disable_unprepare_clk; + goto disable_resources; ahci_init_controller(host); } @@ -404,11 +439,8 @@ static int ahci_resume(struct device *dev) return 0; -disable_unprepare_clk: - ahci_disable_clks(hpriv); -disable_regulator: - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); 
+disable_resources: + ahci_platform_disable_resources(hpriv); return rc; } diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 769d06525320..b674b01ce9bc 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -33,5 +33,7 @@ struct ahci_platform_data { int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); +int ahci_platform_enable_resources(struct ahci_host_priv *hpriv); +void ahci_platform_disable_resources(struct ahci_host_priv *hpriv); #endif /* _AHCI_PLATFORM_H */ -- cgit v1.2.3 From 23b07d4cb3c0c850055cf968af44019b8da185fb Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 22 Feb 2014 16:53:34 +0100 Subject: ahci-platform: "Library-ise" ahci_probe functionality ahci_probe consists of 3 steps: 1) Get resources (get mmio, clks, regulator) 2) Enable resources, handled by ahci_platform_enable_resouces 3) The more or less standard ahci-host controller init sequence This commit refactors step 1 and 3 into separate functions, so the platform drivers for AHCI implementations which need a specific order in step 2, and / or need to do some custom register poking at some time, can re-use ahci-platform.c code without needing to copy and paste it. Note that ahci_platform_init_host's prototype takes the 3 non function members of ahci_platform_data as arguments, the idea is that drivers using the new exported utility functions will not use ahci_platform_data at all, and hopefully in the future ahci_platform_data can go away entirely. tj: Minor comment formatting updates. Signed-off-by: Hans de Goede Signed-off-by: Tejun Heo --- drivers/ata/ahci_platform.c | 188 +++++++++++++++++++++++++++--------------- include/linux/ahci_platform.h | 14 ++++ 2 files changed, 137 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c index a32df31013cb..19e9eaafb1f2 100644 --- a/drivers/ata/ahci_platform.c +++ b/drivers/ata/ahci_platform.c @@ -188,64 +188,60 @@ void ahci_platform_disable_resources(struct ahci_host_priv *hpriv) } EXPORT_SYMBOL_GPL(ahci_platform_disable_resources); -static void ahci_put_clks(struct ahci_host_priv *hpriv) +static void ahci_platform_put_resources(struct device *dev, void *res) { + struct ahci_host_priv *hpriv = res; int c; for (c = 0; c < AHCI_MAX_CLKS && hpriv->clks[c]; c++) clk_put(hpriv->clks[c]); } -static int ahci_probe(struct platform_device *pdev) +/** + * ahci_platform_get_resources - Get platform resources + * @pdev: platform device to get resources for + * + * This function allocates an ahci_host_priv struct, and gets the following + * resources, storing a reference to them inside the returned struct: + * + * 1) mmio registers (IORESOURCE_MEM 0, mandatory) + * 2) regulator for controlling the targets power (optional) + * 3) 0 - AHCI_MAX_CLKS clocks, as specified in the devs devicetree node, + * or for non devicetree enabled platforms a single clock + * + * RETURNS: + * The allocated ahci_host_priv on success, otherwise an ERR_PTR value + */ +struct ahci_host_priv *ahci_platform_get_resources( + struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct ahci_platform_data *pdata = dev_get_platdata(dev); - const struct platform_device_id *id = platform_get_device_id(pdev); - struct ata_port_info pi = ahci_port_info[id ? 
id->driver_data : 0]; - const struct ata_port_info *ppi[] = { &pi, NULL }; struct ahci_host_priv *hpriv; - struct ata_host *host; - struct resource *mem; struct clk *clk; - int irq; - int n_ports; - int i; - int rc; + int i, rc = -ENOMEM; - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!mem) { - dev_err(dev, "no mmio space\n"); - return -EINVAL; - } - - irq = platform_get_irq(pdev, 0); - if (irq <= 0) { - dev_err(dev, "no irq\n"); - return -EINVAL; - } + if (!devres_open_group(dev, NULL, GFP_KERNEL)) + return ERR_PTR(-ENOMEM); - if (pdata && pdata->ata_port_info) - pi = *pdata->ata_port_info; - - hpriv = devm_kzalloc(dev, sizeof(*hpriv), GFP_KERNEL); - if (!hpriv) { - dev_err(dev, "can't alloc ahci_host_priv\n"); - return -ENOMEM; - } + hpriv = devres_alloc(ahci_platform_put_resources, sizeof(*hpriv), + GFP_KERNEL); + if (!hpriv) + goto err_out; - hpriv->flags |= (unsigned long)pi.private_data; + devres_add(dev, hpriv); - hpriv->mmio = devm_ioremap(dev, mem->start, resource_size(mem)); + hpriv->mmio = devm_ioremap_resource(dev, + platform_get_resource(pdev, IORESOURCE_MEM, 0)); if (!hpriv->mmio) { - dev_err(dev, "can't map %pR\n", mem); - return -ENOMEM; + dev_err(dev, "no mmio space\n"); + goto err_out; } hpriv->target_pwr = devm_regulator_get_optional(dev, "target"); if (IS_ERR(hpriv->target_pwr)) { rc = PTR_ERR(hpriv->target_pwr); if (rc == -EPROBE_DEFER) - return -EPROBE_DEFER; + goto err_out; hpriv->target_pwr = NULL; } @@ -264,33 +260,59 @@ static int ahci_probe(struct platform_device *pdev) if (IS_ERR(clk)) { rc = PTR_ERR(clk); if (rc == -EPROBE_DEFER) - goto free_clk; + goto err_out; break; } hpriv->clks[i] = clk; } - rc = ahci_platform_enable_resources(hpriv); - if (rc) - goto free_clk; + devres_remove_group(dev, NULL); + return hpriv; - /* - * Some platforms might need to prepare for mmio region access, - * which could be done in the following init call. So, the mmio - * region shouldn't be accessed before init (if provided) has - * returned successfully. - */ - if (pdata && pdata->init) { - rc = pdata->init(dev, hpriv->mmio); - if (rc) - goto disable_resources; - } +err_out: + devres_release_group(dev, NULL); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(ahci_platform_get_resources); + +/** + * ahci_platform_init_host - Bring up an ahci-platform host + * @pdev: platform device pointer for the host + * @hpriv: ahci-host private data for the host + * @pi_template: template for the ata_port_info to use + * @force_port_map: param passed to ahci_save_initial_config + * @mask_port_map: param passed to ahci_save_initial_config + * + * This function does all the usual steps needed to bring up an + * ahci-platform host, note any necessary resources (ie clks, phy, etc.) + * must be initialized / enabled before calling this. + * + * RETURNS: + * 0 on success otherwise a negative error code + */ +int ahci_platform_init_host(struct platform_device *pdev, + struct ahci_host_priv *hpriv, + const struct ata_port_info *pi_template, + unsigned int force_port_map, + unsigned int mask_port_map) +{ + struct device *dev = &pdev->dev; + struct ata_port_info pi = *pi_template; + const struct ata_port_info *ppi[] = { &pi, NULL }; + struct ata_host *host; + int i, irq, n_ports, rc; - ahci_save_initial_config(dev, hpriv, - pdata ? pdata->force_port_map : 0, - pdata ? 
pdata->mask_port_map : 0); + irq = platform_get_irq(pdev, 0); + if (irq <= 0) { + dev_err(dev, "no irq\n"); + return -EINVAL; + } /* prepare host */ + hpriv->flags |= (unsigned long)pi.private_data; + + ahci_save_initial_config(dev, hpriv, force_port_map, mask_port_map); + if (hpriv->cap & HOST_CAP_NCQ) pi.flags |= ATA_FLAG_NCQ; @@ -307,10 +329,8 @@ static int ahci_probe(struct platform_device *pdev) n_ports = max(ahci_nr_ports(hpriv->cap), fls(hpriv->port_map)); host = ata_host_alloc_pinfo(dev, ppi, n_ports); - if (!host) { - rc = -ENOMEM; - goto pdata_exit; - } + if (!host) + return -ENOMEM; host->private_data = hpriv; @@ -325,7 +345,8 @@ static int ahci_probe(struct platform_device *pdev) for (i = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; - ata_port_desc(ap, "mmio %pR", mem); + ata_port_desc(ap, "mmio %pR", + platform_get_resource(pdev, IORESOURCE_MEM, 0)); ata_port_desc(ap, "port 0x%x", 0x100 + ap->port_no * 0x80); /* set enclosure management message type */ @@ -339,13 +360,53 @@ static int ahci_probe(struct platform_device *pdev) rc = ahci_reset_controller(host); if (rc) - goto pdata_exit; + return rc; ahci_init_controller(host); ahci_print_info(host, "platform"); - rc = ata_host_activate(host, irq, ahci_interrupt, IRQF_SHARED, - &ahci_platform_sht); + return ata_host_activate(host, irq, ahci_interrupt, IRQF_SHARED, + &ahci_platform_sht); +} +EXPORT_SYMBOL_GPL(ahci_platform_init_host); + +static int ahci_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct ahci_platform_data *pdata = dev_get_platdata(dev); + const struct platform_device_id *id = platform_get_device_id(pdev); + const struct ata_port_info *pi_template; + struct ahci_host_priv *hpriv; + int rc; + + hpriv = ahci_platform_get_resources(pdev); + if (IS_ERR(hpriv)) + return PTR_ERR(hpriv); + + rc = ahci_platform_enable_resources(hpriv); + if (rc) + return rc; + + /* + * Some platforms might need to prepare for mmio region access, + * which could be done in the following init call. So, the mmio + * region shouldn't be accessed before init (if provided) has + * returned successfully. + */ + if (pdata && pdata->init) { + rc = pdata->init(dev, hpriv->mmio); + if (rc) + goto disable_resources; + } + + if (pdata && pdata->ata_port_info) + pi_template = pdata->ata_port_info; + else + pi_template = &ahci_port_info[id ? id->driver_data : 0]; + + rc = ahci_platform_init_host(pdev, hpriv, pi_template, + pdata ? pdata->force_port_map : 0, + pdata ? pdata->mask_port_map : 0); if (rc) goto pdata_exit; @@ -355,8 +416,6 @@ pdata_exit: pdata->exit(dev); disable_resources: ahci_platform_disable_resources(hpriv); -free_clk: - ahci_put_clks(hpriv); return rc; } @@ -370,7 +429,6 @@ static void ahci_host_stop(struct ata_host *host) pdata->exit(dev); ahci_platform_disable_resources(hpriv); - ahci_put_clks(hpriv); } #ifdef CONFIG_PM_SLEEP diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index b674b01ce9bc..b80c51c20f76 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -20,7 +20,14 @@ struct device; struct ata_port_info; struct ahci_host_priv; +struct platform_device; +/* + * Note ahci_platform_data is deprecated, it is only kept around for use + * by the old da850 and spear13xx ahci code. + * New drivers should instead declare their own platform_driver struct, and + * use ahci_platform* functions in their own probe, suspend and resume methods. 
+ */ struct ahci_platform_data { int (*init)(struct device *dev, void __iomem *addr); void (*exit)(struct device *dev); @@ -35,5 +42,12 @@ int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); int ahci_platform_enable_resources(struct ahci_host_priv *hpriv); void ahci_platform_disable_resources(struct ahci_host_priv *hpriv); +struct ahci_host_priv *ahci_platform_get_resources( + struct platform_device *pdev); +int ahci_platform_init_host(struct platform_device *pdev, + struct ahci_host_priv *hpriv, + const struct ata_port_info *pi_template, + unsigned int force_port_map, + unsigned int mask_port_map); #endif /* _AHCI_PLATFORM_H */ -- cgit v1.2.3 From 648cb6fd83b97f0f772db783a280af300fa9f2bc Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 22 Feb 2014 16:53:35 +0100 Subject: ahci-platform: "Library-ise" suspend / resume functionality Split suspend / resume code into host suspend / resume functionality and resource enable / disabling phases, and export the new suspend_ / resume_host functions. tj: Minor comment formatting updates. Signed-off-by: Hans de Goede Signed-off-by: Tejun Heo --- drivers/ata/ahci_platform.c | 97 ++++++++++++++++++++++++++++++++++++------- include/linux/ahci_platform.h | 5 +++ 2 files changed, 87 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c index 19e9eaafb1f2..01f7bbe65572 100644 --- a/drivers/ata/ahci_platform.c +++ b/drivers/ata/ahci_platform.c @@ -432,14 +432,23 @@ static void ahci_host_stop(struct ata_host *host) } #ifdef CONFIG_PM_SLEEP -static int ahci_suspend(struct device *dev) +/** + * ahci_platform_suspend_host - Suspend an ahci-platform host + * @dev: device pointer for the host + * + * This function does all the usual steps needed to suspend an + * ahci-platform host, note any necessary resources (ie clks, phy, etc.) + * must be disabled after calling this. + * + * RETURNS: + * 0 on success otherwise a negative error code + */ +int ahci_platform_suspend_host(struct device *dev) { - struct ahci_platform_data *pdata = dev_get_platdata(dev); struct ata_host *host = dev_get_drvdata(dev); struct ahci_host_priv *hpriv = host->private_data; void __iomem *mmio = hpriv->mmio; u32 ctl; - int rc; if (hpriv->flags & AHCI_HFLAG_NO_SUSPEND) { dev_err(dev, "firmware update required for suspend/resume\n"); @@ -456,7 +465,58 @@ static int ahci_suspend(struct device *dev) writel(ctl, mmio + HOST_CTL); readl(mmio + HOST_CTL); /* flush */ - rc = ata_host_suspend(host, PMSG_SUSPEND); + return ata_host_suspend(host, PMSG_SUSPEND); +} +EXPORT_SYMBOL_GPL(ahci_platform_suspend_host); + +/** + * ahci_platform_resume_host - Resume an ahci-platform host + * @dev: device pointer for the host + * + * This function does all the usual steps needed to resume an ahci-platform + * host, note any necessary resources (ie clks, phy, etc.) must be + * initialized / enabled before calling this. 
+ * + * RETURNS: + * 0 on success otherwise a negative error code + */ +int ahci_platform_resume_host(struct device *dev) +{ + struct ata_host *host = dev_get_drvdata(dev); + int rc; + + if (dev->power.power_state.event == PM_EVENT_SUSPEND) { + rc = ahci_reset_controller(host); + if (rc) + return rc; + + ahci_init_controller(host); + } + + ata_host_resume(host); + + return 0; +} +EXPORT_SYMBOL_GPL(ahci_platform_resume_host); + +/** + * ahci_platform_suspend - Suspend an ahci-platform device + * @dev: the platform device to suspend + * + * This function suspends the host associated with the device, followed by + * disabling all the resources of the device. + * + * RETURNS: + * 0 on success otherwise a negative error code + */ +int ahci_platform_suspend(struct device *dev) +{ + struct ahci_platform_data *pdata = dev_get_platdata(dev); + struct ata_host *host = dev_get_drvdata(dev); + struct ahci_host_priv *hpriv = host->private_data; + int rc; + + rc = ahci_platform_suspend_host(dev); if (rc) return rc; @@ -467,8 +527,19 @@ static int ahci_suspend(struct device *dev) return 0; } +EXPORT_SYMBOL_GPL(ahci_platform_suspend); -static int ahci_resume(struct device *dev) +/** + * ahci_platform_resume - Resume an ahci-platform device + * @dev: the platform device to resume + * + * This function enables all the resources of the device followed by + * resuming the host associated with the device. + * + * RETURNS: + * 0 on success otherwise a negative error code + */ +int ahci_platform_resume(struct device *dev) { struct ahci_platform_data *pdata = dev_get_platdata(dev); struct ata_host *host = dev_get_drvdata(dev); @@ -485,15 +556,9 @@ static int ahci_resume(struct device *dev) goto disable_resources; } - if (dev->power.power_state.event == PM_EVENT_SUSPEND) { - rc = ahci_reset_controller(host); - if (rc) - goto disable_resources; - - ahci_init_controller(host); - } - - ata_host_resume(host); + rc = ahci_platform_resume_host(dev); + if (rc) + goto disable_resources; return 0; @@ -502,9 +567,11 @@ disable_resources: return rc; } +EXPORT_SYMBOL_GPL(ahci_platform_resume); #endif -static SIMPLE_DEV_PM_OPS(ahci_pm_ops, ahci_suspend, ahci_resume); +static SIMPLE_DEV_PM_OPS(ahci_pm_ops, ahci_platform_suspend, + ahci_platform_resume); static const struct of_device_id ahci_of_match[] = { { .compatible = "snps,spear-ahci", }, diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index b80c51c20f76..542f2686eb1d 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -50,4 +50,9 @@ int ahci_platform_init_host(struct platform_device *pdev, unsigned int force_port_map, unsigned int mask_port_map); +int ahci_platform_suspend_host(struct device *dev); +int ahci_platform_resume_host(struct device *dev); +int ahci_platform_suspend(struct device *dev); +int ahci_platform_resume(struct device *dev); + #endif /* _AHCI_PLATFORM_H */ -- cgit v1.2.3 From 6ef95e87763fd69889655f4f14e499377a54d066 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 22 Feb 2014 17:22:55 +0100 Subject: ahci_platform: Drop unused ahci_platform_data members These members are not used anywhere, and in the future we want ahci_platform_data to go away entirely so there is no reason to keep these around. 
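For orientation, a minimal sketch of such a "new style" driver built purely on the exported ahci_platform_* helpers might look as follows; the compatible string, driver name and port_info settings are illustrative assumptions, and a production driver would also hook host_stop (as ahci_platform.c itself does) so its resources are released on teardown:

#include <linux/err.h>
#include <linux/module.h>
#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/libata.h>
#include <linux/ahci_platform.h>
#include "ahci.h"

static const struct ata_port_info my_ahci_port_info = {
	.flags		= AHCI_FLAG_COMMON,
	.pio_mask	= ATA_PIO4,
	.udma_mask	= ATA_UDMA6,
	.port_ops	= &ahci_ops,	/* illustrative choice */
};

static int my_ahci_probe(struct platform_device *pdev)
{
	struct ahci_host_priv *hpriv;
	int rc;

	/* Step 1: get mmio, clocks and regulator. */
	hpriv = ahci_platform_get_resources(pdev);
	if (IS_ERR(hpriv))
		return PTR_ERR(hpriv);

	/* Step 2: enable them, in any controller-specific order needed. */
	rc = ahci_platform_enable_resources(hpriv);
	if (rc)
		return rc;

	/* Controller-specific setup (PHY init, vendor registers) goes here. */

	/* Step 3: standard AHCI host bring-up. */
	rc = ahci_platform_init_host(pdev, hpriv, &my_ahci_port_info, 0, 0);
	if (rc)
		ahci_platform_disable_resources(hpriv);
	return rc;
}

static SIMPLE_DEV_PM_OPS(my_ahci_pm_ops, ahci_platform_suspend,
			 ahci_platform_resume);

static const struct of_device_id my_ahci_of_match[] = {
	{ .compatible = "vendor,my-ahci" },	/* hypothetical */
	{ },
};
MODULE_DEVICE_TABLE(of, my_ahci_of_match);

static struct platform_driver my_ahci_driver = {
	.probe	= my_ahci_probe,
	.remove	= ata_platform_remove_one,
	.driver	= {
		.name		= "my-ahci",
		.of_match_table	= my_ahci_of_match,
		.pm		= &my_ahci_pm_ops,
	},
};
module_platform_driver(my_ahci_driver);
MODULE_LICENSE("GPL");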
Signed-off-by: Hans de Goede Signed-off-by: Tejun Heo --- drivers/ata/ahci_platform.c | 10 +--------- include/linux/ahci_platform.h | 3 --- 2 files changed, 1 insertion(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c index 8fab4bf9042a..db24d2a08051 100644 --- a/drivers/ata/ahci_platform.c +++ b/drivers/ata/ahci_platform.c @@ -384,7 +384,6 @@ static int ahci_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ahci_platform_data *pdata = dev_get_platdata(dev); - const struct ata_port_info *pi_template; struct ahci_host_priv *hpriv; int rc; @@ -408,14 +407,7 @@ static int ahci_probe(struct platform_device *pdev) goto disable_resources; } - if (pdata && pdata->ata_port_info) - pi_template = pdata->ata_port_info; - else - pi_template = &ahci_port_info; - - rc = ahci_platform_init_host(pdev, hpriv, pi_template, - pdata ? pdata->force_port_map : 0, - pdata ? pdata->mask_port_map : 0); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_port_info, 0, 0); if (rc) goto pdata_exit; diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 542f2686eb1d..1f16d502600c 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -33,9 +33,6 @@ struct ahci_platform_data { void (*exit)(struct device *dev); int (*suspend)(struct device *dev); int (*resume)(struct device *dev); - const struct ata_port_info *ata_port_info; - unsigned int force_port_map; - unsigned int mask_port_map; }; int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); -- cgit v1.2.3 From 51b1130eb5823ddb90a9ad07d243031d8cb7ecf2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Jan 2014 11:49:39 -0800 Subject: rcutorture: Abstract rcu_torture_random() Because rcu_torture_random() will be used by the locking equivalent to rcutorture, pull it out into its own module. This new module cannot be separately configured, instead, use the Kconfig "select" statement from the Kconfig options of tests depending on it. Suggested-by: Rusty Russell Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 47 ++ kernel/Makefile | 1 + kernel/rcu/Makefile | 2 +- kernel/rcu/rcutorture.c | 2125 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/torture.c | 2148 ----------------------------------------------- kernel/torture.c | 71 ++ lib/Kconfig.debug | 5 + 7 files changed, 2250 insertions(+), 2149 deletions(-) create mode 100644 include/linux/torture.h create mode 100644 kernel/rcu/rcutorture.c delete mode 100644 kernel/rcu/torture.c create mode 100644 kernel/torture.c (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h new file mode 100644 index 000000000000..979e3e6b378a --- /dev/null +++ b/include/linux/torture.h @@ -0,0 +1,47 @@ +/* + * Common functions for in-kernel torture tests. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2014 + * + * Author: Paul E. McKenney + */ + +#ifndef __LINUX_TORTURE_H +#define __LINUX_TORTURE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct torture_random_state { + unsigned long trs_state; + long trs_count; +}; + +#define DEFINE_TORTURE_RANDOM(name) struct torture_random_state name = { 0, 0 } + +unsigned long torture_random(struct torture_random_state *trsp); + +#endif /* __LINUX_TORTURE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index bc010ee272b6..5c0e7666811d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o +obj-$(CONFIG_TORTURE_TEST) += torture.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 01e9ec37a3e3..807ccfbf69b3 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,5 +1,5 @@ obj-y += update.o srcu.o -obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c new file mode 100644 index 000000000000..94b1cd8b214c --- /dev/null +++ b/kernel/rcu/rcutorture.c @@ -0,0 +1,2125 @@ +/* + * Read-Copy Update module-based torture test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005, 2006 + * + * Authors: Paul E. McKenney + * Josh Triplett + * + * See also: Documentation/RCU/torture.txt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); + +MODULE_ALIAS("rcutorture"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcutorture." 
+ +static int fqs_duration; +module_param(fqs_duration, int, 0444); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); +static int fqs_holdoff; +module_param(fqs_holdoff, int, 0444); +MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +static int fqs_stutter = 3; +module_param(fqs_stutter, int, 0444); +MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +static bool gp_exp; +module_param(gp_exp, bool, 0444); +MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); +static bool gp_normal; +module_param(gp_normal, bool, 0444); +MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); +static int irqreader = 1; +module_param(irqreader, int, 0444); +MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +static int n_barrier_cbs; +module_param(n_barrier_cbs, int, 0444); +MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); +static int nfakewriters = 4; +module_param(nfakewriters, int, 0444); +MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); +static int nreaders = -1; +module_param(nreaders, int, 0444); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +static int object_debug; +module_param(object_debug, int, 0444); +MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); +static int onoff_holdoff; +module_param(onoff_holdoff, int, 0444); +MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); +static int onoff_interval; +module_param(onoff_interval, int, 0444); +MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +static int shuffle_interval = 3; +module_param(shuffle_interval, int, 0444); +MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); +static int shutdown_secs; +module_param(shutdown_secs, int, 0444); +MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); +static int stall_cpu; +module_param(stall_cpu, int, 0444); +MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); +static int stall_cpu_holdoff = 10; +module_param(stall_cpu_holdoff, int, 0444); +MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); +static int stat_interval = 60; +module_param(stat_interval, int, 0644); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +static int stutter = 5; +module_param(stutter, int, 0444); +MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); +static int test_boost = 1; +module_param(test_boost, int, 0444); +MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +static int test_boost_duration = 4; +module_param(test_boost_duration, int, 0444); +MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); +static int test_boost_interval = 7; +module_param(test_boost_interval, int, 0444); +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +static bool test_no_idle_hz = true; +module_param(test_no_idle_hz, bool, 0444); +MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +static char *torture_type = "rcu"; +module_param(torture_type, charp, 0444); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +static bool verbose; +module_param(verbose, bool, 0444); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); + +#define TORTURE_FLAG "-torture:" +#define PRINTK_STRING(s) \ + do { pr_alert("%s" TORTURE_FLAG s "\n", 
torture_type); } while (0) +#define VERBOSE_PRINTK_STRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) +#define VERBOSE_PRINTK_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) + +static int nrealreaders; +static struct task_struct *writer_task; +static struct task_struct **fakewriter_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *stats_task; +static struct task_struct *shuffler_task; +static struct task_struct *stutter_task; +static struct task_struct *fqs_task; +static struct task_struct *boost_tasks[NR_CPUS]; +static struct task_struct *shutdown_task; +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct *onoff_task; +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static struct task_struct *stall_task; +static struct task_struct **barrier_cbs_tasks; +static struct task_struct *barrier_task; + +#define RCU_TORTURE_PIPE_LEN 10 + +struct rcu_torture { + struct rcu_head rtort_rcu; + int rtort_pipe_count; + struct list_head rtort_free; + int rtort_mbtest; +}; + +static LIST_HEAD(rcu_torture_freelist); +static struct rcu_torture __rcu *rcu_torture_current; +static unsigned long rcu_torture_current_version; +static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; +static DEFINE_SPINLOCK(rcu_torture_lock); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_batch) = { 0 }; +static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +static atomic_t n_rcu_torture_alloc; +static atomic_t n_rcu_torture_alloc_fail; +static atomic_t n_rcu_torture_free; +static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_error; +static long n_rcu_torture_barrier_error; +static long n_rcu_torture_boost_ktrerror; +static long n_rcu_torture_boost_rterror; +static long n_rcu_torture_boost_failure; +static long n_rcu_torture_boosts; +static long n_rcu_torture_timers; +static long n_offline_attempts; +static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; +static long n_online_attempts; +static long n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; +static long n_barrier_attempts; +static long n_barrier_successes; +static struct list_head rcu_torture_removed; +static cpumask_var_t shuffle_tmp_mask; + +static int stutter_pause_test; + +#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) +#define RCUTORTURE_RUNNABLE_INIT 1 +#else +#define RCUTORTURE_RUNNABLE_INIT 0 +#endif +int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; +module_param(rcutorture_runnable, int, 0444); +MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); + +#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) +#define rcu_can_boost() 1 +#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#define rcu_can_boost() 0 +#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ + +#ifdef CONFIG_RCU_TRACE +static u64 notrace rcu_trace_clock_local(void) +{ + u64 ts = trace_clock_local(); + unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + return ts; +} +#else /* #ifdef CONFIG_RCU_TRACE */ +static u64 notrace rcu_trace_clock_local(void) +{ + return 0ULL; +} +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +static unsigned long shutdown_time; /* jiffies to system shutdown. 
*/ +static unsigned long boost_starttime; /* jiffies of next boost test start. */ +DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ + /* and boost task create/destroy. */ +static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static bool barrier_phase; /* Test phase. */ +static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ +static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ +static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); + +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ + +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ +static int fullstop = FULLSTOP_RMMOD; +/* + * Protect fullstop transitions and spawning of kthreads. + */ +static DEFINE_MUTEX(fullstop_mutex); + +/* Forward reference. */ +static void rcu_torture_cleanup(void); + +/* + * Detect and respond to a system shutdown. + */ +static int +rcutorture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_DONTSTOP) + fullstop = FULLSTOP_SHUTDOWN; + else + pr_warn(/* but going down anyway, so... */ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + return NOTIFY_DONE; +} + +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +static void rcutorture_shutdown_absorb(const char *title) +{ + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + pr_notice( + "rcutorture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} + +/* + * Allocate an element from the rcu_tortures pool. + */ +static struct rcu_torture * +rcu_torture_alloc(void) +{ + struct list_head *p; + + spin_lock_bh(&rcu_torture_lock); + if (list_empty(&rcu_torture_freelist)) { + atomic_inc(&n_rcu_torture_alloc_fail); + spin_unlock_bh(&rcu_torture_lock); + return NULL; + } + atomic_inc(&n_rcu_torture_alloc); + p = rcu_torture_freelist.next; + list_del_init(p); + spin_unlock_bh(&rcu_torture_lock); + return container_of(p, struct rcu_torture, rtort_free); +} + +/* + * Free an element to the rcu_tortures pool. + */ +static void +rcu_torture_free(struct rcu_torture *p) +{ + atomic_inc(&n_rcu_torture_free); + spin_lock_bh(&rcu_torture_lock); + list_add_tail(&p->rtort_free, &rcu_torture_freelist); + spin_unlock_bh(&rcu_torture_lock); +} + +static void +rcu_stutter_wait(const char *title) +{ + while (stutter_pause_test || !rcutorture_runnable) { + if (rcutorture_runnable) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(round_jiffies_relative(HZ)); + rcutorture_shutdown_absorb(title); + } +} + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_torture_ops { + void (*init)(void); + int (*readlock)(void); + void (*read_delay)(struct torture_random_state *rrsp); + void (*readunlock)(int idx); + int (*completed)(void); + void (*deferred_free)(struct rcu_torture *p); + void (*sync)(void); + void (*exp_sync)(void); + void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); + void (*cb_barrier)(void); + void (*fqs)(void); + void (*stats)(char *page); + int irq_capable; + int can_boost; + const char *name; +}; + +static struct rcu_torture_ops *cur_ops; + +/* + * Definitions for rcu torture testing. 
+ */ + +static int rcu_torture_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_read_delay(struct torture_random_state *rrsp) +{ + const unsigned long shortdelay_us = 200; + const unsigned long longdelay_ms = 50; + + /* We want a short delay sometimes to make a reader delay the grace + * period, and we want a long delay occasionally to trigger + * force_quiescent_state. */ + + if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!preempt_count() && + !(torture_random(rrsp) % (nrealreaders * 20000))) + preempt_schedule(); /* No QS if preempt_disable() in effect */ +#endif +} + +static void rcu_torture_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static int rcu_torture_completed(void) +{ + return rcu_batches_completed(); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop != FULLSTOP_DONTSTOP) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. */ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + rcu_torture_free(rp); + } else { + cur_ops->deferred_free(rp); + } +} + +static int rcu_no_completed(void) +{ + return 0; +} + +static void rcu_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu(&p->rtort_rcu, rcu_torture_cb); +} + +static void rcu_sync_torture_init(void) +{ + INIT_LIST_HEAD(&rcu_torture_removed); +} + +static struct rcu_torture_ops rcu_ops = { + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_torture_deferred_free, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .call = call_rcu, + .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .can_boost = rcu_can_boost(), + .name = "rcu" +}; + +/* + * Definitions for rcu_bh torture testing. + */ + +static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static int rcu_bh_torture_completed(void) +{ + return rcu_batches_completed_bh(); +} + +static void rcu_bh_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops rcu_bh_ops = { + .init = rcu_sync_torture_init, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_bh_torture_deferred_free, + .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, + .call = call_rcu_bh, + .cb_barrier = rcu_barrier_bh, + .fqs = rcu_bh_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh" +}; + +/* + * Definitions for srcu torture testing. 
+ */ + +DEFINE_STATIC_SRCU(srcu_ctl); + +static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) +{ + return srcu_read_lock(&srcu_ctl); +} + +static void srcu_read_delay(struct torture_random_state *rrsp) +{ + long delay; + const long uspertick = 1000000 / HZ; + const long longdelay = 10; + + /* We want there to be long-running readers, but not all the time. */ + + delay = torture_random(rrsp) % + (nrealreaders * 2 * longdelay * uspertick); + if (!delay) + schedule_timeout_interruptible(longdelay); + else + rcu_read_delay(rrsp); +} + +static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) +{ + srcu_read_unlock(&srcu_ctl, idx); +} + +static int srcu_torture_completed(void) +{ + return srcu_batches_completed(&srcu_ctl); +} + +static void srcu_torture_deferred_free(struct rcu_torture *rp) +{ + call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); +} + +static void srcu_torture_synchronize(void) +{ + synchronize_srcu(&srcu_ctl); +} + +static void srcu_torture_call(struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + call_srcu(&srcu_ctl, head, func); +} + +static void srcu_torture_barrier(void) +{ + srcu_barrier(&srcu_ctl); +} + +static void srcu_torture_stats(char *page) +{ + int cpu; + int idx = srcu_ctl.completed & 0x1; + + page += sprintf(page, "%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); + for_each_possible_cpu(cpu) { + page += sprintf(page, " %d(%lu,%lu)", cpu, + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); + } + sprintf(page, "\n"); +} + +static void srcu_torture_synchronize_expedited(void) +{ + synchronize_srcu_expedited(&srcu_ctl); +} + +static struct rcu_torture_ops srcu_ops = { + .init = rcu_sync_torture_init, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = srcu_torture_deferred_free, + .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, + .stats = srcu_torture_stats, + .name = "srcu" +}; + +/* + * Definitions for sched torture testing. + */ + +static int sched_torture_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_torture_read_unlock(int idx) +{ + preempt_enable(); +} + +static void rcu_sched_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops sched_ops = { + .init = rcu_sync_torture_init, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_sched_torture_deferred_free, + .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .call = call_rcu_sched, + .cb_barrier = rcu_barrier_sched, + .fqs = rcu_sched_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "sched" +}; + +/* + * RCU torture priority-boost testing. Runs one real-time thread per + * CPU for moderate bursts, repeatedly registering RCU callbacks and + * spinning waiting for them to be invoked. If a given callback takes + * too long to be invoked, we assume that priority inversion has occurred. 
+ */ + +struct rcu_boost_inflight { + struct rcu_head rcu; + int inflight; +}; + +static void rcu_torture_boost_cb(struct rcu_head *head) +{ + struct rcu_boost_inflight *rbip = + container_of(head, struct rcu_boost_inflight, rcu); + + smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ + rbip->inflight = 0; +} + +static int rcu_torture_boost(void *arg) +{ + unsigned long call_rcu_time; + unsigned long endtime; + unsigned long oldstarttime; + struct rcu_boost_inflight rbi = { .inflight = 0 }; + struct sched_param sp; + + VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + + /* Set real-time priority. */ + sp.sched_priority = 1; + if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { + VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + n_rcu_torture_boost_rterror++; + } + + init_rcu_head_on_stack(&rbi.rcu); + /* Each pass through the following loop does one boost-test cycle. */ + do { + /* Wait for the next test interval. */ + oldstarttime = boost_starttime; + while (ULONG_CMP_LT(jiffies, oldstarttime)) { + schedule_timeout_interruptible(oldstarttime - jiffies); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* Do one boost-test interval. */ + endtime = oldstarttime + test_boost_duration * HZ; + call_rcu_time = jiffies; + while (ULONG_CMP_LT(jiffies, endtime)) { + /* If we don't have a callback in flight, post one. */ + if (!rbi.inflight) { + smp_mb(); /* RCU core before ->inflight = 1. */ + rbi.inflight = 1; + call_rcu(&rbi.rcu, rcu_torture_boost_cb); + if (jiffies - call_rcu_time > + test_boost_duration * HZ - HZ / 2) { + VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + } + call_rcu_time = jiffies; + } + cond_resched(); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* + * Set the start time of the next test interval. + * Yes, this is vulnerable to long delays, but such + * delays simply cause a false negative for the next + * interval. Besides, we are running at RT priority, + * so delays should be relatively rare. + */ + while (oldstarttime == boost_starttime && + !kthread_should_stop()) { + if (mutex_trylock(&boost_mutex)) { + boost_starttime = jiffies + + test_boost_interval * HZ; + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + + /* Go do the stutter. */ +checkwait: rcu_stutter_wait("rcu_torture_boost"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + /* Clean up and exit. */ + VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + rcutorture_shutdown_absorb("rcu_torture_boost"); + while (!kthread_should_stop() || rbi.inflight) + schedule_timeout_uninterruptible(1); + smp_mb(); /* order accesses to ->inflight before stack-frame death. */ + destroy_rcu_head_on_stack(&rbi.rcu); + return 0; +} + +/* + * RCU torture force-quiescent-state kthread. Repeatedly induces + * bursts of calls to force_quiescent_state(), increasing the probability + * of occurrence of some important types of race conditions. 
+ */ +static int +rcu_torture_fqs(void *arg) +{ + unsigned long fqs_resume_time; + int fqs_burst_remaining; + + VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + do { + fqs_resume_time = jiffies + fqs_stutter * HZ; + while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + !kthread_should_stop()) { + schedule_timeout_interruptible(1); + } + fqs_burst_remaining = fqs_duration; + while (fqs_burst_remaining > 0 && + !kthread_should_stop()) { + cur_ops->fqs(); + udelay(fqs_holdoff); + fqs_burst_remaining -= fqs_holdoff; + } + rcu_stutter_wait("rcu_torture_fqs"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fqs"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). + */ +static int +rcu_torture_writer(void *arg) +{ + bool exp; + int i; + struct rcu_torture *rp; + struct rcu_torture *rp1; + struct rcu_torture *old_rp; + static DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1); + rp = rcu_torture_alloc(); + if (rp == NULL) + continue; + rp->rtort_pipe_count = 0; + udelay(torture_random(&rand) & 0x3ff); + old_rp = rcu_dereference_check(rcu_torture_current, + current == writer_task); + rp->rtort_mbtest = 1; + rcu_assign_pointer(rcu_torture_current, rp); + smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ + if (old_rp) { + i = old_rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + old_rp->rtort_pipe_count++; + if (gp_normal == gp_exp) + exp = !!(torture_random(&rand) & 0x80); + else + exp = gp_exp; + if (!exp) { + cur_ops->deferred_free(old_rp); + } else { + cur_ops->exp_sync(); + list_add(&old_rp->rtort_free, + &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, + &rcu_torture_removed, + rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= + RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } + } + } + rcutorture_record_progress(++rcu_torture_current_version); + rcu_stutter_wait("rcu_torture_writer"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + rcutorture_shutdown_absorb("rcu_torture_writer"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture fake writer kthread. Repeatedly calls sync, with a random + * delay between calls. 
+ */ +static int +rcu_torture_fakewriter(void *arg) +{ + DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1 + torture_random(&rand)%10); + udelay(torture_random(&rand) & 0x3ff); + if (cur_ops->cb_barrier != NULL && + torture_random(&rand) % (nfakewriters * 8) == 0) { + cur_ops->cb_barrier(); + } else if (gp_normal == gp_exp) { + if (torture_random(&rand) & 0x80) + cur_ops->sync(); + else + cur_ops->exp_sync(); + } else if (gp_normal) { + cur_ops->sync(); + } else { + cur_ops->exp_sync(); + } + rcu_stutter_wait("rcu_torture_fakewriter"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fakewriter"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +void rcutorture_trace_dump(void) +{ + static atomic_t beenhere = ATOMIC_INIT(0); + + if (atomic_read(&beenhere)) + return; + if (atomic_xchg(&beenhere, 1) != 0) + return; + ftrace_dump(DUMP_ALL); +} + +/* + * RCU torture reader from timer handler. Dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static void rcu_torture_timer(unsigned long unused) +{ + int idx; + int completed; + int completed_end; + static DEFINE_TORTURE_RANDOM(rand); + static DEFINE_SPINLOCK(rand_lock); + struct rcu_torture *p; + int pipe_count; + unsigned long long ts; + + idx = cur_ops->readlock(); + completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); + if (p == NULL) { + /* Leave because rcu_torture_writer is not yet underway */ + cur_ops->readunlock(idx); + return; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + spin_lock(&rand_lock); + cur_ops->read_delay(&rand); + n_rcu_torture_timers++; + spin_unlock(&rand_lock); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, + completed, completed_end); + rcutorture_trace_dump(); + } + __this_cpu_inc(rcu_torture_count[pipe_count]); + completed = completed_end - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_batch[completed]); + preempt_enable(); + cur_ops->readunlock(idx); +} + +/* + * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. 
+ */ +static int +rcu_torture_reader(void *arg) +{ + int completed; + int completed_end; + int idx; + DEFINE_TORTURE_RANDOM(rand); + struct rcu_torture *p; + int pipe_count; + struct timer_list t; + unsigned long long ts; + + VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); + set_user_nice(current, 19); + if (irqreader && cur_ops->irq_capable) + setup_timer_on_stack(&t, rcu_torture_timer, 0); + + do { + if (irqreader && cur_ops->irq_capable) { + if (!timer_pending(&t)) + mod_timer(&t, jiffies + 1); + } + idx = cur_ops->readlock(); + completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + cur_ops->readunlock(idx); + schedule_timeout_interruptible(HZ); + continue; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + cur_ops->read_delay(&rand); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, + ts, completed, completed_end); + rcutorture_trace_dump(); + } + __this_cpu_inc(rcu_torture_count[pipe_count]); + completed = completed_end - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_batch[completed]); + preempt_enable(); + cur_ops->readunlock(idx); + schedule(); + rcu_stutter_wait("rcu_torture_reader"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + rcutorture_shutdown_absorb("rcu_torture_reader"); + if (irqreader && cur_ops->irq_capable) + del_timer_sync(&t); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * Create an RCU-torture statistics message in the specified buffer. 
+ */ +static void +rcu_torture_printk(char *page) +{ + int cpu; + int i; + long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + + for_each_possible_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; + batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + } + } + for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + if (pipesummary[i] != 0) + break; + } + page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free)); + page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", + atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_boost_ktrerror, + n_rcu_torture_boost_rterror); + page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", + n_rcu_torture_boost_failure, + n_rcu_torture_boosts, + n_rcu_torture_timers); + page += sprintf(page, + "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); + page += sprintf(page, "barrier: %ld/%ld:%ld", + n_barrier_successes, + n_barrier_attempts, + n_rcu_torture_barrier_error); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + if (atomic_read(&n_rcu_torture_mberror) != 0 || + n_rcu_torture_barrier_error != 0 || + n_rcu_torture_boost_ktrerror != 0 || + n_rcu_torture_boost_rterror != 0 || + n_rcu_torture_boost_failure != 0 || + i > 1) { + page += sprintf(page, "!!! "); + atomic_inc(&n_rcu_torture_error); + WARN_ON_ONCE(1); + } + page += sprintf(page, "Reader Pipe: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + page += sprintf(page, " %ld", pipesummary[i]); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "Reader Batch: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + page += sprintf(page, " %ld", batchsummary[i]); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "Free-Block Circulation: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + page += sprintf(page, " %d", + atomic_read(&rcu_torture_wcount[i])); + } + page += sprintf(page, "\n"); + if (cur_ops->stats) + cur_ops->stats(page); +} + +/* + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). + */ +static void +rcu_torture_stats_print(void) +{ + int size = nr_cpu_ids * 200 + 8192; + char *buf; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("rcu-torture: Out of memory, need: %d", size); + return; + } + rcu_torture_printk(buf); + pr_alert("%s", buf); + kfree(buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. 
+ */ +static int +rcu_torture_stats(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + rcu_torture_stats_print(); + rcutorture_shutdown_absorb("rcu_torture_stats"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); + return 0; +} + +static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ + +/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case + * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. + */ +static void rcu_torture_shuffle_tasks(void) +{ + int i; + + cpumask_setall(shuffle_tmp_mask); + get_online_cpus(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + put_online_cpus(); + return; + } + + if (rcu_idle_cpu != -1) + cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); + + set_cpus_allowed_ptr(current, shuffle_tmp_mask); + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + if (reader_tasks[i]) + set_cpus_allowed_ptr(reader_tasks[i], + shuffle_tmp_mask); + } + if (fakewriter_tasks) { + for (i = 0; i < nfakewriters; i++) + if (fakewriter_tasks[i]) + set_cpus_allowed_ptr(fakewriter_tasks[i], + shuffle_tmp_mask); + } + if (writer_task) + set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); + if (stats_task) + set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); + if (stutter_task) + set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); + if (fqs_task) + set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); + if (shutdown_task) + set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task) + set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + if (stall_task) + set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); + if (barrier_cbs_tasks) + for (i = 0; i < n_barrier_cbs; i++) + if (barrier_cbs_tasks[i]) + set_cpus_allowed_ptr(barrier_cbs_tasks[i], + shuffle_tmp_mask); + if (barrier_task) + set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); + + if (rcu_idle_cpu == -1) + rcu_idle_cpu = num_online_cpus() - 1; + else + rcu_idle_cpu--; + + put_online_cpus(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU. + */ +static int +rcu_torture_shuffle(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval * HZ); + rcu_torture_shuffle_tasks(); + rcutorture_shutdown_absorb("rcu_torture_shuffle"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); + return 0; +} + +/* Cause the rcutorture test to "stutter", starting and stopping all + * threads periodically. 
+ */ +static int +rcu_torture_stutter(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); + do { + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 1; + if (!kthread_should_stop()) + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 0; + rcutorture_shutdown_absorb("rcu_torture_stutter"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); + return 0; +} + +static inline void +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) +{ + pr_alert("%s" TORTURE_FLAG + "--- %s: nreaders=%d nfakewriters=%d " + "stat_interval=%d verbose=%d test_no_idle_hz=%d " + "shuffle_interval=%d stutter=%d irqreader=%d " + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " + "test_boost=%d/%d test_boost_interval=%d " + "test_boost_duration=%d shutdown_secs=%d " + "stall_cpu=%d stall_cpu_holdoff=%d " + "n_barrier_cbs=%d " + "onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, nrealreaders, nfakewriters, + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, + test_boost, cur_ops->can_boost, + test_boost_interval, test_boost_duration, shutdown_secs, + stall_cpu, stall_cpu_holdoff, + n_barrier_cbs, + onoff_interval, onoff_holdoff); +} + +static struct notifier_block rcutorture_shutdown_nb = { + .notifier_call = rcutorture_shutdown_notify, +}; + +static void rcutorture_booster_cleanup(int cpu) +{ + struct task_struct *t; + + if (boost_tasks[cpu] == NULL) + return; + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); + t = boost_tasks[cpu]; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + + /* This must be outside of the mutex, otherwise deadlock! */ + kthread_stop(t); + boost_tasks[cpu] = NULL; +} + +static int rcutorture_booster_init(int cpu) +{ + int retval; + + if (boost_tasks[cpu] != NULL) + return 0; /* Already created, nothing more to do. */ + + /* Don't allow time recalculation while creating a new task. */ + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); + boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, + cpu_to_node(cpu), + "rcu_torture_boost"); + if (IS_ERR(boost_tasks[cpu])) { + retval = PTR_ERR(boost_tasks[cpu]); + VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); + n_rcu_torture_boost_ktrerror++; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + return retval; + } + kthread_bind(boost_tasks[cpu], cpu); + wake_up_process(boost_tasks[cpu]); + mutex_unlock(&boost_mutex); + return 0; +} + +/* + * Cause the rcutorture test to shutdown the system after the test has + * run for the time specified by the shutdown_secs module parameter. + */ +static int +rcu_torture_shutdown(void *arg) +{ + long delta; + unsigned long jiffies_snap; + + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); + jiffies_snap = ACCESS_ONCE(jiffies); + while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + !kthread_should_stop()) { + delta = shutdown_time - jiffies_snap; + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_shutdown task: %lu jiffies remaining\n", + torture_type, delta); + schedule_timeout_interruptible(delta); + jiffies_snap = ACCESS_ONCE(jiffies); + } + if (kthread_should_stop()) { + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); + return 0; + } + + /* OK, shut down the system. 
*/ + + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); + shutdown_task = NULL; /* Avoid self-kill deadlock. */ + rcu_torture_cleanup(); /* Get the success/failure message. */ + kernel_power_off(); /* Shut down the system. */ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Execute random CPU-hotplug operations at the interval specified + * by the onoff_interval. + */ +static int +rcu_torture_onoff(void *arg) +{ + int cpu; + unsigned long delta; + int maxcpu = -1; + DEFINE_TORTURE_RANDOM(rand); + int ret; + unsigned long starttime; + + VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (onoff_holdoff > 0) { + VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); + schedule_timeout_interruptible(onoff_holdoff * HZ); + VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); + } + while (!kthread_should_stop()) { + cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); + if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_offline_attempts++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlined %d\n", + torture_type, cpu); + n_offline_successes++; + delta = jiffies - starttime; + sum_offline += delta; + if (min_offline < 0) { + min_offline = delta; + max_offline = delta; + } + if (min_offline > delta) + min_offline = delta; + if (max_offline < delta) + max_offline = delta; + } + } else if (cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_online_attempts++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlined %d\n", + torture_type, cpu); + n_online_successes++; + delta = jiffies - starttime; + sum_online += delta; + if (min_online < 0) { + min_online = delta; + max_online = delta; + } + if (min_online > delta) + min_online = delta; + if (max_online < delta) + max_online = delta; + } + } + schedule_timeout_interruptible(onoff_interval * HZ); + } + VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); + return 0; +} + +static int +rcu_torture_onoff_init(void) +{ + int ret; + + if (onoff_interval <= 0) + return 0; + onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); + if (IS_ERR(onoff_task)) { + ret = PTR_ERR(onoff_task); + onoff_task = NULL; + return ret; + } + return 0; +} + +static void rcu_torture_onoff_cleanup(void) +{ + if (onoff_task == NULL) + return; + VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); + kthread_stop(onoff_task); + onoff_task = NULL; +} + +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + +static int +rcu_torture_onoff_init(void) +{ + return 0; +} + +static void rcu_torture_onoff_cleanup(void) +{ +} + +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then + * induces a CPU stall for the time specified by stall_cpu. 
+ */ +static int rcu_torture_stall(void *args) +{ + unsigned long stop_at; + + VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); + if (stall_cpu_holdoff > 0) { + VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); + schedule_timeout_interruptible(stall_cpu_holdoff * HZ); + VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); + } + if (!kthread_should_stop()) { + stop_at = get_seconds() + stall_cpu; + /* RCU CPU stall is expected behavior in following code. */ + pr_alert("rcu_torture_stall start.\n"); + rcu_read_lock(); + preempt_disable(); + while (ULONG_CMP_LT(get_seconds(), stop_at)) + continue; /* Induce RCU CPU stall warning. */ + preempt_enable(); + rcu_read_unlock(); + pr_alert("rcu_torture_stall end.\n"); + } + rcutorture_shutdown_absorb("rcu_torture_stall"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(10 * HZ); + return 0; +} + +/* Spawn CPU-stall kthread, if stall_cpu specified. */ +static int __init rcu_torture_stall_init(void) +{ + int ret; + + if (stall_cpu <= 0) + return 0; + stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); + if (IS_ERR(stall_task)) { + ret = PTR_ERR(stall_task); + stall_task = NULL; + return ret; + } + return 0; +} + +/* Clean up after the CPU-stall kthread, if one was spawned. */ +static void rcu_torture_stall_cleanup(void) +{ + if (stall_task == NULL) + return; + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); + kthread_stop(stall_task); + stall_task = NULL; +} + +/* Callback function for RCU barrier testing. */ +void rcu_torture_barrier_cbf(struct rcu_head *rcu) +{ + atomic_inc(&barrier_cbs_invoked); +} + +/* kthread function to register callbacks used to test RCU barriers. */ +static int rcu_torture_barrier_cbs(void *arg) +{ + long myid = (long)arg; + bool lastphase = 0; + bool newphase; + struct rcu_head rcu; + + init_rcu_head_on_stack(&rcu); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); + set_user_nice(current, 19); + do { + wait_event(barrier_cbs_wq[myid], + (newphase = + ACCESS_ONCE(barrier_phase)) != lastphase || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + lastphase = newphase; + smp_mb(); /* ensure barrier_phase load before ->call(). */ + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + cur_ops->call(&rcu, rcu_torture_barrier_cbf); + if (atomic_dec_and_test(&barrier_cbs_count)) + wake_up(&barrier_wq); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + cur_ops->cb_barrier(); + destroy_rcu_head_on_stack(&rcu); + return 0; +} + +/* kthread function to drive and coordinate RCU barrier testing. */ +static int rcu_torture_barrier(void *arg) +{ + int i; + + VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); + do { + atomic_set(&barrier_cbs_invoked, 0); + atomic_set(&barrier_cbs_count, n_barrier_cbs); + smp_mb(); /* Ensure barrier_phase after prior assignments. */ + barrier_phase = !barrier_phase; + for (i = 0; i < n_barrier_cbs; i++) + wake_up(&barrier_cbs_wq[i]); + wait_event(barrier_wq, + atomic_read(&barrier_cbs_count) == 0 || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + n_barrier_attempts++; + cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). 
*/ + if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { + n_rcu_torture_barrier_error++; + WARN_ON_ONCE(1); + } + n_barrier_successes++; + schedule_timeout_interruptible(HZ / 10); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + return 0; +} + +/* Initialize RCU barrier testing. */ +static int rcu_torture_barrier_init(void) +{ + int i; + int ret; + + if (n_barrier_cbs == 0) + return 0; + if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { + pr_alert("%s" TORTURE_FLAG + " Call or barrier ops missing for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " RCU barrier testing omitted from run.\n", + torture_type); + return 0; + } + atomic_set(&barrier_cbs_count, 0); + atomic_set(&barrier_cbs_invoked, 0); + barrier_cbs_tasks = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), + GFP_KERNEL); + barrier_cbs_wq = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), + GFP_KERNEL); + if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) + return -ENOMEM; + for (i = 0; i < n_barrier_cbs; i++) { + init_waitqueue_head(&barrier_cbs_wq[i]); + barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, + (void *)(long)i, + "rcu_torture_barrier_cbs"); + if (IS_ERR(barrier_cbs_tasks[i])) { + ret = PTR_ERR(barrier_cbs_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); + barrier_cbs_tasks[i] = NULL; + return ret; + } + } + barrier_task = kthread_run(rcu_torture_barrier, NULL, + "rcu_torture_barrier"); + if (IS_ERR(barrier_task)) { + ret = PTR_ERR(barrier_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); + barrier_task = NULL; + } + return 0; +} + +/* Clean up after RCU barrier testing. */ +static void rcu_torture_barrier_cleanup(void) +{ + int i; + + if (barrier_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); + kthread_stop(barrier_task); + barrier_task = NULL; + } + if (barrier_cbs_tasks != NULL) { + for (i = 0; i < n_barrier_cbs; i++) { + if (barrier_cbs_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); + kthread_stop(barrier_cbs_tasks[i]); + barrier_cbs_tasks[i] = NULL; + } + } + kfree(barrier_cbs_tasks); + barrier_cbs_tasks = NULL; + } + if (barrier_cbs_wq != NULL) { + kfree(barrier_cbs_wq); + barrier_cbs_wq = NULL; + } +} + +static int rcutorture_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + (void)rcutorture_booster_init(cpu); + break; + case CPU_DOWN_PREPARE: + rcutorture_booster_cleanup(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block rcutorture_cpu_nb = { + .notifier_call = rcutorture_cpu_notify, +}; + +static void +rcu_torture_cleanup(void) +{ + int i; + + mutex_lock(&fullstop_mutex); + rcutorture_record_test_transition(); + if (fullstop == FULLSTOP_SHUTDOWN) { + pr_warn(/* but going down anyway, so... 
*/ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + schedule_timeout_uninterruptible(10); + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + return; + } + fullstop = FULLSTOP_RMMOD; + mutex_unlock(&fullstop_mutex); + unregister_reboot_notifier(&rcutorture_shutdown_nb); + rcu_torture_barrier_cleanup(); + rcu_torture_stall_cleanup(); + if (stutter_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); + kthread_stop(stutter_task); + } + stutter_task = NULL; + if (shuffler_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + kthread_stop(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + } + shuffler_task = NULL; + + if (writer_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + kthread_stop(writer_task); + } + writer_task = NULL; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) { + if (reader_tasks[i]) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_reader task"); + kthread_stop(reader_tasks[i]); + } + reader_tasks[i] = NULL; + } + kfree(reader_tasks); + reader_tasks = NULL; + } + rcu_torture_current = NULL; + + if (fakewriter_tasks) { + for (i = 0; i < nfakewriters; i++) { + if (fakewriter_tasks[i]) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_fakewriter task"); + kthread_stop(fakewriter_tasks[i]); + } + fakewriter_tasks[i] = NULL; + } + kfree(fakewriter_tasks); + fakewriter_tasks = NULL; + } + + if (stats_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + kthread_stop(stats_task); + } + stats_task = NULL; + + if (fqs_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); + kthread_stop(fqs_task); + } + fqs_task = NULL; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + unregister_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) + rcutorture_booster_cleanup(i); + } + if (shutdown_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); + kthread_stop(shutdown_task); + } + shutdown_task = NULL; + rcu_torture_onoff_cleanup(); + + /* Wait for all RCU callbacks to fire. */ + + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + + rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) + rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); + else if (n_online_successes != n_online_attempts || + n_offline_successes != n_offline_attempts) + rcu_torture_print_module_parms(cur_ops, + "End of test: RCU_HOTPLUG"); + else + rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); +} + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static void rcu_torture_leak_cb(struct rcu_head *rhp) +{ +} + +static void rcu_torture_err_cb(struct rcu_head *rhp) +{ + /* + * This -might- happen due to race conditions, but is unlikely. + * The scenario that leads to this happening is that the + * first of the pair of duplicate callbacks is queued, + * someone else starts a grace period that includes that + * callback, then the second of the pair must wait for the + * next grace period. Unlikely, but can happen. If it + * does happen, the debug-objects subsystem won't have splatted. + */ + pr_alert("rcutorture: duplicated callback was invoked.\n"); +} +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +/* + * Verify that double-free causes debug-objects to complain, but only + * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test + * cannot be carried out. 
+ */ +static void rcu_test_debug_objects(void) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + struct rcu_head rh1; + struct rcu_head rh2; + + init_rcu_head_on_stack(&rh1); + init_rcu_head_on_stack(&rh2); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + + /* Try to queue the rh2 pair of callbacks for the same grace period. */ + preempt_disable(); /* Prevent preemption from interrupting test. */ + rcu_read_lock(); /* Make it impossible to finish a grace period. */ + call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + local_irq_disable(); /* Make it harder to start a new grace period. */ + call_rcu(&rh2, rcu_torture_leak_cb); + call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + local_irq_enable(); + rcu_read_unlock(); + preempt_enable(); + + /* Wait for them all to get done so we can safely return. */ + rcu_barrier(); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + destroy_rcu_head_on_stack(&rh1); + destroy_rcu_head_on_stack(&rh2); +#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); +#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +} + +static int __init +rcu_torture_init(void) +{ + int i; + int cpu; + int firsterr = 0; + int retval; + static struct rcu_torture_ops *torture_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + }; + + mutex_lock(&fullstop_mutex); + + /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(torture_ops)) { + pr_alert("rcu-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("rcu-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); + mutex_unlock(&fullstop_mutex); + return -EINVAL; + } + if (cur_ops->fqs == NULL && fqs_duration != 0) { + pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); + fqs_duration = 0; + } + if (cur_ops->init) + cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + + if (nreaders >= 0) + nrealreaders = nreaders; + else + nrealreaders = 2 * num_online_cpus(); + rcu_torture_print_module_parms(cur_ops, "Start of test"); + fullstop = FULLSTOP_DONTSTOP; + + /* Set up the freelist. */ + + INIT_LIST_HEAD(&rcu_torture_freelist); + for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { + rcu_tortures[i].rtort_mbtest = 0; + list_add_tail(&rcu_tortures[i].rtort_free, + &rcu_torture_freelist); + } + + /* Initialize the statistics so that each run gets its own numbers. */ + + rcu_torture_current = NULL; + rcu_torture_current_version = 0; + atomic_set(&n_rcu_torture_alloc, 0); + atomic_set(&n_rcu_torture_alloc_fail, 0); + atomic_set(&n_rcu_torture_free, 0); + atomic_set(&n_rcu_torture_mberror, 0); + atomic_set(&n_rcu_torture_error, 0); + n_rcu_torture_barrier_error = 0; + n_rcu_torture_boost_ktrerror = 0; + n_rcu_torture_boost_rterror = 0; + n_rcu_torture_boost_failure = 0; + n_rcu_torture_boosts = 0; + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + atomic_set(&rcu_torture_wcount[i], 0); + for_each_possible_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + per_cpu(rcu_torture_count, cpu)[i] = 0; + per_cpu(rcu_torture_batch, cpu)[i] = 0; + } + } + + /* Start up the kthreads. 
*/ + + VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); + writer_task = kthread_create(rcu_torture_writer, NULL, + "rcu_torture_writer"); + if (IS_ERR(writer_task)) { + firsterr = PTR_ERR(writer_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); + writer_task = NULL; + goto unwind; + } + wake_up_process(writer_task); + fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), + GFP_KERNEL); + if (fakewriter_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nfakewriters; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); + fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, + "rcu_torture_fakewriter"); + if (IS_ERR(fakewriter_tasks[i])) { + firsterr = PTR_ERR(fakewriter_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); + fakewriter_tasks[i] = NULL; + goto unwind; + } + } + reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); + reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, + "rcu_torture_reader"); + if (IS_ERR(reader_tasks[i])) { + firsterr = PTR_ERR(reader_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); + reader_tasks[i] = NULL; + goto unwind; + } + } + if (stat_interval > 0) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); + stats_task = kthread_run(rcu_torture_stats, NULL, + "rcu_torture_stats"); + if (IS_ERR(stats_task)) { + firsterr = PTR_ERR(stats_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); + stats_task = NULL; + goto unwind; + } + } + if (test_no_idle_hz) { + rcu_idle_cpu = num_online_cpus() - 1; + + if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { + firsterr = -ENOMEM; + VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); + goto unwind; + } + + /* Create the shuffler thread */ + shuffler_task = kthread_run(rcu_torture_shuffle, NULL, + "rcu_torture_shuffle"); + if (IS_ERR(shuffler_task)) { + free_cpumask_var(shuffle_tmp_mask); + firsterr = PTR_ERR(shuffler_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); + shuffler_task = NULL; + goto unwind; + } + } + if (stutter < 0) + stutter = 0; + if (stutter) { + /* Create the stutter thread */ + stutter_task = kthread_run(rcu_torture_stutter, NULL, + "rcu_torture_stutter"); + if (IS_ERR(stutter_task)) { + firsterr = PTR_ERR(stutter_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); + stutter_task = NULL; + goto unwind; + } + } + if (fqs_duration < 0) + fqs_duration = 0; + if (fqs_duration) { + /* Create the stutter thread */ + fqs_task = kthread_run(rcu_torture_fqs, NULL, + "rcu_torture_fqs"); + if (IS_ERR(fqs_task)) { + firsterr = PTR_ERR(fqs_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); + fqs_task = NULL; + goto unwind; + } + } + if (test_boost_interval < 1) + test_boost_interval = 1; + if (test_boost_duration < 2) + test_boost_duration = 2; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + + boost_starttime = jiffies + test_boost_interval * HZ; + register_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) { + if (cpu_is_offline(i)) + continue; /* Heuristic: CPU can go offline. 
*/ + retval = rcutorture_booster_init(i); + if (retval < 0) { + firsterr = retval; + goto unwind; + } + } + } + if (shutdown_secs > 0) { + shutdown_time = jiffies + shutdown_secs * HZ; + shutdown_task = kthread_create(rcu_torture_shutdown, NULL, + "rcu_torture_shutdown"); + if (IS_ERR(shutdown_task)) { + firsterr = PTR_ERR(shutdown_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); + shutdown_task = NULL; + goto unwind; + } + wake_up_process(shutdown_task); + } + i = rcu_torture_onoff_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } + register_reboot_notifier(&rcutorture_shutdown_nb); + i = rcu_torture_stall_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } + retval = rcu_torture_barrier_init(); + if (retval != 0) { + firsterr = retval; + goto unwind; + } + if (object_debug) + rcu_test_debug_objects(); + rcutorture_record_test_transition(); + mutex_unlock(&fullstop_mutex); + return 0; + +unwind: + mutex_unlock(&fullstop_mutex); + rcu_torture_cleanup(); + return firsterr; +} + +module_init(rcu_torture_init); +module_exit(rcu_torture_cleanup); diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c deleted file mode 100644 index dad67238d086..000000000000 --- a/kernel/rcu/torture.c +++ /dev/null @@ -1,2148 +0,0 @@ -/* - * Read-Copy Update module-based torture test facility - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2005, 2006 - * - * Authors: Paul E. McKenney - * Josh Triplett - * - * See also: Documentation/RCU/torture.txt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); - -MODULE_ALIAS("rcutorture"); -#ifdef MODULE_PARAM_PREFIX -#undef MODULE_PARAM_PREFIX -#endif -#define MODULE_PARAM_PREFIX "rcutorture." 
- -static int fqs_duration; -module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); -static int fqs_holdoff; -module_param(fqs_holdoff, int, 0444); -MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); -static int fqs_stutter = 3; -module_param(fqs_stutter, int, 0444); -MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -static bool gp_exp; -module_param(gp_exp, bool, 0444); -MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); -static bool gp_normal; -module_param(gp_normal, bool, 0444); -MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); -static int irqreader = 1; -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); -static int n_barrier_cbs; -module_param(n_barrier_cbs, int, 0444); -MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); -static int nfakewriters = 4; -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -static int nreaders = -1; -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -static int object_debug; -module_param(object_debug, int, 0444); -MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); -static int onoff_holdoff; -module_param(onoff_holdoff, int, 0444); -MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); -static int onoff_interval; -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); -static int shuffle_interval = 3; -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -static int shutdown_secs; -module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); -static int stall_cpu; -module_param(stall_cpu, int, 0444); -MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); -static int stall_cpu_holdoff = 10; -module_param(stall_cpu_holdoff, int, 0444); -MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); -static int stat_interval = 60; -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -static int stutter = 5; -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -static int test_boost = 1; -module_param(test_boost, int, 0444); -MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -static int test_boost_duration = 4; -module_param(test_boost_duration, int, 0444); -MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); -static int test_boost_interval = 7; -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); -static bool test_no_idle_hz = true; -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); -static char *torture_type = "rcu"; -module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -static bool verbose; -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); - -#define TORTURE_FLAG "-torture:" -#define PRINTK_STRING(s) \ - do { pr_alert("%s" TORTURE_FLAG s "\n", 
torture_type); } while (0) -#define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) - -static int nrealreaders; -static struct task_struct *writer_task; -static struct task_struct **fakewriter_tasks; -static struct task_struct **reader_tasks; -static struct task_struct *stats_task; -static struct task_struct *shuffler_task; -static struct task_struct *stutter_task; -static struct task_struct *fqs_task; -static struct task_struct *boost_tasks[NR_CPUS]; -static struct task_struct *shutdown_task; -#ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *onoff_task; -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static struct task_struct *stall_task; -static struct task_struct **barrier_cbs_tasks; -static struct task_struct *barrier_task; - -#define RCU_TORTURE_PIPE_LEN 10 - -struct rcu_torture { - struct rcu_head rtort_rcu; - int rtort_pipe_count; - struct list_head rtort_free; - int rtort_mbtest; -}; - -static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture __rcu *rcu_torture_current; -static unsigned long rcu_torture_current_version; -static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; -static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_batch) = { 0 }; -static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; -static atomic_t n_rcu_torture_alloc; -static atomic_t n_rcu_torture_alloc_fail; -static atomic_t n_rcu_torture_free; -static atomic_t n_rcu_torture_mberror; -static atomic_t n_rcu_torture_error; -static long n_rcu_torture_barrier_error; -static long n_rcu_torture_boost_ktrerror; -static long n_rcu_torture_boost_rterror; -static long n_rcu_torture_boost_failure; -static long n_rcu_torture_boosts; -static long n_rcu_torture_timers; -static long n_offline_attempts; -static long n_offline_successes; -static unsigned long sum_offline; -static int min_offline = -1; -static int max_offline; -static long n_online_attempts; -static long n_online_successes; -static unsigned long sum_online; -static int min_online = -1; -static int max_online; -static long n_barrier_attempts; -static long n_barrier_successes; -static struct list_head rcu_torture_removed; -static cpumask_var_t shuffle_tmp_mask; - -static int stutter_pause_test; - -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -module_param(rcutorture_runnable, int, 0444); -MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); - -#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) -#define rcu_can_boost() 1 -#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ -#define rcu_can_boost() 0 -#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ - -#ifdef CONFIG_RCU_TRACE -static u64 notrace rcu_trace_clock_local(void) -{ - u64 ts = trace_clock_local(); - unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); - return ts; -} -#else /* #ifdef CONFIG_RCU_TRACE */ -static u64 notrace rcu_trace_clock_local(void) -{ - return 0ULL; -} -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - -static unsigned long shutdown_time; /* jiffies to system shutdown. 
*/ -static unsigned long boost_starttime; /* jiffies of next boost test start. */ -DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ - /* and boost task create/destroy. */ -static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ -static bool barrier_phase; /* Test phase. */ -static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ -static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ -static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); - -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ - -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -static int fullstop = FULLSTOP_RMMOD; -/* - * Protect fullstop transitions and spawning of kthreads. - */ -static DEFINE_MUTEX(fullstop_mutex); - -/* Forward reference. */ -static void rcu_torture_cleanup(void); - -/* - * Detect and respond to a system shutdown. - */ -static int -rcutorture_shutdown_notify(struct notifier_block *unused1, - unsigned long unused2, void *unused3) -{ - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) - fullstop = FULLSTOP_SHUTDOWN; - else - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - return NOTIFY_DONE; -} - -/* - * Absorb kthreads into a kernel function that won't return, so that - * they won't ever access module text or data again. - */ -static void rcutorture_shutdown_absorb(const char *title) -{ - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "rcutorture thread %s parking due to system shutdown\n", - title); - schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); - } -} - -/* - * Allocate an element from the rcu_tortures pool. - */ -static struct rcu_torture * -rcu_torture_alloc(void) -{ - struct list_head *p; - - spin_lock_bh(&rcu_torture_lock); - if (list_empty(&rcu_torture_freelist)) { - atomic_inc(&n_rcu_torture_alloc_fail); - spin_unlock_bh(&rcu_torture_lock); - return NULL; - } - atomic_inc(&n_rcu_torture_alloc); - p = rcu_torture_freelist.next; - list_del_init(p); - spin_unlock_bh(&rcu_torture_lock); - return container_of(p, struct rcu_torture, rtort_free); -} - -/* - * Free an element to the rcu_tortures pool. - */ -static void -rcu_torture_free(struct rcu_torture *p) -{ - atomic_inc(&n_rcu_torture_free); - spin_lock_bh(&rcu_torture_lock); - list_add_tail(&p->rtort_free, &rcu_torture_freelist); - spin_unlock_bh(&rcu_torture_lock); -} - -struct rcu_random_state { - unsigned long rrs_state; - long rrs_count; -}; - -#define RCU_RANDOM_MULT 39916801 /* prime */ -#define RCU_RANDOM_ADD 479001701 /* prime */ -#define RCU_RANDOM_REFRESH 10000 - -#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } - -/* - * Crude but fast random-number generator. Uses a linear congruential - * generator, with occasional help from cpu_clock(). 
- */ -static unsigned long -rcu_random(struct rcu_random_state *rrsp) -{ - if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += (unsigned long)local_clock(); - rrsp->rrs_count = RCU_RANDOM_REFRESH; - } - rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; - return swahw32(rrsp->rrs_state); -} - -static void -rcu_stutter_wait(const char *title) -{ - while (stutter_pause_test || !rcutorture_runnable) { - if (rcutorture_runnable) - schedule_timeout_interruptible(1); - else - schedule_timeout_interruptible(round_jiffies_relative(HZ)); - rcutorture_shutdown_absorb(title); - } -} - -/* - * Operations vector for selecting different types of tests. - */ - -struct rcu_torture_ops { - void (*init)(void); - int (*readlock)(void); - void (*read_delay)(struct rcu_random_state *rrsp); - void (*readunlock)(int idx); - int (*completed)(void); - void (*deferred_free)(struct rcu_torture *p); - void (*sync)(void); - void (*exp_sync)(void); - void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); - void (*cb_barrier)(void); - void (*fqs)(void); - void (*stats)(char *page); - int irq_capable; - int can_boost; - const char *name; -}; - -static struct rcu_torture_ops *cur_ops; - -/* - * Definitions for rcu torture testing. - */ - -static int rcu_torture_read_lock(void) __acquires(RCU) -{ - rcu_read_lock(); - return 0; -} - -static void rcu_read_delay(struct rcu_random_state *rrsp) -{ - const unsigned long shortdelay_us = 200; - const unsigned long longdelay_ms = 50; - - /* We want a short delay sometimes to make a reader delay the grace - * period, and we want a long delay occasionally to trigger - * force_quiescent_state. */ - - if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) - mdelay(longdelay_ms); - if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) - udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT - if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) - preempt_schedule(); /* No QS if preempt_disable() in effect */ -#endif -} - -static void rcu_torture_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); -} - -static int rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - -static void -rcu_torture_cb(struct rcu_head *p) -{ - int i; - struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - - if (fullstop != FULLSTOP_DONTSTOP) { - /* Test is ending, just drop callbacks on the floor. */ - /* The next initialization will pick up the pieces. 
*/ - return; - } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - rcu_torture_free(rp); - } else { - cur_ops->deferred_free(rp); - } -} - -static int rcu_no_completed(void) -{ - return 0; -} - -static void rcu_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu(&p->rtort_rcu, rcu_torture_cb); -} - -static void rcu_sync_torture_init(void) -{ - INIT_LIST_HEAD(&rcu_torture_removed); -} - -static struct rcu_torture_ops rcu_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferred_free = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .exp_sync = synchronize_rcu_expedited, - .call = call_rcu, - .cb_barrier = rcu_barrier, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu" -}; - -/* - * Definitions for rcu_bh torture testing. - */ - -static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) -{ - rcu_read_lock_bh(); - return 0; -} - -static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) -{ - rcu_read_unlock_bh(); -} - -static int rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - -static void rcu_bh_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops rcu_bh_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_bh_torture_deferred_free, - .sync = synchronize_rcu_bh, - .exp_sync = synchronize_rcu_bh_expedited, - .call = call_rcu_bh, - .cb_barrier = rcu_barrier_bh, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh" -}; - -/* - * Definitions for srcu torture testing. - */ - -DEFINE_STATIC_SRCU(srcu_ctl); - -static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock(&srcu_ctl); -} - -static void srcu_read_delay(struct rcu_random_state *rrsp) -{ - long delay; - const long uspertick = 1000000 / HZ; - const long longdelay = 10; - - /* We want there to be long-running readers, but not all the time. 
*/ - - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) - schedule_timeout_interruptible(longdelay); - else - rcu_read_delay(rrsp); -} - -static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock(&srcu_ctl, idx); -} - -static int srcu_torture_completed(void) -{ - return srcu_batches_completed(&srcu_ctl); -} - -static void srcu_torture_deferred_free(struct rcu_torture *rp) -{ - call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); -} - -static void srcu_torture_synchronize(void) -{ - synchronize_srcu(&srcu_ctl); -} - -static void srcu_torture_call(struct rcu_head *head, - void (*func)(struct rcu_head *head)) -{ - call_srcu(&srcu_ctl, head, func); -} - -static void srcu_torture_barrier(void) -{ - srcu_barrier(&srcu_ctl); -} - -static void srcu_torture_stats(char *page) -{ - int cpu; - int idx = srcu_ctl.completed & 0x1; - - page += sprintf(page, "%s%s per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - page += sprintf(page, " %d(%lu,%lu)", cpu, - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); - } - sprintf(page, "\n"); -} - -static void srcu_torture_synchronize_expedited(void) -{ - synchronize_srcu_expedited(&srcu_ctl); -} - -static struct rcu_torture_ops srcu_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = srcu_torture_deferred_free, - .sync = srcu_torture_synchronize, - .exp_sync = srcu_torture_synchronize_expedited, - .call = srcu_torture_call, - .cb_barrier = srcu_torture_barrier, - .stats = srcu_torture_stats, - .name = "srcu" -}; - -/* - * Definitions for sched torture testing. - */ - -static int sched_torture_read_lock(void) -{ - preempt_disable(); - return 0; -} - -static void sched_torture_read_unlock(int idx) -{ - preempt_enable(); -} - -static void rcu_sched_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops sched_ops = { - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sched_torture_deferred_free, - .sync = synchronize_sched, - .exp_sync = synchronize_sched_expedited, - .call = call_rcu_sched, - .cb_barrier = rcu_barrier_sched, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "sched" -}; - -/* - * RCU torture priority-boost testing. Runs one real-time thread per - * CPU for moderate bursts, repeatedly registering RCU callbacks and - * spinning waiting for them to be invoked. If a given callback takes - * too long to be invoked, we assume that priority inversion has occurred. 
- */ - -struct rcu_boost_inflight { - struct rcu_head rcu; - int inflight; -}; - -static void rcu_torture_boost_cb(struct rcu_head *head) -{ - struct rcu_boost_inflight *rbip = - container_of(head, struct rcu_boost_inflight, rcu); - - smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ - rbip->inflight = 0; -} - -static int rcu_torture_boost(void *arg) -{ - unsigned long call_rcu_time; - unsigned long endtime; - unsigned long oldstarttime; - struct rcu_boost_inflight rbi = { .inflight = 0 }; - struct sched_param sp; - - VERBOSE_PRINTK_STRING("rcu_torture_boost started"); - - /* Set real-time priority. */ - sp.sched_priority = 1; - if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); - n_rcu_torture_boost_rterror++; - } - - init_rcu_head_on_stack(&rbi.rcu); - /* Each pass through the following loop does one boost-test cycle. */ - do { - /* Wait for the next test interval. */ - oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { - schedule_timeout_interruptible(oldstarttime - jiffies); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) - goto checkwait; - } - - /* Do one boost-test interval. */ - endtime = oldstarttime + test_boost_duration * HZ; - call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { - /* If we don't have a callback in flight, post one. */ - if (!rbi.inflight) { - smp_mb(); /* RCU core before ->inflight = 1. */ - rbi.inflight = 1; - call_rcu(&rbi.rcu, rcu_torture_boost_cb); - if (jiffies - call_rcu_time > - test_boost_duration * HZ - HZ / 2) { - VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); - n_rcu_torture_boost_failure++; - } - call_rcu_time = jiffies; - } - cond_resched(); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) - goto checkwait; - } - - /* - * Set the start time of the next test interval. - * Yes, this is vulnerable to long delays, but such - * delays simply cause a false negative for the next - * interval. Besides, we are running at RT priority, - * so delays should be relatively rare. - */ - while (oldstarttime == boost_starttime && - !kthread_should_stop()) { - if (mutex_trylock(&boost_mutex)) { - boost_starttime = jiffies + - test_boost_interval * HZ; - n_rcu_torture_boosts++; - mutex_unlock(&boost_mutex); - break; - } - schedule_timeout_uninterruptible(1); - } - - /* Go do the stutter. */ -checkwait: rcu_stutter_wait("rcu_torture_boost"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - - /* Clean up and exit. */ - VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); - rcutorture_shutdown_absorb("rcu_torture_boost"); - while (!kthread_should_stop() || rbi.inflight) - schedule_timeout_uninterruptible(1); - smp_mb(); /* order accesses to ->inflight before stack-frame death. */ - destroy_rcu_head_on_stack(&rbi.rcu); - return 0; -} - -/* - * RCU torture force-quiescent-state kthread. Repeatedly induces - * bursts of calls to force_quiescent_state(), increasing the probability - * of occurrence of some important types of race conditions. 
- */ -static int -rcu_torture_fqs(void *arg) -{ - unsigned long fqs_resume_time; - int fqs_burst_remaining; - - VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); - do { - fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && - !kthread_should_stop()) { - schedule_timeout_interruptible(1); - } - fqs_burst_remaining = fqs_duration; - while (fqs_burst_remaining > 0 && - !kthread_should_stop()) { - cur_ops->fqs(); - udelay(fqs_holdoff); - fqs_burst_remaining -= fqs_holdoff; - } - rcu_stutter_wait("rcu_torture_fqs"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fqs"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture writer kthread. Repeatedly substitutes a new structure - * for that pointed to by rcu_torture_current, freeing the old structure - * after a series of grace periods (the "pipeline"). - */ -static int -rcu_torture_writer(void *arg) -{ - bool exp; - int i; - struct rcu_torture *rp; - struct rcu_torture *rp1; - struct rcu_torture *old_rp; - static DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1); - rp = rcu_torture_alloc(); - if (rp == NULL) - continue; - rp->rtort_pipe_count = 0; - udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_dereference_check(rcu_torture_current, - current == writer_task); - rp->rtort_mbtest = 1; - rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ - if (old_rp) { - i = old_rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; - if (gp_normal == gp_exp) - exp = !!(rcu_random(&rand) & 0x80); - else - exp = gp_exp; - if (!exp) { - cur_ops->deferred_free(old_rp); - } else { - cur_ops->exp_sync(); - list_add(&old_rp->rtort_free, - &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, - &rcu_torture_removed, - rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= - RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } - } - } - rcutorture_record_progress(++rcu_torture_current_version); - rcu_stutter_wait("rcu_torture_writer"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - rcutorture_shutdown_absorb("rcu_torture_writer"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture fake writer kthread. Repeatedly calls sync, with a random - * delay between calls. 
- */ -static int -rcu_torture_fakewriter(void *arg) -{ - DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); - udelay(rcu_random(&rand) & 0x3ff); - if (cur_ops->cb_barrier != NULL && - rcu_random(&rand) % (nfakewriters * 8) == 0) { - cur_ops->cb_barrier(); - } else if (gp_normal == gp_exp) { - if (rcu_random(&rand) & 0x80) - cur_ops->sync(); - else - cur_ops->exp_sync(); - } else if (gp_normal) { - cur_ops->sync(); - } else { - cur_ops->exp_sync(); - } - rcu_stutter_wait("rcu_torture_fakewriter"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fakewriter"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - ftrace_dump(DUMP_ALL); -} - -/* - * RCU torture reader from timer handler. Dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. - */ -static void rcu_torture_timer(unsigned long unused) -{ - int idx; - int completed; - int completed_end; - static DEFINE_RCU_RANDOM(rand); - static DEFINE_SPINLOCK(rand_lock); - struct rcu_torture *p; - int pipe_count; - unsigned long long ts; - - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); - if (p == NULL) { - /* Leave because rcu_torture_writer is not yet underway */ - cur_ops->readunlock(idx); - return; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - spin_lock(&rand_lock); - cur_ops->read_delay(&rand); - n_rcu_torture_timers++; - spin_unlock(&rand_lock); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed_end = cur_ops->completed(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); - rcutorture_trace_dump(); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); -} - -/* - * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. 
- */ -static int -rcu_torture_reader(void *arg) -{ - int completed; - int completed_end; - int idx; - DEFINE_RCU_RANDOM(rand); - struct rcu_torture *p; - int pipe_count; - struct timer_list t; - unsigned long long ts; - - VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); - set_user_nice(current, 19); - if (irqreader && cur_ops->irq_capable) - setup_timer_on_stack(&t, rcu_torture_timer, 0); - - do { - if (irqreader && cur_ops->irq_capable) { - if (!timer_pending(&t)) - mod_timer(&t, jiffies + 1); - } - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); - if (p == NULL) { - /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); - continue; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(&rand); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed_end = cur_ops->completed(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); - rcutorture_trace_dump(); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); - schedule(); - rcu_stutter_wait("rcu_torture_reader"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); - rcutorture_shutdown_absorb("rcu_torture_reader"); - if (irqreader && cur_ops->irq_capable) - del_timer_sync(&t); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * Create an RCU-torture statistics message in the specified buffer. 
- */ -static void -rcu_torture_printk(char *page) -{ - int cpu; - int i; - long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; - } - } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { - if (pipesummary[i] != 0) - break; - } - page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current_version, - list_empty(&rcu_torture_freelist), - atomic_read(&n_rcu_torture_alloc), - atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free)); - page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", - atomic_read(&n_rcu_torture_mberror), - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); - page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", - n_rcu_torture_boost_failure, - n_rcu_torture_boosts, - n_rcu_torture_timers); - page += sprintf(page, - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); - page += sprintf(page, "barrier: %ld/%ld:%ld", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - if (atomic_read(&n_rcu_torture_mberror) != 0 || - n_rcu_torture_barrier_error != 0 || - n_rcu_torture_boost_ktrerror != 0 || - n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_failure != 0 || - i > 1) { - page += sprintf(page, "!!! "); - atomic_inc(&n_rcu_torture_error); - WARN_ON_ONCE(1); - } - page += sprintf(page, "Reader Pipe: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", pipesummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Reader Batch: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", batchsummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Free-Block Circulation: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - page += sprintf(page, " %d", - atomic_read(&rcu_torture_wcount[i])); - } - page += sprintf(page, "\n"); - if (cur_ops->stats) - cur_ops->stats(page); -} - -/* - * Print torture statistics. Caller must ensure that there is only - * one call to this function at a given time!!! This is normally - * accomplished by relying on the module system to only have one copy - * of the module loaded, and then by giving the rcu_torture_stats - * kthread full control (or the init/cleanup functions when rcu_torture_stats - * thread is not running). - */ -static void -rcu_torture_stats_print(void) -{ - int size = nr_cpu_ids * 200 + 8192; - char *buf; - - buf = kmalloc(size, GFP_KERNEL); - if (!buf) { - pr_err("rcu-torture: Out of memory, need: %d", size); - return; - } - rcu_torture_printk(buf); - pr_alert("%s", buf); - kfree(buf); -} - -/* - * Periodically prints torture statistics, if periodic statistics printing - * was specified via the stat_interval module parameter. - * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. 
- */ -static int -rcu_torture_stats(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); - do { - schedule_timeout_interruptible(stat_interval * HZ); - rcu_torture_stats_print(); - rcutorture_shutdown_absorb("rcu_torture_stats"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); - return 0; -} - -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. - */ -static void rcu_torture_shuffle_tasks(void) -{ - int i; - - cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); - - set_cpus_allowed_ptr(current, shuffle_tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - shuffle_tmp_mask); - } - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - shuffle_tmp_mask); - } - if (writer_task) - set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) - set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); - if (stutter_task) - set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); - if (fqs_task) - set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); - if (shutdown_task) - set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); -#ifdef CONFIG_HOTPLUG_CPU - if (onoff_task) - set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - if (stall_task) - set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); - if (barrier_cbs_tasks) - for (i = 0; i < n_barrier_cbs; i++) - if (barrier_cbs_tasks[i]) - set_cpus_allowed_ptr(barrier_cbs_tasks[i], - shuffle_tmp_mask); - if (barrier_task) - set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. - */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - rcutorture_shutdown_absorb("rcu_torture_shuffle"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - -/* Cause the rcutorture test to "stutter", starting and stopping all - * threads periodically. 
- */ -static int -rcu_torture_stutter(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); - do { - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 1; - if (!kthread_should_stop()) - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 0; - rcutorture_shutdown_absorb("rcu_torture_stutter"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); - return 0; -} - -static inline void -rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) -{ - pr_alert("%s" TORTURE_FLAG - "--- %s: nreaders=%d nfakewriters=%d " - "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d " - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " - "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " - "stall_cpu=%d stall_cpu_holdoff=%d " - "n_barrier_cbs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, - test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, - stall_cpu, stall_cpu_holdoff, - n_barrier_cbs, - onoff_interval, onoff_holdoff); -} - -static struct notifier_block rcutorture_shutdown_nb = { - .notifier_call = rcutorture_shutdown_notify, -}; - -static void rcutorture_booster_cleanup(int cpu) -{ - struct task_struct *t; - - if (boost_tasks[cpu] == NULL) - return; - mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); - t = boost_tasks[cpu]; - boost_tasks[cpu] = NULL; - mutex_unlock(&boost_mutex); - - /* This must be outside of the mutex, otherwise deadlock! */ - kthread_stop(t); - boost_tasks[cpu] = NULL; -} - -static int rcutorture_booster_init(int cpu) -{ - int retval; - - if (boost_tasks[cpu] != NULL) - return 0; /* Already created, nothing more to do. */ - - /* Don't allow time recalculation while creating a new task. */ - mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); - if (IS_ERR(boost_tasks[cpu])) { - retval = PTR_ERR(boost_tasks[cpu]); - VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); - n_rcu_torture_boost_ktrerror++; - boost_tasks[cpu] = NULL; - mutex_unlock(&boost_mutex); - return retval; - } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); - mutex_unlock(&boost_mutex); - return 0; -} - -/* - * Cause the rcutorture test to shutdown the system after the test has - * run for the time specified by the shutdown_secs module parameter. - */ -static int -rcu_torture_shutdown(void *arg) -{ - long delta; - unsigned long jiffies_snap; - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && - !kthread_should_stop()) { - delta = shutdown_time - jiffies_snap; - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); - } - if (kthread_should_stop()) { - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); - return 0; - } - - /* OK, shut down the system. 
*/ - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); - shutdown_task = NULL; /* Avoid self-kill deadlock. */ - rcu_torture_cleanup(); /* Get the success/failure message. */ - kernel_power_off(); /* Shut down the system. */ - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Execute random CPU-hotplug operations at the interval specified - * by the onoff_interval. - */ -static int -rcu_torture_onoff(void *arg) -{ - int cpu; - unsigned long delta; - int maxcpu = -1; - DEFINE_RCU_RANDOM(rand); - int ret; - unsigned long starttime; - - VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); - for_each_online_cpu(cpu) - maxcpu = cpu; - WARN_ON(maxcpu < 0); - if (onoff_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); - schedule_timeout_interruptible(onoff_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); - } - while (!kthread_should_stop()) { - cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } - schedule_timeout_interruptible(onoff_interval * HZ); - } - VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); - return 0; -} - -static int -rcu_torture_onoff_init(void) -{ - int ret; - - if (onoff_interval <= 0) - return 0; - onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return ret; - } - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ - if (onoff_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); - kthread_stop(onoff_task); - onoff_task = NULL; -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static int -rcu_torture_onoff_init(void) -{ - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - -/* - * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then - * induces a CPU stall for the time specified by stall_cpu. 
- */ -static int rcu_torture_stall(void *args) -{ - unsigned long stop_at; - - VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); - if (stall_cpu_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); - schedule_timeout_interruptible(stall_cpu_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); - } - if (!kthread_should_stop()) { - stop_at = get_seconds() + stall_cpu; - /* RCU CPU stall is expected behavior in following code. */ - pr_alert("rcu_torture_stall start.\n"); - rcu_read_lock(); - preempt_disable(); - while (ULONG_CMP_LT(get_seconds(), stop_at)) - continue; /* Induce RCU CPU stall warning. */ - preempt_enable(); - rcu_read_unlock(); - pr_alert("rcu_torture_stall end.\n"); - } - rcutorture_shutdown_absorb("rcu_torture_stall"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(10 * HZ); - return 0; -} - -/* Spawn CPU-stall kthread, if stall_cpu specified. */ -static int __init rcu_torture_stall_init(void) -{ - int ret; - - if (stall_cpu <= 0) - return 0; - stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); - if (IS_ERR(stall_task)) { - ret = PTR_ERR(stall_task); - stall_task = NULL; - return ret; - } - return 0; -} - -/* Clean up after the CPU-stall kthread, if one was spawned. */ -static void rcu_torture_stall_cleanup(void) -{ - if (stall_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); - kthread_stop(stall_task); - stall_task = NULL; -} - -/* Callback function for RCU barrier testing. */ -void rcu_torture_barrier_cbf(struct rcu_head *rcu) -{ - atomic_inc(&barrier_cbs_invoked); -} - -/* kthread function to register callbacks used to test RCU barriers. */ -static int rcu_torture_barrier_cbs(void *arg) -{ - long myid = (long)arg; - bool lastphase = 0; - bool newphase; - struct rcu_head rcu; - - init_rcu_head_on_stack(&rcu); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); - set_user_nice(current, 19); - do { - wait_event(barrier_cbs_wq[myid], - (newphase = - ACCESS_ONCE(barrier_phase)) != lastphase || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - lastphase = newphase; - smp_mb(); /* ensure barrier_phase load before ->call(). */ - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) - break; - cur_ops->call(&rcu, rcu_torture_barrier_cbf); - if (atomic_dec_and_test(&barrier_cbs_count)) - wake_up(&barrier_wq); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); - cur_ops->cb_barrier(); - destroy_rcu_head_on_stack(&rcu); - return 0; -} - -/* kthread function to drive and coordinate RCU barrier testing. */ -static int rcu_torture_barrier(void *arg) -{ - int i; - - VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); - do { - atomic_set(&barrier_cbs_invoked, 0); - atomic_set(&barrier_cbs_count, n_barrier_cbs); - smp_mb(); /* Ensure barrier_phase after prior assignments. */ - barrier_phase = !barrier_phase; - for (i = 0; i < n_barrier_cbs; i++) - wake_up(&barrier_cbs_wq[i]); - wait_event(barrier_wq, - atomic_read(&barrier_cbs_count) == 0 || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) - break; - n_barrier_attempts++; - cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). 
*/ - if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { - n_rcu_torture_barrier_error++; - WARN_ON_ONCE(1); - } - n_barrier_successes++; - schedule_timeout_interruptible(HZ / 10); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); - return 0; -} - -/* Initialize RCU barrier testing. */ -static int rcu_torture_barrier_init(void) -{ - int i; - int ret; - - if (n_barrier_cbs == 0) - return 0; - if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { - pr_alert("%s" TORTURE_FLAG - " Call or barrier ops missing for %s,\n", - torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " RCU barrier testing omitted from run.\n", - torture_type); - return 0; - } - atomic_set(&barrier_cbs_count, 0); - atomic_set(&barrier_cbs_invoked, 0); - barrier_cbs_tasks = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), - GFP_KERNEL); - barrier_cbs_wq = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), - GFP_KERNEL); - if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) - return -ENOMEM; - for (i = 0; i < n_barrier_cbs; i++) { - init_waitqueue_head(&barrier_cbs_wq[i]); - barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, - (void *)(long)i, - "rcu_torture_barrier_cbs"); - if (IS_ERR(barrier_cbs_tasks[i])) { - ret = PTR_ERR(barrier_cbs_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); - barrier_cbs_tasks[i] = NULL; - return ret; - } - } - barrier_task = kthread_run(rcu_torture_barrier, NULL, - "rcu_torture_barrier"); - if (IS_ERR(barrier_task)) { - ret = PTR_ERR(barrier_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); - barrier_task = NULL; - } - return 0; -} - -/* Clean up after RCU barrier testing. */ -static void rcu_torture_barrier_cleanup(void) -{ - int i; - - if (barrier_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); - kthread_stop(barrier_task); - barrier_task = NULL; - } - if (barrier_cbs_tasks != NULL) { - for (i = 0; i < n_barrier_cbs; i++) { - if (barrier_cbs_tasks[i] != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); - kthread_stop(barrier_cbs_tasks[i]); - barrier_cbs_tasks[i] = NULL; - } - } - kfree(barrier_cbs_tasks); - barrier_cbs_tasks = NULL; - } - if (barrier_cbs_wq != NULL) { - kfree(barrier_cbs_wq); - barrier_cbs_wq = NULL; - } -} - -static int rcutorture_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - (void)rcutorture_booster_init(cpu); - break; - case CPU_DOWN_PREPARE: - rcutorture_booster_cleanup(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block rcutorture_cpu_nb = { - .notifier_call = rcutorture_cpu_notify, -}; - -static void -rcu_torture_cleanup(void) -{ - int i; - - mutex_lock(&fullstop_mutex); - rcutorture_record_test_transition(); - if (fullstop == FULLSTOP_SHUTDOWN) { - pr_warn(/* but going down anyway, so... 
*/ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - schedule_timeout_uninterruptible(10); - if (cur_ops->cb_barrier != NULL) - cur_ops->cb_barrier(); - return; - } - fullstop = FULLSTOP_RMMOD; - mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&rcutorture_shutdown_nb); - rcu_torture_barrier_cleanup(); - rcu_torture_stall_cleanup(); - if (stutter_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); - kthread_stop(stutter_task); - } - stutter_task = NULL; - if (shuffler_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - } - shuffler_task = NULL; - - if (writer_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } - kfree(reader_tasks); - reader_tasks = NULL; - } - rcu_torture_current = NULL; - - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_fakewriter task"); - kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; - } - kfree(fakewriter_tasks); - fakewriter_tasks = NULL; - } - - if (stats_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - if (fqs_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); - kthread_stop(fqs_task); - } - fqs_task = NULL; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - unregister_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) - rcutorture_booster_cleanup(i); - } - if (shutdown_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); - kthread_stop(shutdown_task); - } - shutdown_task = NULL; - rcu_torture_onoff_cleanup(); - - /* Wait for all RCU callbacks to fire. */ - - if (cur_ops->cb_barrier != NULL) - cur_ops->cb_barrier(); - - rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - - if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) - rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); - else if (n_online_successes != n_online_attempts || - n_offline_successes != n_offline_attempts) - rcu_torture_print_module_parms(cur_ops, - "End of test: RCU_HOTPLUG"); - else - rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); -} - -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD -static void rcu_torture_leak_cb(struct rcu_head *rhp) -{ -} - -static void rcu_torture_err_cb(struct rcu_head *rhp) -{ - /* - * This -might- happen due to race conditions, but is unlikely. - * The scenario that leads to this happening is that the - * first of the pair of duplicate callbacks is queued, - * someone else starts a grace period that includes that - * callback, then the second of the pair must wait for the - * next grace period. Unlikely, but can happen. If it - * does happen, the debug-objects subsystem won't have splatted. - */ - pr_alert("rcutorture: duplicated callback was invoked.\n"); -} -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - -/* - * Verify that double-free causes debug-objects to complain, but only - * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test - * cannot be carried out. 
- */ -static void rcu_test_debug_objects(void) -{ -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - struct rcu_head rh1; - struct rcu_head rh2; - - init_rcu_head_on_stack(&rh1); - init_rcu_head_on_stack(&rh2); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); - - /* Try to queue the rh2 pair of callbacks for the same grace period. */ - preempt_disable(); /* Prevent preemption from interrupting test. */ - rcu_read_lock(); /* Make it impossible to finish a grace period. */ - call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ - local_irq_disable(); /* Make it harder to start a new grace period. */ - call_rcu(&rh2, rcu_torture_leak_cb); - call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ - local_irq_enable(); - rcu_read_unlock(); - preempt_enable(); - - /* Wait for them all to get done so we can safely return. */ - rcu_barrier(); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); - destroy_rcu_head_on_stack(&rh1); - destroy_rcu_head_on_stack(&rh2); -#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); -#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -} - -static int __init -rcu_torture_init(void) -{ - int i; - int cpu; - int firsterr = 0; - int retval; - static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, - }; - - mutex_lock(&fullstop_mutex); - - /* Process args and tell the world that the torturer is on the job. */ - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { - cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(torture_ops)) { - pr_alert("rcu-torture: invalid torture type: \"%s\"\n", - torture_type); - pr_alert("rcu-torture types:"); - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - pr_alert(" %s", torture_ops[i]->name); - pr_alert("\n"); - mutex_unlock(&fullstop_mutex); - return -EINVAL; - } - if (cur_ops->fqs == NULL && fqs_duration != 0) { - pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); - fqs_duration = 0; - } - if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ - - if (nreaders >= 0) - nrealreaders = nreaders; - else - nrealreaders = 2 * num_online_cpus(); - rcu_torture_print_module_parms(cur_ops, "Start of test"); - fullstop = FULLSTOP_DONTSTOP; - - /* Set up the freelist. */ - - INIT_LIST_HEAD(&rcu_torture_freelist); - for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { - rcu_tortures[i].rtort_mbtest = 0; - list_add_tail(&rcu_tortures[i].rtort_free, - &rcu_torture_freelist); - } - - /* Initialize the statistics so that each run gets its own numbers. */ - - rcu_torture_current = NULL; - rcu_torture_current_version = 0; - atomic_set(&n_rcu_torture_alloc, 0); - atomic_set(&n_rcu_torture_alloc_fail, 0); - atomic_set(&n_rcu_torture_free, 0); - atomic_set(&n_rcu_torture_mberror, 0); - atomic_set(&n_rcu_torture_error, 0); - n_rcu_torture_barrier_error = 0; - n_rcu_torture_boost_ktrerror = 0; - n_rcu_torture_boost_rterror = 0; - n_rcu_torture_boost_failure = 0; - n_rcu_torture_boosts = 0; - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - atomic_set(&rcu_torture_wcount[i], 0); - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - per_cpu(rcu_torture_count, cpu)[i] = 0; - per_cpu(rcu_torture_batch, cpu)[i] = 0; - } - } - - /* Start up the kthreads. 
*/ - - VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_create(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); - writer_task = NULL; - goto unwind; - } - wake_up_process(writer_task); - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nfakewriters; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; - goto unwind; - } - } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), - GFP_KERNEL); - if (reader_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealreaders; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; - goto unwind; - } - } - if (stat_interval > 0) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); - if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); - stats_task = NULL; - goto unwind; - } - } - if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - - if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - firsterr = -ENOMEM; - VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); - goto unwind; - } - - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - free_cpumask_var(shuffle_tmp_mask); - firsterr = PTR_ERR(shuffler_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - goto unwind; - } - } - if (stutter < 0) - stutter = 0; - if (stutter) { - /* Create the stutter thread */ - stutter_task = kthread_run(rcu_torture_stutter, NULL, - "rcu_torture_stutter"); - if (IS_ERR(stutter_task)) { - firsterr = PTR_ERR(stutter_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; - goto unwind; - } - } - if (fqs_duration < 0) - fqs_duration = 0; - if (fqs_duration) { - /* Create the stutter thread */ - fqs_task = kthread_run(rcu_torture_fqs, NULL, - "rcu_torture_fqs"); - if (IS_ERR(fqs_task)) { - firsterr = PTR_ERR(fqs_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); - fqs_task = NULL; - goto unwind; - } - } - if (test_boost_interval < 1) - test_boost_interval = 1; - if (test_boost_duration < 2) - test_boost_duration = 2; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - - boost_starttime = jiffies + test_boost_interval * HZ; - register_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) { - if (cpu_is_offline(i)) - continue; /* Heuristic: CPU can go offline. 
*/ - retval = rcutorture_booster_init(i); - if (retval < 0) { - firsterr = retval; - goto unwind; - } - } - } - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); - if (IS_ERR(shutdown_task)) { - firsterr = PTR_ERR(shutdown_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - goto unwind; - } - wake_up_process(shutdown_task); - } - i = rcu_torture_onoff_init(); - if (i != 0) { - firsterr = i; - goto unwind; - } - register_reboot_notifier(&rcutorture_shutdown_nb); - i = rcu_torture_stall_init(); - if (i != 0) { - firsterr = i; - goto unwind; - } - retval = rcu_torture_barrier_init(); - if (retval != 0) { - firsterr = retval; - goto unwind; - } - if (object_debug) - rcu_test_debug_objects(); - rcutorture_record_test_transition(); - mutex_unlock(&fullstop_mutex); - return 0; - -unwind: - mutex_unlock(&fullstop_mutex); - rcu_torture_cleanup(); - return firsterr; -} - -module_init(rcu_torture_init); -module_exit(rcu_torture_cleanup); diff --git a/kernel/torture.c b/kernel/torture.c new file mode 100644 index 000000000000..c82c70f7828e --- /dev/null +++ b/kernel/torture.c @@ -0,0 +1,71 @@ +/* + * Common functions for in-kernel torture tests. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2014 + * + * Author: Paul E. McKenney + * Based on kernel/rcu/torture.c. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +#define TORTURE_RANDOM_MULT 39916801 /* prime */ +#define TORTURE_RANDOM_ADD 479001701 /* prime */ +#define TORTURE_RANDOM_REFRESH 10000 + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from cpu_clock(). + */ +unsigned long +torture_random(struct torture_random_state *trsp) +{ + if (--trsp->trs_count < 0) { + trsp->trs_state += (unsigned long)local_clock(); + trsp->trs_count = TORTURE_RANDOM_REFRESH; + } + trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT + + TORTURE_RANDOM_ADD; + return swahw32(trsp->trs_state); +} +EXPORT_SYMBOL_GPL(torture_random); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a48abeac753f..2bfb4e5cdf8c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1141,9 +1141,14 @@ config SPARSE_RCU_POINTER Say N if you are unsure. 
+config TORTURE_TEST + tristate + default n + config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL + select TORTURE_TEST default n help This option provides a kernel module that runs torture tests -- cgit v1.2.3 From 9e2502254132261e0ea8010692fd447b1cedf627 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Jan 2014 16:27:00 -0800 Subject: rcutorture: Abstract torture_param() Create a torture_param() macro and apply it to rcutorture in order to save a few lines of code. This same macro may be applied to other torture frameworks. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 9 ++++- kernel/rcu/rcutorture.c | 103 ++++++++++++++++-------------------------------- 2 files changed, 41 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 979e3e6b378a..5aae880b6b4e 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -35,13 +35,18 @@ #include #include +/* Definitions for a non-string torture-test module parameter. */ +#define torture_param(type, name, init, msg) \ + static type name = init; \ + module_param(name, type, 0444); \ + MODULE_PARM_DESC(name, msg); + +/* Low-rider random number generator. */ struct torture_random_state { unsigned long trs_state; long trs_count; }; - #define DEFINE_TORTURE_RANDOM(name) struct torture_random_state name = { 0, 0 } - unsigned long torture_random(struct torture_random_state *trsp); #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 94b1cd8b214c..930791e0698d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -59,78 +59,43 @@ MODULE_ALIAS("rcutorture"); #endif #define MODULE_PARAM_PREFIX "rcutorture." -static int fqs_duration; -module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); -static int fqs_holdoff; -module_param(fqs_holdoff, int, 0444); -MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); -static int fqs_stutter = 3; -module_param(fqs_stutter, int, 0444); -MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -static bool gp_exp; -module_param(gp_exp, bool, 0444); -MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); -static bool gp_normal; -module_param(gp_normal, bool, 0444); -MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); -static int irqreader = 1; -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); -static int n_barrier_cbs; -module_param(n_barrier_cbs, int, 0444); -MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); -static int nfakewriters = 4; -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -static int nreaders = -1; -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -static int object_debug; -module_param(object_debug, int, 0444); -MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); -static int onoff_holdoff; -module_param(onoff_holdoff, int, 0444); -MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); -static int onoff_interval; -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); -static int shuffle_interval = 3; -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, 
"Number of seconds between shuffles"); -static int shutdown_secs; -module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); -static int stall_cpu; -module_param(stall_cpu, int, 0444); -MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); -static int stall_cpu_holdoff = 10; -module_param(stall_cpu_holdoff, int, 0444); -MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); -static int stat_interval = 60; -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -static int stutter = 5; -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -static int test_boost = 1; -module_param(test_boost, int, 0444); -MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -static int test_boost_duration = 4; -module_param(test_boost_duration, int, 0444); -MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); -static int test_boost_interval = 7; -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); -static bool test_no_idle_hz = true; -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +torture_param(int, fqs_duration, 0, + "Duration of fqs bursts (us), 0 to disable"); +torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); +torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); +torture_param(bool, gp_normal, false, + "Use normal (non-expedited) GP wait primitives"); +torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); +torture_param(int, n_barrier_cbs, 0, + "# of callbacks/kthreads for barrier testing"); +torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, object_debug, 0, + "Enable debug-object double call_rcu() testing"); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, + "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); +torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); +torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); +torture_param(int, stall_cpu_holdoff, 10, + "Time to wait before starting stall (s)."); +torture_param(int, stat_interval, 60, + "Number of seconds between stats printk()s"); +torture_param(int, stutter, 5, "Number of seconds to run/halt test"); +torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +torture_param(int, test_boost_duration, 4, + "Duration of each boost test, seconds."); +torture_param(int, test_boost_interval, 7, + "Interval between boost tests, seconds."); +torture_param(bool, test_no_idle_hz, true, + "Test support for tickless idle CPUs"); +torture_param(bool, verbose, false, "Enable verbose debugging printk()s"); + static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -static bool verbose; -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); #define TORTURE_FLAG 
"-torture:" #define PRINTK_STRING(s) \ -- cgit v1.2.3 From c2884de38e01134ae040d55aa5644049d1bb850f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jan 2014 07:30:50 -0800 Subject: rcutorture: Abstract TOROUT_STRING() and friends These diagnostic macros are not confined to torturing RCU, so this commit makes them available to other torture tests. Also removed the do-while from TOROUT_STRING() in response to checkpatch complaints. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 8 ++++++++ kernel/rcu/rcutorture.c | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 5aae880b6b4e..09be8887fb08 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -41,6 +41,14 @@ module_param(name, type, 0444); \ MODULE_PARM_DESC(name, msg); +#define TORTURE_FLAG "-torture:" +#define TOROUT_STRING(s) \ + pr_alert("%s" TORTURE_FLAG s "\n", torture_type) +#define VERBOSE_TOROUT_STRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) +#define VERBOSE_TOROUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) + /* Low-rider random number generator. */ struct torture_random_state { unsigned long trs_state; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 34a75b1170e8..86fd8c11257b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -97,14 +97,6 @@ static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -#define TORTURE_FLAG "-torture:" -#define TOROUT_STRING(s) \ - do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_TOROUT_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_TOROUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) - static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; -- cgit v1.2.3 From f67a33561e6e5463b548219df98130da95f2e4a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jan 2014 07:40:27 -0800 Subject: rcutorture: Abstract torture_shutdown_absorb() Because handling races between rmmod and normal shutdown is not specific to rcutorture, this commit renames rcutorture_shutdown_absorb() to torture_shutdown_absorb() and pulls it out into then kernel/torture.c module. This implies pulling the fullstop mechanism into kernel/torture.c as well. The exporting of fullstop and fullstop_mutex is ugly and must die. And it does in fact die in later commits that introduce higher-level APIs that encapsulate both of these variables. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett ` --- include/linux/torture.h | 15 +++++++++++++ kernel/rcu/rcutorture.c | 57 ++++++++++++++++--------------------------------- kernel/torture.c | 20 +++++++++++++++++ 3 files changed, 53 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 09be8887fb08..8474d776c864 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -41,6 +41,18 @@ module_param(name, type, 0444); \ MODULE_PARM_DESC(name, msg); +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ +#define FULLSTOP_DONTSTOP 0 /* Normal operation. 
*/ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ +extern int fullstop; +/* Protect fullstop transitions and spawning of kthreads. */ +extern struct mutex fullstop_mutex; + +/* Common module parameters. */ +extern char *torture_type; +extern bool verbose; + #define TORTURE_FLAG "-torture:" #define TOROUT_STRING(s) \ pr_alert("%s" TORTURE_FLAG s "\n", torture_type) @@ -57,4 +69,7 @@ struct torture_random_state { #define DEFINE_TORTURE_RANDOM(name) struct torture_random_state name = { 0, 0 } unsigned long torture_random(struct torture_random_state *trsp); +/* Shutdown task absorption, for when the tasks cannot safely be killed. */ +void torture_shutdown_absorb(const char *title); + #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 86fd8c11257b..a868758a6f9c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -91,11 +91,15 @@ torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); -torture_param(bool, verbose, false, "Enable verbose debugging printk()s"); -static char *torture_type = "rcu"; +char *torture_type = "rcu"; +EXPORT_SYMBOL_GPL(torture_type); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +bool verbose; +EXPORT_SYMBOL_GPL(verbose); +module_param(verbose, bool, 0444); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); static int nrealreaders; static struct task_struct *writer_task; @@ -200,17 +204,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ - -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -static int fullstop = FULLSTOP_RMMOD; -/* - * Protect fullstop transitions and spawning of kthreads. - */ -static DEFINE_MUTEX(fullstop_mutex); - /* Forward reference. */ static void rcu_torture_cleanup(void); @@ -231,20 +224,6 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, return NOTIFY_DONE; } -/* - * Absorb kthreads into a kernel function that won't return, so that - * they won't ever access module text or data again. - */ -static void rcutorture_shutdown_absorb(const char *title) -{ - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "rcutorture thread %s parking due to system shutdown\n", - title); - schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); - } -} - /* * Allocate an element from the rcu_tortures pool. */ @@ -286,7 +265,7 @@ rcu_stutter_wait(const char *title) schedule_timeout_interruptible(1); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); - rcutorture_shutdown_absorb(title); + torture_shutdown_absorb(title); } } @@ -681,7 +660,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); /* Clean up and exit. */ VERBOSE_TOROUT_STRING("rcu_torture_boost task stopping"); - rcutorture_shutdown_absorb("rcu_torture_boost"); + torture_shutdown_absorb("rcu_torture_boost"); while (!kthread_should_stop() || rbi.inflight) schedule_timeout_uninterruptible(1); smp_mb(); /* order accesses to ->inflight before stack-frame death. 
*/ @@ -717,7 +696,7 @@ rcu_torture_fqs(void *arg) rcu_stutter_wait("rcu_torture_fqs"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fqs"); + torture_shutdown_absorb("rcu_torture_fqs"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -789,7 +768,7 @@ rcu_torture_writer(void *arg) rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); - rcutorture_shutdown_absorb("rcu_torture_writer"); + torture_shutdown_absorb("rcu_torture_writer"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -827,7 +806,7 @@ rcu_torture_fakewriter(void *arg) } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fakewriter"); + torture_shutdown_absorb("rcu_torture_fakewriter"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -971,7 +950,7 @@ rcu_torture_reader(void *arg) rcu_stutter_wait("rcu_torture_reader"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); - rcutorture_shutdown_absorb("rcu_torture_reader"); + torture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); while (!kthread_should_stop()) @@ -1095,7 +1074,7 @@ rcu_torture_stats(void *arg) do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); - rcutorture_shutdown_absorb("rcu_torture_stats"); + torture_shutdown_absorb("rcu_torture_stats"); } while (!kthread_should_stop()); VERBOSE_TOROUT_STRING("rcu_torture_stats task stopping"); return 0; @@ -1179,7 +1158,7 @@ rcu_torture_shuffle(void *arg) do { schedule_timeout_interruptible(shuffle_interval * HZ); rcu_torture_shuffle_tasks(); - rcutorture_shutdown_absorb("rcu_torture_shuffle"); + torture_shutdown_absorb("rcu_torture_shuffle"); } while (!kthread_should_stop()); VERBOSE_TOROUT_STRING("rcu_torture_shuffle task stopping"); return 0; @@ -1198,7 +1177,7 @@ rcu_torture_stutter(void *arg) if (!kthread_should_stop()) schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 0; - rcutorture_shutdown_absorb("rcu_torture_stutter"); + torture_shutdown_absorb("rcu_torture_stutter"); } while (!kthread_should_stop()); VERBOSE_TOROUT_STRING("rcu_torture_stutter task stopping"); return 0; @@ -1470,7 +1449,7 @@ static int rcu_torture_stall(void *args) rcu_read_unlock(); pr_alert("rcu_torture_stall end.\n"); } - rcutorture_shutdown_absorb("rcu_torture_stall"); + torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); return 0; @@ -1534,7 +1513,7 @@ static int rcu_torture_barrier_cbs(void *arg) wake_up(&barrier_wq); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + torture_shutdown_absorb("rcu_torture_barrier_cbs"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); cur_ops->cb_barrier(); @@ -1571,7 +1550,7 @@ static int rcu_torture_barrier(void *arg) schedule_timeout_interruptible(HZ / 10); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_barrier task stopping"); 
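Taken together, the conversions in this hunk leave every torture kthread with the same overall shape, sketched below. example_torture_kthread() and do_one_torture_iteration() are placeholder names for illustration only; the torture_shutdown_absorb() call and the fullstop check are the pieces this patch standardizes.

/*
 * Rough shape of a torture kthread after the conversion: do the per-pass
 * work, honor stutter pauses, and park rather than exit if the system is
 * shutting down, so module text and data are never touched again.
 */
static int example_torture_kthread(void *arg)
{
	VERBOSE_TOROUT_STRING("example task started");
	do {
		do_one_torture_iteration();		/* hypothetical work item */
		rcu_stutter_wait("example task");	/* also absorbs shutdown */
	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
	VERBOSE_TOROUT_STRING("example task stopping");
	torture_shutdown_absorb("example task");	/* park on system shutdown */
	while (!kthread_should_stop())
		schedule_timeout_uninterruptible(1);
	return 0;
}
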
- rcutorture_shutdown_absorb("rcu_torture_barrier"); + torture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); return 0; diff --git a/kernel/torture.c b/kernel/torture.c index c82c70f7828e..f05042036ae8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -49,6 +49,11 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); +int fullstop = FULLSTOP_RMMOD; +EXPORT_SYMBOL_GPL(fullstop); +DEFINE_MUTEX(fullstop_mutex); +EXPORT_SYMBOL_GPL(fullstop_mutex); + #define TORTURE_RANDOM_MULT 39916801 /* prime */ #define TORTURE_RANDOM_ADD 479001701 /* prime */ #define TORTURE_RANDOM_REFRESH 10000 @@ -69,3 +74,18 @@ torture_random(struct torture_random_state *trsp) return swahw32(trsp->trs_state); } EXPORT_SYMBOL_GPL(torture_random); + +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +void torture_shutdown_absorb(const char *title) +{ + while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + pr_notice( + "torture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} +EXPORT_SYMBOL_GPL(torture_shutdown_absorb); -- cgit v1.2.3 From 3808dc9fab05913060626d7f0edd0f195cb9dcab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Jan 2014 15:29:21 -0800 Subject: rcutorture: Abstract torture_shuffle() The torture_shuffle() function forces each CPU in turn to go idle periodically in order to check for problems interacting with per-CPU variables and with dyntick-idle mode. Because this sort of debugging is not specific to RCU, this commit abstracts that functionality. This in turn requires abstracting some additional infrastructure. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 5 ++ kernel/rcu/rcutorture.c | 124 +++++---------------------------------- kernel/torture.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 109 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 8474d776c864..544a2c33c1ae 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -69,6 +69,11 @@ struct torture_random_state { #define DEFINE_TORTURE_RANDOM(name) struct torture_random_state name = { 0, 0 } unsigned long torture_random(struct torture_random_state *trsp); +/* Task shuffler, which causes CPUs to occasionally go idle. */ +void torture_shuffle_task_register(struct task_struct *tp); +int torture_shuffle_init(long shuffint); +void torture_shuffle_cleanup(void); + /* Shutdown task absorption, for when the tasks cannot safely be killed. 
*/ void torture_shutdown_absorb(const char *title); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a868758a6f9c..0380696f1844 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -106,7 +106,6 @@ static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; -static struct task_struct *shuffler_task; static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; @@ -161,7 +160,6 @@ static int max_online; static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; -static cpumask_var_t shuffle_tmp_mask; static int stutter_pause_test; @@ -1080,90 +1078,6 @@ rcu_torture_stats(void *arg) return 0; } -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. - */ -static void rcu_torture_shuffle_tasks(void) -{ - int i; - - cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); - - set_cpus_allowed_ptr(current, shuffle_tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - shuffle_tmp_mask); - } - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - shuffle_tmp_mask); - } - if (writer_task) - set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) - set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); - if (stutter_task) - set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); - if (fqs_task) - set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); - if (shutdown_task) - set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); -#ifdef CONFIG_HOTPLUG_CPU - if (onoff_task) - set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - if (stall_task) - set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); - if (barrier_cbs_tasks) - for (i = 0; i < n_barrier_cbs; i++) - if (barrier_cbs_tasks[i]) - set_cpus_allowed_ptr(barrier_cbs_tasks[i], - shuffle_tmp_mask); - if (barrier_task) - set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. - */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_TOROUT_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - torture_shutdown_absorb("rcu_torture_shuffle"); - } while (!kthread_should_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - /* Cause the rcutorture test to "stutter", starting and stopping all * threads periodically. 
*/ @@ -1397,6 +1311,7 @@ rcu_torture_onoff_init(void) onoff_task = NULL; return ret; } + torture_shuffle_task_register(onoff_task); return 0; } @@ -1468,6 +1383,7 @@ static int __init rcu_torture_stall_init(void) stall_task = NULL; return ret; } + torture_shuffle_task_register(stall_task); return 0; } @@ -1594,6 +1510,7 @@ static int rcu_torture_barrier_init(void) barrier_cbs_tasks[i] = NULL; return ret; } + torture_shuffle_task_register(barrier_cbs_tasks[i]); } barrier_task = kthread_run(rcu_torture_barrier, NULL, "rcu_torture_barrier"); @@ -1602,6 +1519,7 @@ static int rcu_torture_barrier_init(void) VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier"); barrier_task = NULL; } + torture_shuffle_task_register(barrier_task); return 0; } @@ -1674,6 +1592,8 @@ rcu_torture_cleanup(void) fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_shutdown_nb); + + torture_shuffle_cleanup(); /* Must be first task cleaned up. */ rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); if (stutter_task) { @@ -1681,12 +1601,6 @@ rcu_torture_cleanup(void) kthread_stop(stutter_task); } stutter_task = NULL; - if (shuffler_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - } - shuffler_task = NULL; if (writer_task) { VERBOSE_TOROUT_STRING("Stopping rcu_torture_writer task"); @@ -1904,6 +1818,7 @@ rcu_torture_init(void) writer_task = NULL; goto unwind; } + torture_shuffle_task_register(writer_task); wake_up_process(writer_task); fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); @@ -1922,6 +1837,7 @@ rcu_torture_init(void) fakewriter_tasks[i] = NULL; goto unwind; } + torture_shuffle_task_register(fakewriter_tasks[i]); } reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); @@ -1940,6 +1856,7 @@ rcu_torture_init(void) reader_tasks[i] = NULL; goto unwind; } + torture_shuffle_task_register(reader_tasks[i]); } if (stat_interval > 0) { VERBOSE_TOROUT_STRING("Creating rcu_torture_stats task"); @@ -1951,26 +1868,12 @@ rcu_torture_init(void) stats_task = NULL; goto unwind; } + torture_shuffle_task_register(stats_task); } if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - - if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - firsterr = -ENOMEM; - VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); + firsterr = torture_shuffle_init(shuffle_interval * HZ); + if (firsterr) goto unwind; - } - - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - free_cpumask_var(shuffle_tmp_mask); - firsterr = PTR_ERR(shuffler_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - goto unwind; - } } if (stutter < 0) stutter = 0; @@ -1984,6 +1887,7 @@ rcu_torture_init(void) stutter_task = NULL; goto unwind; } + torture_shuffle_task_register(stutter_task); } if (fqs_duration < 0) fqs_duration = 0; @@ -1997,6 +1901,7 @@ rcu_torture_init(void) fqs_task = NULL; goto unwind; } + torture_shuffle_task_register(fqs_task); } if (test_boost_interval < 1) test_boost_interval = 1; @@ -2027,6 +1932,7 @@ rcu_torture_init(void) shutdown_task = NULL; goto unwind; } + torture_shuffle_task_register(shutdown_task); wake_up_process(shutdown_task); } i = rcu_torture_onoff_init(); diff --git a/kernel/torture.c b/kernel/torture.c index f05042036ae8..26058f20ee83 100644 --- a/kernel/torture.c +++ b/kernel/torture.c 
@@ -75,6 +75,157 @@ torture_random(struct torture_random_state *trsp) } EXPORT_SYMBOL_GPL(torture_random); +/* + * Variables for shuffling. The idea is to ensure that each CPU stays + * idle for an extended period to test interactions with dyntick idle, + * as well as interactions with any per-CPU varibles. + */ +struct shuffle_task { + struct list_head st_l; + struct task_struct *st_t; +}; + +static long shuffle_interval; /* In jiffies. */ +static struct task_struct *shuffler_task; +static cpumask_var_t shuffle_tmp_mask; +static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */ +static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list); +static DEFINE_MUTEX(shuffle_task_mutex); + +/* + * Register a task to be shuffled. If there is no memory, just splat + * and don't bother registering. + */ +void torture_shuffle_task_register(struct task_struct *tp) +{ + struct shuffle_task *stp; + + if (WARN_ON_ONCE(tp == NULL)) + return; + stp = kmalloc(sizeof(*stp), GFP_KERNEL); + if (WARN_ON_ONCE(stp == NULL)) + return; + stp->st_t = tp; + mutex_lock(&shuffle_task_mutex); + list_add(&stp->st_l, &shuffle_task_list); + mutex_unlock(&shuffle_task_mutex); +} +EXPORT_SYMBOL_GPL(torture_shuffle_task_register); + +/* + * Unregister all tasks, for example, at the end of the torture run. + */ +static void torture_shuffle_task_unregister_all(void) +{ + struct shuffle_task *stp; + struct shuffle_task *p; + + mutex_lock(&shuffle_task_mutex); + list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) { + list_del(&stp->st_l); + kfree(stp); + } + mutex_unlock(&shuffle_task_mutex); +} + +/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle. + * A special case is when shuffle_idle_cpu = -1, in which case we allow + * the tasks to run on all CPUs. + */ +static void torture_shuffle_tasks(void) +{ + struct shuffle_task *stp; + + cpumask_setall(shuffle_tmp_mask); + get_online_cpus(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + put_online_cpus(); + return; + } + + /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */ + shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); + if (shuffle_idle_cpu >= nr_cpu_ids) + shuffle_idle_cpu = -1; + if (shuffle_idle_cpu != -1) { + cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); + if (cpumask_empty(shuffle_tmp_mask)) { + put_online_cpus(); + return; + } + } + + mutex_lock(&shuffle_task_mutex); + list_for_each_entry(stp, &shuffle_task_list, st_l) + set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); + mutex_unlock(&shuffle_task_mutex); + + put_online_cpus(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU. + */ +static int torture_shuffle(void *arg) +{ + VERBOSE_TOROUT_STRING("torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval); + torture_shuffle_tasks(); + torture_shutdown_absorb("torture_shuffle"); + } while (!torture_must_stop()); + VERBOSE_TOROUT_STRING("torture_shuffle task stopping"); + return 0; +} + +/* + * Start the shuffler, with shuffint in jiffies. 
+ */ +int torture_shuffle_init(long shuffint) +{ + int ret; + + shuffle_interval = shuffint; + + shuffle_idle_cpu = -1; + + if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { + VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); + return -ENOMEM; + } + + /* Create the shuffler thread */ + shuffler_task = kthread_run(torture_shuffle, NULL, "torture_shuffle"); + if (IS_ERR(shuffler_task)) { + ret = PTR_ERR(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + VERBOSE_TOROUT_ERRSTRING("Failed to create shuffler"); + shuffler_task = NULL; + return ret; + } + torture_shuffle_task_register(shuffler_task); + return 0; +} +EXPORT_SYMBOL_GPL(torture_shuffle_init); + +/* + * Stop the shuffling. + */ +void torture_shuffle_cleanup(void) +{ + torture_shuffle_task_unregister_all(); + if (shuffler_task) { + VERBOSE_TOROUT_STRING("Stopping torture_shuffle task"); + kthread_stop(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + } + shuffler_task = NULL; +} +EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); + /* * Absorb kthreads into a kernel function that won't return, so that * they won't ever access module text or data again. -- cgit v1.2.3 From 2e9e8081d2e7a4efb582a240aa7fee991bbbabb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Jan 2014 15:58:22 -0800 Subject: rcutorture: Abstract torture_onoff() Because online/offline torturing is not specific to RCU, this commit abstracts it into the kernel/torture.c module to allow other torture tests to use it. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 12 ++++ kernel/rcu/rcutorture.c | 162 ++--------------------------------------- kernel/torture.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 202 insertions(+), 158 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 544a2c33c1ae..be3694a702e9 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -61,6 +61,18 @@ extern bool verbose; #define VERBOSE_TOROUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) +/* Definitions for a non-string torture-test module parameter. */ +#define torture_parm(type, name, init, msg) \ + static type name = init; \ + module_param(name, type, 0444); \ + MODULE_PARM_DESC(name, msg); + +/* Definitions for online/offline exerciser. */ +int torture_onoff_init(long ooholdoff, long oointerval); +void torture_onoff_cleanup(void); +char *torture_onoff_stats(char *page); +bool torture_onoff_failures(void); + /* Low-rider random number generator. 
*/ struct torture_random_state { unsigned long trs_state; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0380696f1844..0e8b52b71d76 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -110,9 +110,6 @@ static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *shutdown_task; -#ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *onoff_task; -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ static struct task_struct *stall_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; @@ -147,16 +144,6 @@ static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static long n_rcu_torture_timers; -static long n_offline_attempts; -static long n_offline_successes; -static unsigned long sum_offline; -static int min_offline = -1; -static int max_offline; -static long n_online_attempts; -static long n_online_successes; -static unsigned long sum_online; -static int min_online = -1; -static int max_online; static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; @@ -994,13 +981,7 @@ rcu_torture_printk(char *page) n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); - page += sprintf(page, - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); + page = torture_onoff_stats(page); page += sprintf(page, "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, @@ -1204,140 +1185,6 @@ rcu_torture_shutdown(void *arg) return 0; } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Execute random CPU-hotplug operations at the interval specified - * by the onoff_interval. 
- */ -static int -rcu_torture_onoff(void *arg) -{ - int cpu; - unsigned long delta; - int maxcpu = -1; - DEFINE_TORTURE_RANDOM(rand); - int ret; - unsigned long starttime; - - VERBOSE_TOROUT_STRING("rcu_torture_onoff task started"); - for_each_online_cpu(cpu) - maxcpu = cpu; - WARN_ON(maxcpu < 0); - if (onoff_holdoff > 0) { - VERBOSE_TOROUT_STRING("rcu_torture_onoff begin holdoff"); - schedule_timeout_interruptible(onoff_holdoff * HZ); - VERBOSE_TOROUT_STRING("rcu_torture_onoff end holdoff"); - } - while (!kthread_should_stop()) { - cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } - schedule_timeout_interruptible(onoff_interval * HZ); - } - VERBOSE_TOROUT_STRING("rcu_torture_onoff task stopping"); - return 0; -} - -static int -rcu_torture_onoff_init(void) -{ - int ret; - - if (onoff_interval <= 0) - return 0; - onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return ret; - } - torture_shuffle_task_register(onoff_task); - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ - if (onoff_task == NULL) - return; - VERBOSE_TOROUT_STRING("Stopping rcu_torture_onoff task"); - kthread_stop(onoff_task); - onoff_task = NULL; -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static int -rcu_torture_onoff_init(void) -{ - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. @@ -1657,7 +1504,7 @@ rcu_torture_cleanup(void) kthread_stop(shutdown_task); } shutdown_task = NULL; - rcu_torture_onoff_cleanup(); + torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. 
*/ @@ -1668,8 +1515,7 @@ rcu_torture_cleanup(void) if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); - else if (n_online_successes != n_online_attempts || - n_offline_successes != n_offline_attempts) + else if (torture_onoff_failures()) rcu_torture_print_module_parms(cur_ops, "End of test: RCU_HOTPLUG"); else @@ -1935,7 +1781,7 @@ rcu_torture_init(void) torture_shuffle_task_register(shutdown_task); wake_up_process(shutdown_task); } - i = rcu_torture_onoff_init(); + i = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); if (i != 0) { firsterr = i; goto unwind; diff --git a/kernel/torture.c b/kernel/torture.c index 26058f20ee83..a7ec8a7d561e 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -54,6 +54,192 @@ EXPORT_SYMBOL_GPL(fullstop); DEFINE_MUTEX(fullstop_mutex); EXPORT_SYMBOL_GPL(fullstop_mutex); +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Variables for online-offline handling. Only present if CPU hotplug + * is enabled, otherwise does nothing. + */ + +static struct task_struct *onoff_task; +static long onoff_holdoff; +static long onoff_interval; +static long n_offline_attempts; +static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; +static long n_online_attempts; +static long n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; + +/* + * Execute random CPU-hotplug operations at the interval specified + * by the onoff_interval. + */ +static int +torture_onoff(void *arg) +{ + int cpu; + unsigned long delta; + int maxcpu = -1; + DEFINE_TORTURE_RANDOM(rand); + int ret; + unsigned long starttime; + + VERBOSE_TOROUT_STRING("torture_onoff task started"); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (onoff_holdoff > 0) { + VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); + schedule_timeout_interruptible(onoff_holdoff); + VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); + } + while (!torture_must_stop()) { + cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); + if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_offline_attempts++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlined %d\n", + torture_type, cpu); + n_offline_successes++; + delta = jiffies - starttime; + sum_offline += delta; + if (min_offline < 0) { + min_offline = delta; + max_offline = delta; + } + if (min_offline > delta) + min_offline = delta; + if (max_offline < delta) + max_offline = delta; + } + } else if (cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_online_attempts++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlined %d\n", + torture_type, cpu); + n_online_successes++; + delta = jiffies - starttime; + sum_online += delta; + if (min_online < 0) { + min_online = delta; + max_online = delta; + } + if (min_online > delta) + min_online = delta; + if 
(max_online < delta) + max_online = delta; + } + } + schedule_timeout_interruptible(onoff_interval); + } + VERBOSE_TOROUT_STRING("torture_onoff task stopping"); + return 0; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Initiate online-offline handling. + */ +int torture_onoff_init(long ooholdoff, long oointerval) +{ +#ifdef CONFIG_HOTPLUG_CPU + int ret; + + onoff_holdoff = ooholdoff; + onoff_interval = oointerval; + if (onoff_interval <= 0) + return 0; + onoff_task = kthread_run(torture_onoff, NULL, "torture_onoff"); + if (IS_ERR(onoff_task)) { + ret = PTR_ERR(onoff_task); + onoff_task = NULL; + return ret; + } + torture_shuffle_task_register(onoff_task); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + return 0; +} +EXPORT_SYMBOL_GPL(torture_onoff_init); + +/* + * Clean up after online/offline testing. + */ +void torture_onoff_cleanup(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task == NULL) + return; + VERBOSE_TOROUT_STRING("Stopping torture_onoff task"); + kthread_stop(onoff_task); + onoff_task = NULL; +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +} +EXPORT_SYMBOL_GPL(torture_onoff_cleanup); + +/* + * Print online/offline testing statistics. + */ +char *torture_onoff_stats(char *page) +{ +#ifdef CONFIG_HOTPLUG_CPU + page += sprintf(page, + "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + return page; +} +EXPORT_SYMBOL_GPL(torture_onoff_stats); + +/* + * Were all the online/offline operations successful? + */ +bool torture_onoff_failures(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + return n_online_successes != n_online_attempts || + n_offline_successes != n_offline_attempts; +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + return false; +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ +} +EXPORT_SYMBOL_GPL(torture_onoff_failures); + #define TORTURE_RANDOM_MULT 39916801 /* prime */ #define TORTURE_RANDOM_ADD 479001701 /* prime */ #define TORTURE_RANDOM_REFRESH 10000 -- cgit v1.2.3 From b5daa8f3b3b2b0133ad40e13d4f722070119ce36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 13:38:09 -0800 Subject: rcutorture: Abstract torture-test initialization This commit creates torture_init_begin() and torture_init_end() functions to abstract locking and allow the torture_type and verbose variables in kernel/torture.o to become static. With a bit more abstraction, fullstop_mutex will also become static. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 9 +++++---- kernel/rcu/rcutorture.c | 19 ++++++++----------- kernel/torture.c | 27 +++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index be3694a702e9..428d2712be04 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -49,10 +49,6 @@ extern int fullstop; /* Protect fullstop transitions and spawning of kthreads. */ extern struct mutex fullstop_mutex; -/* Common module parameters. */ -extern char *torture_type; -extern bool verbose; - #define TORTURE_FLAG "-torture:" #define TOROUT_STRING(s) \ pr_alert("%s" TORTURE_FLAG s "\n", torture_type) @@ -89,4 +85,9 @@ void torture_shuffle_cleanup(void); /* Shutdown task absorption, for when the tasks cannot safely be killed. */ void torture_shutdown_absorb(const char *title); +/* Initialization and cleanup. 
*/ + +void torture_init_begin(char *ttype, bool v); +void torture_init_end(void); + #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0e8b52b71d76..93aca2f9261e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -91,15 +91,12 @@ torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); +torture_param(bool, verbose, true, + "Enable verbose debugging printk()s"); -char *torture_type = "rcu"; -EXPORT_SYMBOL_GPL(torture_type); +static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -bool verbose; -EXPORT_SYMBOL_GPL(verbose); -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); static int nrealreaders; static struct task_struct *writer_task; @@ -1425,8 +1422,8 @@ rcu_torture_cleanup(void) { int i; - mutex_lock(&fullstop_mutex); rcutorture_record_test_transition(); + mutex_lock(&fullstop_mutex); if (fullstop == FULLSTOP_SHUTDOWN) { pr_warn(/* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); @@ -1589,7 +1586,7 @@ rcu_torture_init(void) &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, }; - mutex_lock(&fullstop_mutex); + torture_init_begin(torture_type, verbose); /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { @@ -1604,7 +1601,7 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - mutex_unlock(&fullstop_mutex); + torture_init_end(); return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { @@ -1800,11 +1797,11 @@ rcu_torture_init(void) if (object_debug) rcu_test_debug_objects(); rcutorture_record_test_transition(); - mutex_unlock(&fullstop_mutex); + torture_init_end(); return 0; unwind: - mutex_unlock(&fullstop_mutex); + torture_init_end(); rcu_torture_cleanup(); return firsterr; } diff --git a/kernel/torture.c b/kernel/torture.c index a7ec8a7d561e..828d0b1a49b8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -49,6 +49,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); +static char *torture_type; +static bool verbose; + int fullstop = FULLSTOP_RMMOD; EXPORT_SYMBOL_GPL(fullstop); DEFINE_MUTEX(fullstop_mutex); @@ -426,3 +429,27 @@ void torture_shutdown_absorb(const char *title) } } EXPORT_SYMBOL_GPL(torture_shutdown_absorb); + +/* + * Initialize torture module. Please note that this is -not- invoked via + * the usual module_init() mechanism, but rather by an explicit call from + * the client torture module. This call must be paired with a later + * torture_init_end(). + */ +void __init torture_init_begin(char *ttype, bool v) +{ + mutex_lock(&fullstop_mutex); + torture_type = ttype; + verbose = v; + +} +EXPORT_SYMBOL_GPL(torture_init_begin); + +/* + * Tell the torture module that initialization is complete. + */ +void __init torture_init_end(void) +{ + mutex_unlock(&fullstop_mutex); +} +EXPORT_SYMBOL_GPL(torture_init_end); -- cgit v1.2.3 From cc47ae0830264f07442070b36fe0d0a4d4e3c313 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 14:21:11 -0800 Subject: rcutorture: Abstract torture-test cleanup This commit creates a torture_cleanup() that handles the generic cleanup actions local to kernel/torture.c. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 4 +--- kernel/rcu/rcutorture.c | 11 +---------- kernel/torture.c | 30 ++++++++++++++++++++++++++++-- 3 files changed, 30 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 428d2712be04..31d69930e5cf 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -65,7 +65,6 @@ extern struct mutex fullstop_mutex; /* Definitions for online/offline exerciser. */ int torture_onoff_init(long ooholdoff, long oointerval); -void torture_onoff_cleanup(void); char *torture_onoff_stats(char *page); bool torture_onoff_failures(void); @@ -80,14 +79,13 @@ unsigned long torture_random(struct torture_random_state *trsp); /* Task shuffler, which causes CPUs to occasionally go idle. */ void torture_shuffle_task_register(struct task_struct *tp); int torture_shuffle_init(long shuffint); -void torture_shuffle_cleanup(void); /* Shutdown task absorption, for when the tasks cannot safely be killed. */ void torture_shutdown_absorb(const char *title); /* Initialization and cleanup. */ - void torture_init_begin(char *ttype, bool v); void torture_init_end(void); +bool torture_cleanup(void); #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 93aca2f9261e..68a689fc6ffa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1423,21 +1423,13 @@ rcu_torture_cleanup(void) int i; rcutorture_record_test_transition(); - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_SHUTDOWN) { - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - schedule_timeout_uninterruptible(10); + if (torture_cleanup()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); return; } - fullstop = FULLSTOP_RMMOD; - mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_shutdown_nb); - torture_shuffle_cleanup(); /* Must be first task cleaned up. */ rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); if (stutter_task) { @@ -1501,7 +1493,6 @@ rcu_torture_cleanup(void) kthread_stop(shutdown_task); } shutdown_task = NULL; - torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. */ diff --git a/kernel/torture.c b/kernel/torture.c index 828d0b1a49b8..41ae5cc3c4c3 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -199,7 +199,7 @@ EXPORT_SYMBOL_GPL(torture_onoff_init); /* * Clean up after online/offline testing. */ -void torture_onoff_cleanup(void) +static void torture_onoff_cleanup(void) { #ifdef CONFIG_HOTPLUG_CPU if (onoff_task == NULL) @@ -403,7 +403,7 @@ EXPORT_SYMBOL_GPL(torture_shuffle_init); /* * Stop the shuffling. */ -void torture_shuffle_cleanup(void) +static void torture_shuffle_cleanup(void) { torture_shuffle_task_unregister_all(); if (shuffler_task) { @@ -453,3 +453,29 @@ void __init torture_init_end(void) mutex_unlock(&fullstop_mutex); } EXPORT_SYMBOL_GPL(torture_init_end); + +/* + * Clean up torture module. Please note that this is -not- invoked via + * the usual module_exit() mechanism, but rather by an explicit call from + * the client torture module. Returns true if a race with system shutdown + * is detected. + * + * This must be called before the caller starts shutting down its own + * kthreads. 
+ */ +bool torture_cleanup(void) +{ + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_SHUTDOWN) { + pr_warn("Concurrent rmmod and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + schedule_timeout_uninterruptible(10); + return true; + } + fullstop = FULLSTOP_RMMOD; + mutex_unlock(&fullstop_mutex); + torture_shuffle_cleanup(); + torture_onoff_cleanup(); + return false; +} +EXPORT_SYMBOL_GPL(torture_cleanup); -- cgit v1.2.3 From 4622b487ecf0094401ac10e504606e5cbdea5a6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 15:37:19 -0800 Subject: rcutorture: Abstract torture_shutdown_notify() Because handling the race between rmmod and system shutdown is not specific to RCU, this commit abstracts torture_shutdown_notify(), placing this code into kernel/torture.c. This change also allows fullstop_mutex to be private to kernel/torture.c. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 2 -- kernel/rcu/rcutorture.c | 23 ----------------------- kernel/torture.c | 29 ++++++++++++++++++++++++----- 3 files changed, 24 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 31d69930e5cf..742d8a402f19 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -46,8 +46,6 @@ #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ extern int fullstop; -/* Protect fullstop transitions and spawning of kthreads. */ -extern struct mutex fullstop_mutex; #define TORTURE_FLAG "-torture:" #define TOROUT_STRING(s) \ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 68a689fc6ffa..2560e9313887 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -189,23 +189,6 @@ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); /* Forward reference. */ static void rcu_torture_cleanup(void); -/* - * Detect and respond to a system shutdown. - */ -static int -rcutorture_shutdown_notify(struct notifier_block *unused1, - unsigned long unused2, void *unused3) -{ - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) - fullstop = FULLSTOP_SHUTDOWN; - else - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - return NOTIFY_DONE; -} - /* * Allocate an element from the rcu_tortures pool. 
*/ @@ -1098,10 +1081,6 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) onoff_interval, onoff_holdoff); } -static struct notifier_block rcutorture_shutdown_nb = { - .notifier_call = rcutorture_shutdown_notify, -}; - static void rcutorture_booster_cleanup(int cpu) { struct task_struct *t; @@ -1428,7 +1407,6 @@ rcu_torture_cleanup(void) cur_ops->cb_barrier(); return; } - unregister_reboot_notifier(&rcutorture_shutdown_nb); rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); @@ -1774,7 +1752,6 @@ rcu_torture_init(void) firsterr = i; goto unwind; } - register_reboot_notifier(&rcutorture_shutdown_nb); i = rcu_torture_stall_init(); if (i != 0) { firsterr = i; diff --git a/kernel/torture.c b/kernel/torture.c index 41ae5cc3c4c3..b02fa2785bbb 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -54,8 +54,7 @@ static bool verbose; int fullstop = FULLSTOP_RMMOD; EXPORT_SYMBOL_GPL(fullstop); -DEFINE_MUTEX(fullstop_mutex); -EXPORT_SYMBOL_GPL(fullstop_mutex); +static DEFINE_MUTEX(fullstop_mutex); #ifdef CONFIG_HOTPLUG_CPU @@ -422,14 +421,32 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); void torture_shutdown_absorb(const char *title) { while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "torture thread %s parking due to system shutdown\n", - title); + pr_notice("torture thread %s parking due to system shutdown\n", + title); schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); } } EXPORT_SYMBOL_GPL(torture_shutdown_absorb); +/* + * Detect and respond to a system shutdown. + */ +static int torture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_DONTSTOP) + fullstop = FULLSTOP_SHUTDOWN; + else + pr_warn("Concurrent rmmod and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + return NOTIFY_DONE; +} + +static struct notifier_block torture_shutdown_nb = { + .notifier_call = torture_shutdown_notify, +}; + /* * Initialize torture module. Please note that this is -not- invoked via * the usual module_init() mechanism, but rather by an explicit call from @@ -451,6 +468,7 @@ EXPORT_SYMBOL_GPL(torture_init_begin); void __init torture_init_end(void) { mutex_unlock(&fullstop_mutex); + register_reboot_notifier(&torture_shutdown_nb); } EXPORT_SYMBOL_GPL(torture_init_end); @@ -474,6 +492,7 @@ bool torture_cleanup(void) } fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); + unregister_reboot_notifier(&torture_shutdown_nb); torture_shuffle_cleanup(); torture_onoff_cleanup(); return false; -- cgit v1.2.3 From 36970bb91d89618d3495babf44b934e9c9db6bbc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 15:49:29 -0800 Subject: rcutorture: Privatize fullstop This commit introduces the torture_must_stop() function in order to keep use of the fullstop variable local to kernel/torture.c. There is also a torture_must_stop_irq() counterpart for use from RCU callbacks, timeout handlers, and the like. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 8 ++------ kernel/rcu/rcutorture.c | 38 +++++++++++++++----------------------- kernel/torture.c | 27 +++++++++++++++++++++++++-- 3 files changed, 42 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 742d8a402f19..0259db38bfb0 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -41,12 +41,6 @@ module_param(name, type, 0444); \ MODULE_PARM_DESC(name, msg); -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -extern int fullstop; - #define TORTURE_FLAG "-torture:" #define TOROUT_STRING(s) \ pr_alert("%s" TORTURE_FLAG s "\n", torture_type) @@ -85,5 +79,7 @@ void torture_shutdown_absorb(const char *title); void torture_init_begin(char *ttype, bool v); void torture_init_end(void); bool torture_cleanup(void); +bool torture_must_stop(void); +bool torture_must_stop_irq(void); #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2560e9313887..9357c88cc8cc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -304,7 +304,7 @@ rcu_torture_cb(struct rcu_head *p) int i; struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - if (fullstop != FULLSTOP_DONTSTOP) { + if (torture_must_stop_irq()) { /* Test is ending, just drop callbacks on the floor. */ /* The next initialization will pick up the pieces. */ return; @@ -572,8 +572,7 @@ static int rcu_torture_boost(void *arg) while (ULONG_CMP_LT(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) + if (torture_must_stop()) goto checkwait; } @@ -595,8 +594,7 @@ static int rcu_torture_boost(void *arg) } cond_resched(); rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) + if (torture_must_stop()) goto checkwait; } @@ -621,7 +619,7 @@ static int rcu_torture_boost(void *arg) /* Go do the stutter. */ checkwait: rcu_stutter_wait("rcu_torture_boost"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); /* Clean up and exit. 
*/ VERBOSE_TOROUT_STRING("rcu_torture_boost task stopping"); @@ -659,7 +657,7 @@ rcu_torture_fqs(void *arg) fqs_burst_remaining -= fqs_holdoff; } rcu_stutter_wait("rcu_torture_fqs"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); torture_shutdown_absorb("rcu_torture_fqs"); while (!kthread_should_stop()) @@ -731,7 +729,7 @@ rcu_torture_writer(void *arg) } rcutorture_record_progress(++rcu_torture_current_version); rcu_stutter_wait("rcu_torture_writer"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); torture_shutdown_absorb("rcu_torture_writer"); while (!kthread_should_stop()) @@ -768,7 +766,7 @@ rcu_torture_fakewriter(void *arg) cur_ops->exp_sync(); } rcu_stutter_wait("rcu_torture_fakewriter"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); torture_shutdown_absorb("rcu_torture_fakewriter"); @@ -913,7 +911,7 @@ rcu_torture_reader(void *arg) cur_ops->readunlock(idx); schedule(); rcu_stutter_wait("rcu_torture_reader"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); torture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irq_capable) @@ -1022,9 +1020,6 @@ rcu_torture_stats_print(void) /* * Periodically prints torture statistics, if periodic statistics printing * was specified via the stat_interval module parameter. - * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. */ static int rcu_torture_stats(void *arg) @@ -1034,7 +1029,7 @@ rcu_torture_stats(void *arg) schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); torture_shutdown_absorb("rcu_torture_stats"); - } while (!kthread_should_stop()); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_stats task stopping"); return 0; } @@ -1241,16 +1236,15 @@ static int rcu_torture_barrier_cbs(void *arg) wait_event(barrier_cbs_wq[myid], (newphase = ACCESS_ONCE(barrier_phase)) != lastphase || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); + torture_must_stop()); lastphase = newphase; smp_mb(); /* ensure barrier_phase load before ->call(). */ - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + if (torture_must_stop()) break; cur_ops->call(&rcu, rcu_torture_barrier_cbf); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task stopping"); torture_shutdown_absorb("rcu_torture_barrier_cbs"); while (!kthread_should_stop()) @@ -1275,9 +1269,8 @@ static int rcu_torture_barrier(void *arg) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, atomic_read(&barrier_cbs_count) == 0 || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + torture_must_stop()); + if (torture_must_stop()) break; n_barrier_attempts++; cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). 
*/ @@ -1287,7 +1280,7 @@ static int rcu_torture_barrier(void *arg) } n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_barrier task stopping"); torture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) @@ -1585,7 +1578,6 @@ rcu_torture_init(void) else nrealreaders = 2 * num_online_cpus(); rcu_torture_print_module_parms(cur_ops, "Start of test"); - fullstop = FULLSTOP_DONTSTOP; /* Set up the freelist. */ diff --git a/kernel/torture.c b/kernel/torture.c index b02fa2785bbb..ed360cf948da 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -52,8 +52,11 @@ MODULE_AUTHOR("Paul E. McKenney "); static char *torture_type; static bool verbose; -int fullstop = FULLSTOP_RMMOD; -EXPORT_SYMBOL_GPL(fullstop); +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ +static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); #ifdef CONFIG_HOTPLUG_CPU @@ -458,6 +461,7 @@ void __init torture_init_begin(char *ttype, bool v) mutex_lock(&fullstop_mutex); torture_type = ttype; verbose = v; + fullstop = FULLSTOP_DONTSTOP; } EXPORT_SYMBOL_GPL(torture_init_begin); @@ -498,3 +502,22 @@ bool torture_cleanup(void) return false; } EXPORT_SYMBOL_GPL(torture_cleanup); + +/* + * Is it time for the current torture test to stop? + */ +bool torture_must_stop(void) +{ + return torture_must_stop_irq() || kthread_should_stop(); +} +EXPORT_SYMBOL_GPL(torture_must_stop); + +/* + * Is it time for the current torture test to stop? This is the irq-safe + * version, hence no check for kthread_should_stop(). + */ +bool torture_must_stop_irq(void) +{ + return fullstop != FULLSTOP_DONTSTOP; +} +EXPORT_SYMBOL_GPL(torture_must_stop_irq); -- cgit v1.2.3 From 628edaa5062282b6e3d76c886fd2cbccae5cb87b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 11:57:43 -0800 Subject: rcutorture: Abstract stutter_wait() Because stuttering the test load (stopping and restarting it) is useful for non-RCU testing, this commit moves the load-stuttering functionality to kernel/torture.c. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 7 +++- kernel/rcu/rcutorture.c | 69 +++++++-------------------------------- kernel/torture.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 104 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 0259db38bfb0..203f127d9ddf 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -75,8 +75,13 @@ int torture_shuffle_init(long shuffint); /* Shutdown task absorption, for when the tasks cannot safely be killed. */ void torture_shutdown_absorb(const char *title); +/* Task stuttering, which forces load/no-load transitions. */ +void stutter_wait(const char *title); +int torture_stutter_init(int s); +void torture_stutter_cleanup(void); + /* Initialization and cleanup. 
*/ -void torture_init_begin(char *ttype, bool v); +void torture_init_begin(char *ttype, bool v, int *runnable); void torture_init_end(void); bool torture_cleanup(void); bool torture_must_stop(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9357c88cc8cc..4329ad14f8dc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -103,7 +103,6 @@ static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; -static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *shutdown_task; @@ -145,8 +144,6 @@ static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; -static int stutter_pause_test; - #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 #else @@ -222,18 +219,6 @@ rcu_torture_free(struct rcu_torture *p) spin_unlock_bh(&rcu_torture_lock); } -static void -rcu_stutter_wait(const char *title) -{ - while (stutter_pause_test || !rcutorture_runnable) { - if (rcutorture_runnable) - schedule_timeout_interruptible(1); - else - schedule_timeout_interruptible(round_jiffies_relative(HZ)); - torture_shutdown_absorb(title); - } -} - /* * Operations vector for selecting different types of tests. */ @@ -571,7 +556,7 @@ static int rcu_torture_boost(void *arg) oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); - rcu_stutter_wait("rcu_torture_boost"); + stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; } @@ -593,7 +578,7 @@ static int rcu_torture_boost(void *arg) call_rcu_time = jiffies; } cond_resched(); - rcu_stutter_wait("rcu_torture_boost"); + stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; } @@ -618,7 +603,7 @@ static int rcu_torture_boost(void *arg) } /* Go do the stutter. */ -checkwait: rcu_stutter_wait("rcu_torture_boost"); +checkwait: stutter_wait("rcu_torture_boost"); } while (!torture_must_stop()); /* Clean up and exit. 
*/ @@ -656,7 +641,7 @@ rcu_torture_fqs(void *arg) udelay(fqs_holdoff); fqs_burst_remaining -= fqs_holdoff; } - rcu_stutter_wait("rcu_torture_fqs"); + stutter_wait("rcu_torture_fqs"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); torture_shutdown_absorb("rcu_torture_fqs"); @@ -728,7 +713,7 @@ rcu_torture_writer(void *arg) } } rcutorture_record_progress(++rcu_torture_current_version); - rcu_stutter_wait("rcu_torture_writer"); + stutter_wait("rcu_torture_writer"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); torture_shutdown_absorb("rcu_torture_writer"); @@ -765,7 +750,7 @@ rcu_torture_fakewriter(void *arg) } else { cur_ops->exp_sync(); } - rcu_stutter_wait("rcu_torture_fakewriter"); + stutter_wait("rcu_torture_fakewriter"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); @@ -910,7 +895,7 @@ rcu_torture_reader(void *arg) preempt_enable(); cur_ops->readunlock(idx); schedule(); - rcu_stutter_wait("rcu_torture_reader"); + stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); torture_shutdown_absorb("rcu_torture_reader"); @@ -1034,25 +1019,6 @@ rcu_torture_stats(void *arg) return 0; } -/* Cause the rcutorture test to "stutter", starting and stopping all - * threads periodically. - */ -static int -rcu_torture_stutter(void *arg) -{ - VERBOSE_TOROUT_STRING("rcu_torture_stutter task started"); - do { - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 1; - if (!kthread_should_stop()) - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 0; - torture_shutdown_absorb("rcu_torture_stutter"); - } while (!kthread_should_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_stutter task stopping"); - return 0; -} - static inline void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { @@ -1403,11 +1369,7 @@ rcu_torture_cleanup(void) rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); - if (stutter_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_stutter task"); - kthread_stop(stutter_task); - } - stutter_task = NULL; + torture_stutter_cleanup(); if (writer_task) { VERBOSE_TOROUT_STRING("Stopping rcu_torture_writer task"); @@ -1548,7 +1510,7 @@ rcu_torture_init(void) &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, }; - torture_init_begin(torture_type, verbose); + torture_init_begin(torture_type, verbose, &rcutorture_runnable); /* Process args and tell the world that the torturer is on the job. 
*/ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { @@ -1682,21 +1644,14 @@ rcu_torture_init(void) if (stutter < 0) stutter = 0; if (stutter) { - /* Create the stutter thread */ - stutter_task = kthread_run(rcu_torture_stutter, NULL, - "rcu_torture_stutter"); - if (IS_ERR(stutter_task)) { - firsterr = PTR_ERR(stutter_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; + firsterr = torture_stutter_init(stutter * HZ); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(stutter_task); } if (fqs_duration < 0) fqs_duration = 0; if (fqs_duration) { - /* Create the stutter thread */ + /* Create the fqs thread */ fqs_task = kthread_run(rcu_torture_fqs, NULL, "rcu_torture_fqs"); if (IS_ERR(fqs_task)) { diff --git a/kernel/torture.c b/kernel/torture.c index d51de3029a5c..b30c2ee78580 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -58,6 +58,7 @@ static bool verbose; #define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); +static int *torture_runnable; #ifdef CONFIG_HOTPLUG_CPU @@ -452,17 +453,101 @@ static struct notifier_block torture_shutdown_nb = { .notifier_call = torture_shutdown_notify, }; +/* + * Variables for stuttering, which means to periodically pause and + * restart testing in order to catch bugs that appear when load is + * suddenly applied to or removed from the system. + */ +static struct task_struct *stutter_task; +static int stutter_pause_test; +static int stutter; + +/* + * Block until the stutter interval ends. This must be called periodically + * by all running kthreads that need to be subject to stuttering. + */ +void stutter_wait(const char *title) +{ + while (ACCESS_ONCE(stutter_pause_test) || + (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { + if (stutter_pause_test) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(round_jiffies_relative(HZ)); + torture_shutdown_absorb(title); + } +} +EXPORT_SYMBOL_GPL(stutter_wait); + +/* + * Cause the torture test to "stutter", starting and stopping all + * threads periodically. + */ +static int torture_stutter(void *arg) +{ + VERBOSE_TOROUT_STRING("torture_stutter task started"); + do { + if (!torture_must_stop()) { + schedule_timeout_interruptible(stutter); + ACCESS_ONCE(stutter_pause_test) = 1; + } + if (!torture_must_stop()) + schedule_timeout_interruptible(stutter); + ACCESS_ONCE(stutter_pause_test) = 0; + torture_shutdown_absorb("torture_stutter"); + } while (!torture_must_stop()); + VERBOSE_TOROUT_STRING("torture_stutter task stopping"); + return 0; +} + +/* + * Initialize and kick off the torture_stutter kthread. + */ +int torture_stutter_init(int s) +{ + int ret; + + stutter = s; + stutter_task = kthread_run(torture_stutter, NULL, "torture_stutter"); + if (IS_ERR(stutter_task)) { + ret = PTR_ERR(stutter_task); + VERBOSE_TOROUT_ERRSTRING("Failed to create stutter"); + stutter_task = NULL; + return ret; + } + torture_shuffle_task_register(stutter_task); + return 0; +} +EXPORT_SYMBOL_GPL(torture_stutter_init); + +/* + * Cleanup after the torture_stutter kthread. + */ +void torture_stutter_cleanup(void) +{ + if (!stutter_task) + return; + VERBOSE_TOROUT_STRING("Stopping torture_stutter task"); + kthread_stop(stutter_task); + stutter_task = NULL; +} +EXPORT_SYMBOL_GPL(torture_stutter_cleanup); + /* * Initialize torture module. Please note that this is -not- invoked via * the usual module_init() mechanism, but rather by an explicit call from * the client torture module. 
This call must be paired with a later * torture_init_end(). + * + * The runnable parameter points to a flag that controls whether or not + * the test is currently runnable. If there is no such flag, pass in NULL. */ -void __init torture_init_begin(char *ttype, bool v) +void __init torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); torture_type = ttype; verbose = v; + torture_runnable = runnable; fullstop = FULLSTOP_DONTSTOP; } -- cgit v1.2.3 From e991dbc0770b01b7dc7d6d7660442e83ebd11828 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 14:52:13 -0800 Subject: rcutorture: Abstract torture_shutdown() Because auto-shutdown of torture testing is not specific to RCU, this commit moves the auto-shutdown function to kernel/torture.c. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 4 ++- kernel/rcu/rcutorture.c | 63 +++---------------------------------- kernel/torture.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 203f127d9ddf..c79c41d543ef 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -72,8 +72,10 @@ unsigned long torture_random(struct torture_random_state *trsp); void torture_shuffle_task_register(struct task_struct *tp); int torture_shuffle_init(long shuffint); -/* Shutdown task absorption, for when the tasks cannot safely be killed. */ +/* Test auto-shutdown handling. */ void torture_shutdown_absorb(const char *title); +int torture_shutdown_init(int ssecs, void (*cleanup)(void)); +void torture_shutdown_cleanup(void); /* Task stuttering, which forces load/no-load transitions. */ void stutter_wait(const char *title); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4329ad14f8dc..897b0f91f899 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -105,7 +105,6 @@ static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; -static struct task_struct *shutdown_task; static struct task_struct *stall_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; @@ -173,7 +172,6 @@ static u64 notrace rcu_trace_clock_local(void) } #endif /* #else #ifdef CONFIG_RCU_TRACE */ -static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ @@ -183,9 +181,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -/* Forward reference. */ -static void rcu_torture_cleanup(void); - /* * Allocate an element from the rcu_tortures pool. */ @@ -1086,42 +1081,6 @@ static int rcutorture_booster_init(int cpu) return 0; } -/* - * Cause the rcutorture test to shutdown the system after the test has - * run for the time specified by the shutdown_secs module parameter. 
- */ -static int -rcu_torture_shutdown(void *arg) -{ - long delta; - unsigned long jiffies_snap; - - VERBOSE_TOROUT_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && - !kthread_should_stop()) { - delta = shutdown_time - jiffies_snap; - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); - } - if (kthread_should_stop()) { - VERBOSE_TOROUT_STRING("rcu_torture_shutdown task stopping"); - return 0; - } - - /* OK, shut down the system. */ - - VERBOSE_TOROUT_STRING("rcu_torture_shutdown task shutting down system"); - shutdown_task = NULL; /* Avoid self-kill deadlock. */ - rcu_torture_cleanup(); /* Get the success/failure message. */ - kernel_power_off(); /* Shut down the system. */ - return 0; -} - /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. @@ -1421,11 +1380,7 @@ rcu_torture_cleanup(void) for_each_possible_cpu(i) rcutorture_booster_cleanup(i); } - if (shutdown_task != NULL) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_shutdown task"); - kthread_stop(shutdown_task); - } - shutdown_task = NULL; + torture_shutdown_cleanup(); /* Wait for all RCU callbacks to fire. */ @@ -1681,18 +1636,10 @@ rcu_torture_init(void) } } } - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); - if (IS_ERR(shutdown_task)) { - firsterr = PTR_ERR(shutdown_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - goto unwind; - } - torture_shuffle_task_register(shutdown_task); - wake_up_process(shutdown_task); + i = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); + if (i != 0) { + firsterr = i; + goto unwind; } i = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); if (i != 0) { diff --git a/kernel/torture.c b/kernel/torture.c index 1bafd02d1eed..df2c700e96e4 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -418,6 +418,15 @@ static void torture_shuffle_cleanup(void) } EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); +/* + * Variables for auto-shutdown. This allows "lights out" torture runs + * to be fully scripted. + */ +static int shutdown_secs; /* desired test duration in seconds. */ +static struct task_struct *shutdown_task; +static unsigned long shutdown_time; /* jiffies to system shutdown. */ +static void (*torture_shutdown_hook)(void); + /* * Absorb kthreads into a kernel function that won't return, so that * they won't ever access module text or data again. @@ -432,6 +441,81 @@ void torture_shutdown_absorb(const char *title) } EXPORT_SYMBOL_GPL(torture_shutdown_absorb); +/* + * Cause the torture test to shutdown the system after the test has + * run for the time specified by the shutdown_secs parameter. 
+ */ +static int torture_shutdown(void *arg) +{ + long delta; + unsigned long jiffies_snap; + + VERBOSE_TOROUT_STRING("torture_shutdown task started"); + jiffies_snap = jiffies; + while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + !torture_must_stop()) { + delta = shutdown_time - jiffies_snap; + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_shutdown task: %lu jiffies remaining\n", + torture_type, delta); + schedule_timeout_interruptible(delta); + jiffies_snap = jiffies; + } + if (torture_must_stop()) { + VERBOSE_TOROUT_STRING("torture_shutdown task stopping"); + return 0; + } + + /* OK, shut down the system. */ + + VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system"); + shutdown_task = NULL; /* Avoid self-kill deadlock. */ + torture_shutdown_hook();/* Shut down the enclosing torture test. */ + kernel_power_off(); /* Shut down the system. */ + return 0; +} + +/* + * Start up the shutdown task. + */ +int torture_shutdown_init(int ssecs, void (*cleanup)(void)) +{ + int ret; + + shutdown_secs = ssecs; + torture_shutdown_hook = cleanup; + if (shutdown_secs > 0) { + shutdown_time = jiffies + shutdown_secs * HZ; + shutdown_task = kthread_create(torture_shutdown, NULL, + "torture_shutdown"); + if (IS_ERR(shutdown_task)) { + ret = PTR_ERR(shutdown_task); + VERBOSE_TOROUT_ERRSTRING("Failed to create shutdown"); + shutdown_task = NULL; + return ret; + } + torture_shuffle_task_register(shutdown_task); + wake_up_process(shutdown_task); + } + return 0; +} +EXPORT_SYMBOL_GPL(torture_shutdown_init); + +/* + * Shut down the shutdown task. Say what??? Heh! This can happen if + * the torture module gets an rmmod before the shutdown time arrives. ;-) + */ +void torture_shutdown_cleanup(void) +{ + if (shutdown_task != NULL) { + VERBOSE_TOROUT_STRING("Stopping torture_shutdown task"); + kthread_stop(shutdown_task); + } + shutdown_task = NULL; +} +EXPORT_SYMBOL_GPL(torture_shutdown_cleanup); + /* * Detect and respond to a system shutdown. */ -- cgit v1.2.3 From 7fafaac5b9ce22cc57777865390520476ad2262d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 17:37:28 -0800 Subject: rcutorture: Fix rcutorture shutdown races Not all of the rcutorture kthreads waited for kthread_should_stop() before returning from their top-level functions, and none of them used torture_shutdown_absorb() properly. These problems can result in segfaults and hangs at shutdown time, and some recent changes perturbed timing sufficiently to make them much more probable. This commit therefore creates a torture_kthread_stopping() function that does the proper kthread shutdown dance in one centralized location. Accommodate this grouping by making VERBOSE_TOROUT_STRING() capable of taking a non-const string as its argument, which allows the new torture_kthread_stopping() to pass its "title" argument directly to the updated version of VERBOSE_TOROUT_STRING(). Signed-off-by: Paul E. 
McKenney --- include/linux/torture.h | 3 ++- kernel/rcu/rcutorture.c | 39 +++++++++++---------------------------- kernel/torture.c | 26 ++++++++++++++++++++++---- 3 files changed, 35 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index c79c41d543ef..2ea11094daf6 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -45,7 +45,7 @@ #define TOROUT_STRING(s) \ pr_alert("%s" TORTURE_FLAG s "\n", torture_type) #define VERBOSE_TOROUT_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) + do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0) #define VERBOSE_TOROUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) @@ -88,5 +88,6 @@ void torture_init_end(void); bool torture_cleanup(void); bool torture_must_stop(void); bool torture_must_stop_irq(void); +void torture_kthread_stopping(char *title); #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6e9ba51b23b9..bcaafd6cf633 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -602,12 +602,13 @@ checkwait: stutter_wait("rcu_torture_boost"); } while (!torture_must_stop()); /* Clean up and exit. */ - VERBOSE_TOROUT_STRING("rcu_torture_boost task stopping"); - torture_shutdown_absorb("rcu_torture_boost"); - while (!kthread_should_stop() || rbi.inflight) + while (!kthread_should_stop() || rbi.inflight) { + torture_shutdown_absorb("rcu_torture_boost"); schedule_timeout_uninterruptible(1); + } smp_mb(); /* order accesses to ->inflight before stack-frame death. */ destroy_rcu_head_on_stack(&rbi.rcu); + torture_kthread_stopping("rcu_torture_boost"); return 0; } @@ -638,10 +639,7 @@ rcu_torture_fqs(void *arg) } stutter_wait("rcu_torture_fqs"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); - torture_shutdown_absorb("rcu_torture_fqs"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_fqs"); return 0; } @@ -710,10 +708,7 @@ rcu_torture_writer(void *arg) rcutorture_record_progress(++rcu_torture_current_version); stutter_wait("rcu_torture_writer"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); - torture_shutdown_absorb("rcu_torture_writer"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_writer"); return 0; } @@ -748,10 +743,7 @@ rcu_torture_fakewriter(void *arg) stutter_wait("rcu_torture_fakewriter"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); - torture_shutdown_absorb("rcu_torture_fakewriter"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_fakewriter"); return 0; } @@ -892,12 +884,9 @@ rcu_torture_reader(void *arg) schedule(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); - torture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_reader"); return 0; } @@ -1010,7 +999,7 @@ rcu_torture_stats(void *arg) rcu_torture_stats_print(); torture_shutdown_absorb("rcu_torture_stats"); } while (!torture_must_stop()); - 
VERBOSE_TOROUT_STRING("rcu_torture_stats task stopping"); + torture_kthread_stopping("rcu_torture_stats"); return 0; } @@ -1171,12 +1160,9 @@ static int rcu_torture_barrier_cbs(void *arg) if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task stopping"); - torture_shutdown_absorb("rcu_torture_barrier_cbs"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); cur_ops->cb_barrier(); destroy_rcu_head_on_stack(&rcu); + torture_kthread_stopping("rcu_torture_barrier_cbs"); return 0; } @@ -1207,10 +1193,7 @@ static int rcu_torture_barrier(void *arg) n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_barrier task stopping"); - torture_shutdown_absorb("rcu_torture_barrier"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); + torture_kthread_stopping("rcu_torture_barrier"); return 0; } diff --git a/kernel/torture.c b/kernel/torture.c index 5e2838f902f9..330576660cf4 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -169,7 +169,7 @@ torture_onoff(void *arg) } schedule_timeout_interruptible(onoff_interval); } - VERBOSE_TOROUT_STRING("torture_onoff task stopping"); + torture_kthread_stopping("torture_onoff"); return 0; } @@ -370,7 +370,7 @@ static int torture_shuffle(void *arg) torture_shuffle_tasks(); torture_shutdown_absorb("torture_shuffle"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("torture_shuffle task stopping"); + torture_kthread_stopping("torture_shuffle"); return 0; } @@ -465,7 +465,7 @@ static int torture_shutdown(void *arg) jiffies_snap = jiffies; } if (torture_must_stop()) { - VERBOSE_TOROUT_STRING("torture_shutdown task stopping"); + torture_kthread_stopping("torture_shutdown"); return 0; } @@ -583,7 +583,7 @@ static int torture_stutter(void *arg) ACCESS_ONCE(stutter_pause_test) = 0; torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("torture_stutter task stopping"); + torture_kthread_stopping("torture_stutter"); return 0; } @@ -696,3 +696,21 @@ bool torture_must_stop_irq(void) return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP; } EXPORT_SYMBOL_GPL(torture_must_stop_irq); + +/* + * Each kthread must wait for kthread_should_stop() before returning from + * its top-level function, otherwise segfaults ensue. This function + * prints a "stopping" message and waits for kthread_should_stop(), and + * should be called from all torture kthreads immediately prior to + * returning. + */ +void torture_kthread_stopping(char *title) +{ + if (verbose) + VERBOSE_TOROUT_STRING(title); + while (!kthread_should_stop()) { + torture_shutdown_absorb(title); + schedule_timeout_uninterruptible(1); + } +} +EXPORT_SYMBOL_GPL(torture_kthread_stopping); -- cgit v1.2.3 From 47cf29b9e721967aac95ebda9e50408219755852 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Feb 2014 11:52:27 -0800 Subject: rcutorture: Abstract torture_create_kthread() Creation of kthreads is not RCU-specific, so this commit abstracts out torture_create_kthread(), saving a few tens of lines of code in the process. This change requires modifying VERBOSE_TOROUT_ERRSTRING() to take a non-const string, so that _torture_create_kthread() can avoid an open-coded substitute. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 8 +++- kernel/rcu/rcutorture.c | 98 ++++++++++--------------------------------------- kernel/torture.c | 80 +++++++++++++++++----------------------- 3 files changed, 60 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 2ea11094daf6..430cc3008628 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -47,7 +47,7 @@ #define VERBOSE_TOROUT_STRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0) #define VERBOSE_TOROUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) + do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0) /* Definitions for a non-string torture-test module parameter. */ #define torture_parm(type, name, init, msg) \ @@ -89,5 +89,11 @@ bool torture_cleanup(void); bool torture_must_stop(void); bool torture_must_stop_irq(void); void torture_kthread_stopping(char *title); +int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, + char *f, struct task_struct **tp); + +#define torture_create_kthread(n, arg, tp) \ + _torture_create_kthread(n, (arg), #n, "Creating " #n " task", \ + "Failed to create " #n, &(tp)) #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 25e9b16fe7f0..a6f6c8418d87 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1105,19 +1105,9 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - int ret; - if (stall_cpu <= 0) return 0; - VERBOSE_TOROUT_STRING("Creating rcu_torture_stall task"); - stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); - if (IS_ERR(stall_task)) { - ret = PTR_ERR(stall_task); - stall_task = NULL; - return ret; - } - torture_shuffle_task_register(stall_task); - return 0; + return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } /* Clean up after the CPU-stall kthread, if one was spawned. */ @@ -1226,29 +1216,13 @@ static int rcu_torture_barrier_init(void) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); - VERBOSE_TOROUT_STRING("Creating rcu_torture_barrier_cbs task"); - barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, - (void *)(long)i, - "rcu_torture_barrier_cbs"); - if (IS_ERR(barrier_cbs_tasks[i])) { - ret = PTR_ERR(barrier_cbs_tasks[i]); - VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); - barrier_cbs_tasks[i] = NULL; + ret = torture_create_kthread(rcu_torture_barrier_cbs, + (void *)(long)i, + barrier_cbs_tasks[i]); + if (ret) return ret; - } - torture_shuffle_task_register(barrier_cbs_tasks[i]); } - VERBOSE_TOROUT_STRING("Creating rcu_torture_barrier task"); - barrier_task = kthread_run(rcu_torture_barrier, NULL, - "rcu_torture_barrier"); - if (IS_ERR(barrier_task)) { - ret = PTR_ERR(barrier_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier"); - barrier_task = NULL; - return ret; - } - torture_shuffle_task_register(barrier_task); - return 0; + return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task); } /* Clean up after RCU barrier testing. */ @@ -1516,17 +1490,10 @@ rcu_torture_init(void) /* Start up the kthreads. 
*/ - VERBOSE_TOROUT_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_create(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create writer"); - writer_task = NULL; + firsterr = torture_create_kthread(rcu_torture_writer, NULL, + writer_task); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(writer_task); - wake_up_process(writer_task); fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -1535,16 +1502,10 @@ rcu_torture_init(void) goto unwind; } for (i = 0; i < nfakewriters; i++) { - VERBOSE_TOROUT_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_TOROUT_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; + firsterr = torture_create_kthread(rcu_torture_fakewriter, + NULL, fakewriter_tasks[i]); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(fakewriter_tasks[i]); } reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); @@ -1554,28 +1515,16 @@ rcu_torture_init(void) goto unwind; } for (i = 0; i < nrealreaders; i++) { - VERBOSE_TOROUT_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_TOROUT_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; + firsterr = torture_create_kthread(rcu_torture_reader, NULL, + reader_tasks[i]); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(reader_tasks[i]); } if (stat_interval > 0) { - VERBOSE_TOROUT_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); - if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create stats"); - stats_task = NULL; + firsterr = torture_create_kthread(rcu_torture_stats, NULL, + stats_task); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(stats_task); } if (test_no_idle_hz) { firsterr = torture_shuffle_init(shuffle_interval * HZ); @@ -1593,16 +1542,9 @@ rcu_torture_init(void) fqs_duration = 0; if (fqs_duration) { /* Create the fqs thread */ - VERBOSE_TOROUT_STRING("Creating rcu_torture_fqs task"); - fqs_task = kthread_run(rcu_torture_fqs, NULL, - "rcu_torture_fqs"); - if (IS_ERR(fqs_task)) { - firsterr = PTR_ERR(fqs_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create fqs"); - fqs_task = NULL; + torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(fqs_task); } if (test_boost_interval < 1) test_boost_interval = 1; diff --git a/kernel/torture.c b/kernel/torture.c index 330576660cf4..439451821a7f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -180,23 +180,16 @@ torture_onoff(void *arg) */ int torture_onoff_init(long ooholdoff, long oointerval) { -#ifdef CONFIG_HOTPLUG_CPU - int ret; + int ret = 0; +#ifdef CONFIG_HOTPLUG_CPU onoff_holdoff = ooholdoff; onoff_interval = oointerval; if (onoff_interval <= 0) return 0; - VERBOSE_TOROUT_STRING("Creating torture_onoff task"); - onoff_task = kthread_run(torture_onoff, NULL, "torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return 
ret; - } - torture_shuffle_task_register(onoff_task); + ret = torture_create_kthread(torture_onoff, NULL, onoff_task); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ - return 0; + return ret; } EXPORT_SYMBOL_GPL(torture_onoff_init); @@ -379,8 +372,6 @@ static int torture_shuffle(void *arg) */ int torture_shuffle_init(long shuffint) { - int ret; - shuffle_interval = shuffint; shuffle_idle_cpu = -1; @@ -391,17 +382,7 @@ int torture_shuffle_init(long shuffint) } /* Create the shuffler thread */ - VERBOSE_TOROUT_STRING("Creating torture_shuffle task"); - shuffler_task = kthread_run(torture_shuffle, NULL, "torture_shuffle"); - if (IS_ERR(shuffler_task)) { - ret = PTR_ERR(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - VERBOSE_TOROUT_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - return ret; - } - torture_shuffle_task_register(shuffler_task); - return 0; + return torture_create_kthread(torture_shuffle, NULL, shuffler_task); } EXPORT_SYMBOL_GPL(torture_shuffle_init); @@ -483,25 +464,16 @@ static int torture_shutdown(void *arg) */ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) { - int ret; + int ret = 0; shutdown_secs = ssecs; torture_shutdown_hook = cleanup; if (shutdown_secs > 0) { - VERBOSE_TOROUT_STRING("Creating torture_shutdown task"); shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(torture_shutdown, NULL, - "torture_shutdown"); - if (IS_ERR(shutdown_task)) { - ret = PTR_ERR(shutdown_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - return ret; - } - torture_shuffle_task_register(shutdown_task); - wake_up_process(shutdown_task); + ret = torture_create_kthread(torture_shutdown, NULL, + shutdown_task); } - return 0; + return ret; } EXPORT_SYMBOL_GPL(torture_shutdown_init); @@ -595,16 +567,8 @@ int torture_stutter_init(int s) int ret; stutter = s; - VERBOSE_TOROUT_STRING("Creating torture_stutter task"); - stutter_task = kthread_run(torture_stutter, NULL, "torture_stutter"); - if (IS_ERR(stutter_task)) { - ret = PTR_ERR(stutter_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; - return ret; - } - torture_shuffle_task_register(stutter_task); - return 0; + ret = torture_create_kthread(torture_stutter, NULL, stutter_task); + return ret; } EXPORT_SYMBOL_GPL(torture_stutter_init); @@ -714,3 +678,25 @@ void torture_kthread_stopping(char *title) } } EXPORT_SYMBOL_GPL(torture_kthread_stopping); + +/* + * Create a generic torture kthread that is immediately runnable. If you + * need the kthread to be stopped so that you can do something to it before + * it starts, you will need to open-code your own. + */ +int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, + char *f, struct task_struct **tp) +{ + int ret = 0; + + VERBOSE_TOROUT_STRING(m); + *tp = kthread_run(fn, arg, s); + if (IS_ERR(*tp)) { + ret = PTR_ERR(*tp); + VERBOSE_TOROUT_ERRSTRING(f); + *tp = NULL; + } + torture_shuffle_task_register(*tp); + return ret; +} +EXPORT_SYMBOL_GPL(_torture_create_kthread); -- cgit v1.2.3 From 9c029b86098decd4660eec511b8d2d42da3e7dd9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Feb 2014 11:47:08 -0800 Subject: rcutorture: Abstract torture_stop_kthread() Stopping of kthreads is not RCU-specific, so this commit abstracts out torture_stop_kthread(), saving a few lines of code in the process. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 3 +++ kernel/rcu/rcutorture.c | 72 ++++++++++--------------------------------------- kernel/torture.c | 13 +++++++++ 3 files changed, 30 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 430cc3008628..7ccfb0a16728 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -91,9 +91,12 @@ bool torture_must_stop_irq(void); void torture_kthread_stopping(char *title); int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, char *f, struct task_struct **tp); +void _torture_stop_kthread(char *m, struct task_struct **tp); #define torture_create_kthread(n, arg, tp) \ _torture_create_kthread(n, (arg), #n, "Creating " #n " task", \ "Failed to create " #n, &(tp)) +#define torture_stop_kthread(n, tp) \ + _torture_stop_kthread("Stopping " #n " task", &(tp)) #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a6f6c8418d87..37bd4beea198 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1033,14 +1033,12 @@ static void rcutorture_booster_cleanup(int cpu) if (boost_tasks[cpu] == NULL) return; mutex_lock(&boost_mutex); - VERBOSE_TOROUT_STRING("Stopping rcu_torture_boost task"); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; mutex_unlock(&boost_mutex); /* This must be outside of the mutex, otherwise deadlock! */ - kthread_stop(t); - boost_tasks[cpu] = NULL; + torture_stop_kthread(rcu_torture_boost, t); } static int rcutorture_booster_init(int cpu) @@ -1110,16 +1108,6 @@ static int __init rcu_torture_stall_init(void) return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } -/* Clean up after the CPU-stall kthread, if one was spawned. */ -static void rcu_torture_stall_cleanup(void) -{ - if (stall_task == NULL) - return; - VERBOSE_TOROUT_STRING("Stopping rcu_torture_stall_task."); - kthread_stop(stall_task); - stall_task = NULL; -} - /* Callback function for RCU barrier testing. 
*/ void rcu_torture_barrier_cbf(struct rcu_head *rcu) { @@ -1230,19 +1218,11 @@ static void rcu_torture_barrier_cleanup(void) { int i; - if (barrier_task != NULL) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_barrier task"); - kthread_stop(barrier_task); - barrier_task = NULL; - } + torture_stop_kthread(rcu_torture_barrier, barrier_task); if (barrier_cbs_tasks != NULL) { - for (i = 0; i < n_barrier_cbs; i++) { - if (barrier_cbs_tasks[i] != NULL) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_barrier_cbs task"); - kthread_stop(barrier_cbs_tasks[i]); - barrier_cbs_tasks[i] = NULL; - } - } + for (i = 0; i < n_barrier_cbs; i++) + torture_stop_kthread(rcu_torture_barrier_cbs, + barrier_cbs_tasks[i]); kfree(barrier_cbs_tasks); barrier_cbs_tasks = NULL; } @@ -1288,53 +1268,29 @@ rcu_torture_cleanup(void) } rcu_torture_barrier_cleanup(); - rcu_torture_stall_cleanup(); + torture_stop_kthread(rcu_torture_stall, stall_task); torture_stutter_cleanup(); - - if (writer_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; + torture_stop_kthread(rcu_torture_writer, writer_task); if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_TOROUT_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_torture_reader, + reader_tasks[i]); kfree(reader_tasks); - reader_tasks = NULL; } rcu_torture_current = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_TOROUT_STRING( - "Stopping rcu_torture_fakewriter task"); - kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; + torture_stop_kthread(rcu_torture_fakewriter, + fakewriter_tasks[i]); } kfree(fakewriter_tasks); fakewriter_tasks = NULL; } - if (stats_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - if (fqs_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_fqs task"); - kthread_stop(fqs_task); - } - fqs_task = NULL; + torture_stop_kthread(rcu_torture_stats, stats_task); + torture_stop_kthread(rcu_torture_fqs, fqs_task); if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) { unregister_cpu_notifier(&rcutorture_cpu_nb); diff --git a/kernel/torture.c b/kernel/torture.c index 439451821a7f..871f63611f7f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -700,3 +700,16 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, return ret; } EXPORT_SYMBOL_GPL(_torture_create_kthread); + +/* + * Stop a generic kthread, emitting a message. + */ +void _torture_stop_kthread(char *m, struct task_struct **tp) +{ + if (*tp == NULL) + return; + VERBOSE_TOROUT_STRING(m); + kthread_stop(*tp); + *tp = NULL; +} +EXPORT_SYMBOL_GPL(_torture_stop_kthread); -- cgit v1.2.3 From bfefc73aa1d1bad317bccef8a15da39263d3d962 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Feb 2014 12:35:27 -0800 Subject: rcutorture: Stop generic kthreads in torture_cleanup() The specific torture modules (like rcutorture) need to call torture_cleanup() in any case, so this commit makes torture_cleanup() deal with torture_shutdown_cleanup() and torture_stutter_cleanup() so that the specific modules don't have to deal with these details. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 2 -- kernel/rcu/rcutorture.c | 7 ------- kernel/torture.c | 37 +++++++++++++++++++------------------ 3 files changed, 19 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 7ccfb0a16728..b2e2b468e511 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -75,12 +75,10 @@ int torture_shuffle_init(long shuffint); /* Test auto-shutdown handling. */ void torture_shutdown_absorb(const char *title); int torture_shutdown_init(int ssecs, void (*cleanup)(void)); -void torture_shutdown_cleanup(void); /* Task stuttering, which forces load/no-load transitions. */ void stutter_wait(const char *title); int torture_stutter_init(int s); -void torture_stutter_cleanup(void); /* Initialization and cleanup. */ void torture_init_begin(char *ttype, bool v, int *runnable); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 37bd4beea198..40792e76a116 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -53,11 +53,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); -MODULE_ALIAS("rcutorture"); -#ifdef MODULE_PARAM_PREFIX -#undef MODULE_PARAM_PREFIX -#endif -#define MODULE_PARAM_PREFIX "rcutorture." torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); @@ -1269,7 +1264,6 @@ rcu_torture_cleanup(void) rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_stall, stall_task); - torture_stutter_cleanup(); torture_stop_kthread(rcu_torture_writer, writer_task); if (reader_tasks) { @@ -1297,7 +1291,6 @@ rcu_torture_cleanup(void) for_each_possible_cpu(i) rcutorture_booster_cleanup(i); } - torture_shutdown_cleanup(); /* Wait for all RCU callbacks to fire. */ diff --git a/kernel/torture.c b/kernel/torture.c index 871f63611f7f..b26c7b42becd 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -477,20 +477,6 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) } EXPORT_SYMBOL_GPL(torture_shutdown_init); -/* - * Shut down the shutdown task. Say what??? Heh! This can happen if - * the torture module gets an rmmod before the shutdown time arrives. ;-) - */ -void torture_shutdown_cleanup(void) -{ - if (shutdown_task != NULL) { - VERBOSE_TOROUT_STRING("Stopping torture_shutdown task"); - kthread_stop(shutdown_task); - } - shutdown_task = NULL; -} -EXPORT_SYMBOL_GPL(torture_shutdown_cleanup); - /* * Detect and respond to a system shutdown. */ @@ -512,6 +498,20 @@ static struct notifier_block torture_shutdown_nb = { .notifier_call = torture_shutdown_notify, }; +/* + * Shut down the shutdown task. Say what??? Heh! This can happen if + * the torture module gets an rmmod before the shutdown time arrives. ;-) + */ +static void torture_shutdown_cleanup(void) +{ + unregister_reboot_notifier(&torture_shutdown_nb); + if (shutdown_task != NULL) { + VERBOSE_TOROUT_STRING("Stopping torture_shutdown task"); + kthread_stop(shutdown_task); + } + shutdown_task = NULL; +} + /* * Variables for stuttering, which means to periodically pause and * restart testing in order to catch bugs that appear when load is @@ -575,7 +575,7 @@ EXPORT_SYMBOL_GPL(torture_stutter_init); /* * Cleanup after the torture_stutter kthread. 
*/ -void torture_stutter_cleanup(void) +static void torture_stutter_cleanup(void) { if (!stutter_task) return; @@ -583,7 +583,6 @@ void torture_stutter_cleanup(void) kthread_stop(stutter_task); stutter_task = NULL; } -EXPORT_SYMBOL_GPL(torture_stutter_cleanup); /* * Initialize torture module. Please note that this is -not- invoked via @@ -619,7 +618,8 @@ EXPORT_SYMBOL_GPL(torture_init_end); * Clean up torture module. Please note that this is -not- invoked via * the usual module_exit() mechanism, but rather by an explicit call from * the client torture module. Returns true if a race with system shutdown - * is detected. + * is detected, otherwise, all kthreads started by functions in this file + * will be shut down. * * This must be called before the caller starts shutting down its own * kthreads. @@ -635,8 +635,9 @@ bool torture_cleanup(void) } ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&torture_shutdown_nb); + torture_shutdown_cleanup(); torture_shuffle_cleanup(); + torture_stutter_cleanup(); torture_onoff_cleanup(); return false; } -- cgit v1.2.3 From ff57cd5863cf3014c1c5ed62ce2715294f065b17 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 21 Feb 2014 19:14:11 +0100 Subject: fsnotify: Allocate overflow events with proper type Commit 7053aee26a35 "fsnotify: do not share events between notification groups" used an overflow event statically allocated in the group, with the size of the generic notification event. This causes problems because some code looks at type-specific parts of the event structure and gets confused by the random data it sees there, which causes crashes. Fix the problem by allocating the overflow event with a type corresponding to the group type so that code cannot get confused. Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 13 +++++++++++++ fs/notify/group.c | 8 +++++++- fs/notify/inotify/inotify_user.c | 12 ++++++++++++ fs/notify/notification.c | 4 ++-- include/linux/fsnotify_backend.h | 2 +- 5 files changed, 35 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index b6175fa11bf8..287a22c04149 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -698,6 +698,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) struct fsnotify_group *group; int f_flags, fd; struct user_struct *user; + struct fanotify_event_info *oevent; pr_debug("%s: flags=%d event_f_flags=%d\n", __func__, flags, event_f_flags); @@ -730,8 +731,20 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); + oevent = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + if (unlikely(!oevent)) { + fd = -ENOMEM; + goto out_destroy_group; + } + group->overflow_event = &oevent->fse; + fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); + oevent->tgid = get_pid(task_tgid(current)); + oevent->path.mnt = NULL; + oevent->path.dentry = NULL; + group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + oevent->response = 0; mutex_init(&group->fanotify_data.access_mutex); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); diff --git a/fs/notify/group.c b/fs/notify/group.c index ee674fe2cec7..ad1995980456 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -55,6 +55,13 @@ void 
fsnotify_destroy_group(struct fsnotify_group *group) /* clear the notification queue of all events */ fsnotify_flush_notify(group); + /* + * Destroy overflow event (we cannot use fsnotify_destroy_event() as + * that deliberately ignores overflow events. + */ + if (group->overflow_event) + group->ops->free_event(group->overflow_event); + fsnotify_put_group(group); } @@ -99,7 +106,6 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) INIT_LIST_HEAD(&group->marks_list); group->ops = ops; - fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW); return group; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 6528b5a54ca0..78a2ca3966c3 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -633,11 +633,23 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod static struct fsnotify_group *inotify_new_group(unsigned int max_events) { struct fsnotify_group *group; + struct inotify_event_info *oevent; group = fsnotify_alloc_group(&inotify_fsnotify_ops); if (IS_ERR(group)) return group; + oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL); + if (unlikely(!oevent)) { + fsnotify_destroy_group(group); + return ERR_PTR(-ENOMEM); + } + group->overflow_event = &oevent->fse; + fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); + oevent->wd = -1; + oevent->sync_cookie = 0; + oevent->name_len = 0; + group->max_events = max_events; spin_lock_init(&group->inotify_data.idr_lock); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 6a4ba17c0395..1e58402171a5 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -98,11 +98,11 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, if (group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ - if (!list_empty(&group->overflow_event.list)) { + if (!list_empty(&group->overflow_event->list)) { mutex_unlock(&group->notification_mutex); return ret; } - event = &group->overflow_event; + event = group->overflow_event; goto queue; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index c84bc7c2bfc8..64cf3ef50696 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -160,7 +160,7 @@ struct fsnotify_group { struct fasync_struct *fsn_fa; /* async notification */ - struct fsnotify_event overflow_event; /* Event we queue when the + struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ -- cgit v1.2.3 From fed95bab8d29b928fcf6225be72d37ded452e8a2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 25 Feb 2014 19:28:44 +0800 Subject: sysfs: fix namespace refcnt leak As mount() and kill_sb() is not a one-to-one match, we shoudn't get ns refcnt unconditionally in sysfs_mount(), and instead we should get the refcnt only when kernfs_mount() allocated a new superblock. v2: - Changed the name of the new argument, suggested by Tejun. - Made the argument optional, suggested by Tejun. v3: - Make the new argument as second-to-last arg, suggested by Tejun. 
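For illustration only (a sketch, not part of the patch itself): with the extra argument, the caller-side pattern in sysfs_mount() becomes "grab the namespace reference up front, then drop it again whenever kernfs_mount_ns() did not actually create a new superblock". Using sysfs_mount()'s own fs_type and flags parameters, that is roughly:

	bool new_sb;
	void *ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
	struct dentry *root;

	root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns);
	/* only a freshly created superblock keeps the ns reference */
	if (IS_ERR(root) || !new_sb)
		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
	return root;
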
Signed-off-by: Li Zefan Acked-by: Tejun Heo --- fs/kernfs/mount.c | 8 +++++++- fs/sysfs/mount.c | 5 +++-- include/linux/kernfs.h | 9 +++++---- 3 files changed, 15 insertions(+), 7 deletions(-) Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/mount.c | 8 +++++++- fs/sysfs/mount.c | 5 +++-- include/linux/kernfs.h | 9 +++++---- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 0d6ce895a9ee..0f4152defe7b 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -94,6 +94,7 @@ const void *kernfs_super_ns(struct super_block *sb) * @fs_type: file_system_type of the fs being mounted * @flags: mount flags specified for the mount * @root: kernfs_root of the hierarchy being mounted + * @new_sb_created: tell the caller if we allocated a new superblock * @ns: optional namespace tag of the mount * * This is to be called from each kernfs user's file_system_type->mount() @@ -104,7 +105,8 @@ const void *kernfs_super_ns(struct super_block *sb) * The return value can be passed to the vfs layer verbatim. */ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, const void *ns) + struct kernfs_root *root, bool *new_sb_created, + const void *ns) { struct super_block *sb; struct kernfs_super_info *info; @@ -122,6 +124,10 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, kfree(info); if (IS_ERR(sb)) return ERR_CAST(sb); + + if (new_sb_created) + *new_sb_created = !sb->s_root; + if (!sb->s_root) { error = kernfs_fill_super(sb); if (error) { diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 6211230814fd..3eaf5c6622eb 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -27,6 +27,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, { struct dentry *root; void *ns; + bool new_sb; if (!(flags & MS_KERNMOUNT)) { if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) @@ -37,8 +38,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, } ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, ns); - if (IS_ERR(root)) + root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns); + if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); return root; } diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5be9f0228a3b..d267623c28cf 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -249,7 +249,8 @@ void kernfs_notify(struct kernfs_node *kn); const void *kernfs_super_ns(struct super_block *sb); struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, const void *ns); + struct kernfs_root *root, bool *new_sb_created, + const void *ns); void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); @@ -317,7 +318,7 @@ static inline const void *kernfs_super_ns(struct super_block *sb) static inline struct dentry * kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, const void *ns) + struct kernfs_root *root, bool *new_sb_created, const void *ns) { return ERR_PTR(-ENOSYS); } static inline void kernfs_kill_sb(struct super_block *sb) { } @@ -368,9 +369,9 @@ static inline int kernfs_remove_by_name(struct kernfs_node *parent, static inline struct dentry * kernfs_mount(struct file_system_type *fs_type, int flags, - struct kernfs_root *root) + struct kernfs_root *root, bool *new_sb_created) { - return kernfs_mount_ns(fs_type, flags, root, NULL); + return 
kernfs_mount_ns(fs_type, flags, root, new_sb_created, NULL); } #endif /* __LINUX_KERNFS_H */ -- cgit v1.2.3 From f3713fd9cff733d9df83116422d8e4af6e86b2bb Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 25 Feb 2014 15:01:45 -0800 Subject: ipc,mqueue: remove limits for the amount of system-wide queues Commit 93e6f119c0ce ("ipc/mqueue: cleanup definition names and locations") added global hardcoded limits to the amount of message queues that can be created. While these limits are per-namespace, reality is that it ends up breaking userspace applications. Historically users have, at least in theory, been able to create up to INT_MAX queues, and limiting it to just 1024 is way too low and dramatic for some workloads and use cases. For instance, Madars reports: "This update imposes bad limits on our multi-process application. As our app uses approaches that each process opens its own set of queues (usually something about 3-5 queues per process). In some scenarios we might run up to 3000 processes or more (which of-course for linux is not a problem). Thus we might need up to 9000 queues or more. All processes run under one user." Other affected users can be found in launchpad bug #1155695: https://bugs.launchpad.net/ubuntu/+source/manpages/+bug/1155695 Instead of increasing this limit, revert it entirely and fallback to the original way of dealing queue limits -- where once a user's resource limit is reached, and all memory is used, new queues cannot be created. Signed-off-by: Davidlohr Bueso Reported-by: Madars Vitolins Acked-by: Doug Ledford Cc: Manfred Spraul Cc: [3.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 -- ipc/mq_sysctl.c | 18 ++++++++++++------ ipc/mqueue.c | 6 +++--- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e7831d203737..35e7eca4e33b 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -118,9 +118,7 @@ extern int mq_init_ns(struct ipc_namespace *ns); * the new maximum will handle anyone else. I may have to revisit this * in the future. 
*/ -#define MIN_QUEUESMAX 1 #define DFLT_QUEUESMAX 256 -#define HARD_QUEUESMAX 1024 #define MIN_MSGMAX 1 #define DFLT_MSG 10U #define DFLT_MSGMAX 10 diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 383d638340b8..5bb8bfe67149 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -22,6 +22,16 @@ static void *get_mq(ctl_table *table) return which; } +static int proc_mq_dointvec(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table mq_table; + memcpy(&mq_table, table, sizeof(mq_table)); + mq_table.data = get_mq(table); + + return proc_dointvec(&mq_table, write, buffer, lenp, ppos); +} + static int proc_mq_dointvec_minmax(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -33,12 +43,10 @@ static int proc_mq_dointvec_minmax(ctl_table *table, int write, lenp, ppos); } #else +#define proc_mq_dointvec NULL #define proc_mq_dointvec_minmax NULL #endif -static int msg_queues_limit_min = MIN_QUEUESMAX; -static int msg_queues_limit_max = HARD_QUEUESMAX; - static int msg_max_limit_min = MIN_MSGMAX; static int msg_max_limit_max = HARD_MSGMAX; @@ -51,9 +59,7 @@ static ctl_table mq_sysctls[] = { .data = &init_ipc_ns.mq_queues_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, - .extra1 = &msg_queues_limit_min, - .extra2 = &msg_queues_limit_max, + .proc_handler = proc_mq_dointvec, }, { .procname = "msg_max", diff --git a/ipc/mqueue.c b/ipc/mqueue.c index ccf1f9fd263a..c3b31179122c 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -433,9 +433,9 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry, error = -EACCES; goto out_unlock; } - if (ipc_ns->mq_queues_count >= HARD_QUEUESMAX || - (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max && - !capable(CAP_SYS_RESOURCE))) { + + if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max && + !capable(CAP_SYS_RESOURCE)) { error = -ENOSPC; goto out_unlock; } -- cgit v1.2.3 From 7a754743185a4b05818e10058fa2fbe4e6969085 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 11 Feb 2014 16:10:12 -0500 Subject: rcu: Fix sparse warning for rcu_expedited from kernel/ksysfs.c This commit fixes the follwoing warning: kernel/ksysfs.c:143:5: warning: symbol 'rcu_expedited' was not declared. Should it be static? Signed-off-by: Paul Gortmaker [ paulmck: Moved the declaration to include/linux/rcupdate.h to avoid including the RCU-internal rcu.h file outside of RCU. ] Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 1 + kernel/ksysfs.c | 2 ++ kernel/rcu/rcu.h | 2 -- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 32decf1a9c6c..f3706c6b2e21 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -46,6 +46,7 @@ #include #include +extern int rcu_expedited; /* for sysctl */ #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index d945a949760f..e660964086e2 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -19,6 +19,8 @@ #include #include +#include /* rcu_expedited */ + #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 1bd787fddcb2..af2e60a8425d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -116,8 +116,6 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) } } -extern int rcu_expedited; - #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_suppress; -- cgit v1.2.3 From f207dbe63c61158c234e2e8929a3725a7f6b2b9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Feb 2014 12:20:31 +0100 Subject: Revert "sched/wait: Suppress Sparse 'variable shadowing' warning" This reverts commit 980f88e414418bf65569a3b62b08b07e6fc2f4c6. This warning is actually useful, don't suppress it. We actually rely on the shadowing for ___wait_cond_timeout(). We further used the __ret variable in __wait_event_timeout()'s cmd argument: __ret = schedule_timeout(__ret). That now explicitly uses the wrong __ret. Reported-by: Gregory CLEMENT Requested-by: Andrew Morton Requested-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-Q5blhuqqzwgVwvjf1gszrdol@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index c55ea5c24404..559044c79232 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -195,7 +195,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); ({ \ __label__ __out; \ wait_queue_t __wait; \ - long ___ret = ret; \ + long __ret = ret; \ \ INIT_LIST_HEAD(&__wait.task_list); \ if (exclusive) \ @@ -210,7 +210,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); break; \ \ if (___wait_is_interruptible(state) && __int) { \ - ___ret = __int; \ + __ret = __int; \ if (exclusive) { \ abort_exclusive_wait(&wq, &__wait, \ state, NULL); \ @@ -222,7 +222,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); cmd; \ } \ finish_wait(&wq, &__wait); \ -__out: ___ret; \ +__out: __ret; \ }) #define __wait_event(wq, condition) \ -- cgit v1.2.3 From 6f285b19d09f72e801525f5eea1bdad22e559bf0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2014 19:44:55 -0800 Subject: audit: Send replies in the proper network namespace. In perverse cases of file descriptor passing the current network namespace of a process and the network namespace of a socket used by that socket may differ. Therefore use the network namespace of the appropiate socket to ensure replies always go to the appropiate socket. Signed-off-by: "Eric W. 
Biederman" --- include/linux/audit.h | 3 ++- kernel/audit.c | 21 ++++++++++----------- kernel/auditfilter.c | 7 +++++-- 3 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index aa865a9a4c4f..ec1464df4c60 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -43,6 +43,7 @@ struct mq_attr; struct mqstat; struct audit_watch; struct audit_tree; +struct sk_buff; struct audit_krule { int vers_ops; @@ -463,7 +464,7 @@ extern int audit_filter_user(int type); extern int audit_filter_type(int type); extern int audit_rule_change(int type, __u32 portid, int seq, void *data, size_t datasz); -extern int audit_list_rules_send(__u32 portid, int seq); +extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern u32 audit_enabled; #else /* CONFIG_AUDIT */ diff --git a/kernel/audit.c b/kernel/audit.c index 1e5756f16f6f..32086bff5564 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -570,9 +570,11 @@ static int audit_send_reply_thread(void *arg) * Allocates an skb, builds the netlink message, and sends it to the port id. * No failure notifications. */ -static void audit_send_reply(__u32 portid, int seq, int type, int done, +static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct sk_buff *skb; struct task_struct *tsk; struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), @@ -585,7 +587,7 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; - reply->net = get_net(current->nsproxy->net_ns); + reply->net = get_net(net); reply->portid = portid; reply->skb = skb; @@ -675,8 +677,7 @@ static int audit_get_feature(struct sk_buff *skb) seq = nlmsg_hdr(skb)->nlmsg_seq; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &af, sizeof(af)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); return 0; } @@ -796,8 +797,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.backlog = skb_queue_len(&audit_skb_queue); s.version = AUDIT_VERSION_LATEST; s.backlog_wait_time = audit_backlog_wait_time; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { @@ -907,7 +907,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq, data, nlmsg_len(nlh)); break; case AUDIT_LIST_RULES: - err = audit_list_rules_send(NETLINK_CB(skb).portid, seq); + err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); @@ -972,8 +972,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, - 0, 0, sig_data, sizeof(*sig_data) + len); + audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, + sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -985,8 +985,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock(&tsk->sighand->siglock); - audit_send_reply(NETLINK_CB(skb).portid, seq, - AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { diff 
--git a/kernel/auditfilter.c b/kernel/auditfilter.c index a5e3d73d73e4..e8d1c7c515d7 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "audit.h" /* @@ -1069,8 +1070,10 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, * @portid: target portid for netlink audit messages * @seq: netlink audit message sequence (serial) number */ -int audit_list_rules_send(__u32 portid, int seq) +int audit_list_rules_send(struct sk_buff *request_skb, int seq) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; int err = 0; @@ -1084,7 +1087,7 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; - dest->net = get_net(current->nsproxy->net_ns); + dest->net = get_net(net); dest->portid = portid; skb_queue_head_init(&dest->q); -- cgit v1.2.3 From b7e63a1079b266866a732cf699d8c4d61391bbda Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 26 Feb 2014 11:19:14 -0800 Subject: NFSv4: Fix another nfs4_sequence corruptor nfs4_release_lockowner needs to set the rpc_message reply to point to the nfs4_sequence_res in order to avoid another Oopsable situation in nfs41_assign_slot. Fixes: fbd4bfd1d9d21 (NFS: Add nfs4_sequence calls for RELEASE_LOCKOWNER) Cc: stable@vger.kernel.org # 3.12+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 +++++----- include/linux/nfs_xdr.h | 5 +++++ 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2da6a698b8f7..44e088dc357c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5828,8 +5828,7 @@ struct nfs_release_lockowner_data { struct nfs4_lock_state *lsp; struct nfs_server *server; struct nfs_release_lockowner_args args; - struct nfs4_sequence_args seq_args; - struct nfs4_sequence_res seq_res; + struct nfs_release_lockowner_res res; unsigned long timestamp; }; @@ -5837,7 +5836,7 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata { struct nfs_release_lockowner_data *data = calldata; nfs40_setup_sequence(data->server, - &data->seq_args, &data->seq_res, task); + &data->args.seq_args, &data->res.seq_res, task); data->timestamp = jiffies; } @@ -5846,7 +5845,7 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata) struct nfs_release_lockowner_data *data = calldata; struct nfs_server *server = data->server; - nfs40_sequence_done(task, &data->seq_res); + nfs40_sequence_done(task, &data->res.seq_res); switch (task->tk_status) { case 0: @@ -5887,7 +5886,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st data = kmalloc(sizeof(*data), GFP_NOFS); if (!data) return -ENOMEM; - nfs4_init_sequence(&data->seq_args, &data->seq_res, 0); data->lsp = lsp; data->server = server; data->args.lock_owner.clientid = server->nfs_client->cl_clientid; @@ -5895,6 +5893,8 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st data->args.lock_owner.s_dev = server->s_dev; msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); return 0; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b2fb167b2e6d..5624e4e2763c 100644 --- 
a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -467,9 +467,14 @@ struct nfs_lockt_res { }; struct nfs_release_lockowner_args { + struct nfs4_sequence_args seq_args; struct nfs_lowner lock_owner; }; +struct nfs_release_lockowner_res { + struct nfs4_sequence_res seq_res; +}; + struct nfs4_delegreturnargs { struct nfs4_sequence_args seq_args; const struct nfs_fh *fhandle; -- cgit v1.2.3 From 03b8c7b623c80af264c4c8d6111e5c6289933666 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 2 Mar 2014 13:09:47 +0100 Subject: futex: Allow architectures to skip futex_atomic_cmpxchg_inatomic() test If an architecture has futex_atomic_cmpxchg_inatomic() implemented and there is no runtime check necessary, allow to skip the test within futex_init(). This allows to get rid of some code which would always give the same result, and also allows the compiler to optimize a couple of if statements away. Signed-off-by: Heiko Carstens Cc: Finn Thain Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20140302120947.GA3641@osiris Signed-off-by: Thomas Gleixner --- arch/s390/Kconfig | 1 + include/linux/futex.h | 4 ++++ init/Kconfig | 7 +++++++ kernel/futex.c | 37 ++++++++++++++++++++++++------------- 4 files changed, 36 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 65a07750f4f9..bb74b21f007a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -117,6 +117,7 @@ config S390 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST + select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 diff --git a/include/linux/futex.h b/include/linux/futex.h index b0d95cac826e..6435f46d6e13 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -55,7 +55,11 @@ union futex_key { #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); extern void exit_pi_state_list(struct task_struct *curr); +#ifdef CONFIG_HAVE_FUTEX_CMPXCHG +#define futex_cmpxchg_enabled 1 +#else extern int futex_cmpxchg_enabled; +#endif #else static inline void exit_robust_list(struct task_struct *curr) { diff --git a/init/Kconfig b/init/Kconfig index 009a797dd242..d56cb03c1b49 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1387,6 +1387,13 @@ config FUTEX support for "fast userspace mutexes". The resulting kernel may not run glibc-based applications correctly. +config HAVE_FUTEX_CMPXCHG + bool + help + Architectures should select this if futex_atomic_cmpxchg_inatomic() + is implemented and always working. This removes a couple of runtime + checks. + config EPOLL bool "Enable eventpoll support" if EXPERT default y diff --git a/kernel/futex.c b/kernel/futex.c index 44a1261cb9ff..5d17e3a83f8c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -157,7 +157,9 @@ * enqueue. */ +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG int __read_mostly futex_cmpxchg_enabled; +#endif /* * Futex flags used to encode options to functions and preserve them across @@ -2843,9 +2845,28 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -static int __init futex_init(void) +static void __init futex_detect_cmpxchg(void) { +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG u32 curval; + + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. 
Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non-functional ones will return + * -ENOSYS. + */ + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) + futex_cmpxchg_enabled = 1; +#endif +} + +static int __init futex_init(void) +{ unsigned int futex_shift; unsigned long i; @@ -2861,18 +2882,8 @@ static int __init futex_init(void) &futex_shift, NULL, futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; - /* - * This will fail and we want it. Some arch implementations do - * runtime detection of the futex_atomic_cmpxchg_inatomic() - * functionality. We want to know that before we call in any - * of the complex code paths. Also we want to prevent - * registration of robust lists in that case. NULL is - * guaranteed to fault and we get -EFAULT on functional - * implementation, the non-functional ones will return - * -ENOSYS. - */ - if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) - futex_cmpxchg_enabled = 1; + + futex_detect_cmpxchg(); for (i = 0; i < futex_hashsize; i++) { plist_head_init(&futex_queues[i].chain); -- cgit v1.2.3 From 45ab2813d40d88fc575e753c38478de242d03f88 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 26 Feb 2014 13:37:38 -0500 Subject: tracing: Do not add event files for modules that fail tracepoints If a module fails to add its tracepoints due to module tainting, do not create the module event infrastructure in the debugfs directory. As the events will not work and worse yet, they will silently fail, making the user wonder why the events they enable do not display anything. Having a warning on module load and the events not visible to the users will make the cause of the problem much clearer. 
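For reference, a sketch of the new gate (the actual hunks follow below): tracepoint registration and trace event file creation now bail out on the same taint test,

	/* any taint other than OOT_MODULE or CRAP disqualifies the module */
	bool trace_module_has_bad_taint(struct module *mod)
	{
		return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP));
	}

so a tainted module that fails tracepoint_module_coming() also never gets a debugfs events directory.
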
Link: http://lkml.kernel.org/r/20140227154923.265882695@goodmis.org Fixes: 6d723736e472 "tracing/events: add support for modules to TRACE_EVENT" Acked-by: Mathieu Desnoyers Cc: stable@vger.kernel.org # 2.6.31+ Cc: Rusty Russell Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 6 ++++++ kernel/trace/trace_events.c | 10 ++++++++++ kernel/tracepoint.c | 7 ++++++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index accc497f8d72..7159a0a933df 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -60,6 +60,12 @@ struct tp_module { unsigned int num_tracepoints; struct tracepoint * const *tracepoints_ptrs; }; +bool trace_module_has_bad_taint(struct module *mod); +#else +static inline bool trace_module_has_bad_taint(struct module *mod) +{ + return false; +} #endif /* CONFIG_MODULES */ struct tracepoint_iter { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e71ffd4eccb5..f3989ceb5cd5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1777,6 +1777,16 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_event_call **call, **start, **end; + if (!mod->num_trace_events) + return; + + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); + return; + } + start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f26540e9c9..031cc5655a51 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES +bool trace_module_has_bad_taint(struct module *mod) +{ + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); +} + static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod, *iter; @@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod) * module headers (for forced load), to make sure we don't cause a crash. * Staging and out-of-tree GPL modules are fine. */ - if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) + if (trace_module_has_bad_taint(mod)) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); -- cgit v1.2.3 From 0473c9b5f05948df780bbc7b996dd7aefc4ec41d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 3 Mar 2014 10:44:03 +0100 Subject: compat: let architectures define __ARCH_WANT_COMPAT_SYS_GETDENTS64 For architecture dependent compat syscalls in common code an architecture must define something like __ARCH_WANT_ if it wants to use the code. This however is not true for compat_sys_getdents64 for which architectures must define __ARCH_OMIT_COMPAT_SYS_GETDENTS64 if they do not want the code. This leads to the situation where all architectures, except mips, get the compat code but only x86_64, arm64 and the generic syscall architectures actually use it. So invert the logic, so that architectures actively must do something to get the compat code. This way a couple of architectures get rid of otherwise dead code. 
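As an illustration (hypothetical architecture, not part of this patch), opting in is now a single define in the architecture's asm/unistd.h, mirroring the arm64 and x86 hunks below:

	/* arch/foo/include/asm/unistd.h */
	#ifdef CONFIG_COMPAT
	#define __ARCH_WANT_COMPAT_SYS_GETDENTS64
	#endif

Architectures that define nothing simply no longer build compat_sys_getdents64().
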
Signed-off-by: Heiko Carstens --- arch/arm64/include/asm/unistd.h | 1 + arch/mips/include/asm/unistd.h | 1 - arch/x86/include/asm/unistd.h | 1 + fs/compat.c | 4 ++-- include/linux/compat.h | 2 ++ include/uapi/asm-generic/unistd.h | 1 + 6 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 82ce217e94cf..a4654c656a1e 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -14,6 +14,7 @@ * along with this program. If not, see . */ #ifdef CONFIG_COMPAT +#define __ARCH_WANT_COMPAT_SYS_GETDENTS64 #define __ARCH_WANT_COMPAT_STAT64 #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index 4d3b92886665..413d6c612bec 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -24,7 +24,6 @@ #ifndef __ASSEMBLY__ -#define __ARCH_OMIT_COMPAT_SYS_GETDENTS64 #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index c2a48139c340..f4b5795d7e34 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -23,6 +23,7 @@ # include # include # define __ARCH_WANT_COMPAT_SYS_TIME +# define __ARCH_WANT_COMPAT_SYS_GETDENTS64 # endif diff --git a/fs/compat.c b/fs/compat.c index 6af20de2c1a3..0095a6978eef 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -981,7 +981,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd, return error; } -#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 +#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64 struct compat_getdents_callback64 { struct dir_context ctx; @@ -1066,7 +1066,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, fdput(f); return error; } -#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ +#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */ /* * Exactly like fs/open.c:sys_open(), except that it doesn't set the diff --git a/include/linux/compat.h b/include/linux/compat.h index 3f448c65511b..beded18f992d 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -502,9 +502,11 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd, asmlinkage long compat_sys_getdents(unsigned int fd, struct compat_linux_dirent __user *dirent, unsigned int count); +#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64 asmlinkage long compat_sys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, unsigned int count); +#endif asmlinkage long compat_sys_vmsplice(int fd, const struct compat_iovec __user *, unsigned int nr_segs, unsigned int flags); asmlinkage long compat_sys_open(const char __user *filename, int flags, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index dde8041f40d2..6db66783d268 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -191,6 +191,7 @@ __SYSCALL(__NR_quotactl, sys_quotactl) /* fs/readdir.c */ #define __NR_getdents64 61 +#define __ARCH_WANT_COMPAT_SYS_GETDENTS64 __SC_COMP(__NR_getdents64, sys_getdents64, compat_sys_getdents64) /* fs/read_write.c */ -- cgit v1.2.3 From 217f4433fc2fe768a7f13f7e5586333bb8280e9e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 26 Feb 2014 10:56:20 +0100 Subject: compat: add COMPAT_SYSCALL_DEFINE0 macro For consistency reason add a COMPAT_SYSCALL_DEFINE0 macro. This macro should be used for compat system calls with zero parameters. 
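A minimal usage sketch (hypothetical syscall name, not taken from any real caller): the macro expands to nothing more than the compat_sys_##name() prototype, so a definition site reads

	COMPAT_SYSCALL_DEFINE0(example)
	{
		/* expands to: asmlinkage long compat_sys_example(void) { ... } */
		return 0;
	}

which keeps zero-parameter compat syscalls consistent with the existing COMPAT_SYSCALL_DEFINE1..6 definitions.
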
Signed-off-by: Heiko Carstens --- include/linux/compat.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index beded18f992d..4c42df3fce37 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -27,6 +27,9 @@ #define __SC_DELOUSE(t,v) ((t)(unsigned long)(v)) #endif +#define COMPAT_SYSCALL_DEFINE0(name) \ + asmlinkage long compat_sys_##name(void) + #define COMPAT_SYSCALL_DEFINE1(name, ...) \ COMPAT_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE2(name, ...) \ -- cgit v1.2.3 From ab4f8bba19323eb78b7473df42b225eb14090fcc Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 1 Mar 2014 13:45:03 +0100 Subject: s390/compat: automatic zero, sign and pointer conversion of syscalls Instead of explicitly changing compat system call parameters from e.g. unsigned long to compat_ulong_t let the COMPAT_SYSCALL_WRAP macros automatically detect (unsigned) long parameters and zero and sign extend them automatically. The resulting binary is completely identical. In addition add a sys_[system call name] prototype for each system call wrapper. This will cause compile errors if the prototype does not match the prototype in include/linux/syscall.h. Therefore we should now always get the correct zero and sign extension of system call parameters. Pointers are handled like before. Signed-off-by: Heiko Carstens --- arch/s390/kernel/compat_wrap.c | 134 +++++++++++++++++++++++------------------ include/linux/syscalls.h | 2 + 2 files changed, 77 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kernel/compat_wrap.c b/arch/s390/kernel/compat_wrap.c index 17600b15ae3a..d6a2cac1af12 100644 --- a/arch/s390/kernel/compat_wrap.c +++ b/arch/s390/kernel/compat_wrap.c @@ -15,11 +15,27 @@ #define COMPAT_SYSCALL_WRAP6(name, ...) \ COMPAT_SYSCALL_WRAPx(6, _##name, __VA_ARGS__) +#define __SC_COMPAT_CAST(t, a) \ +({ \ + long __ReS = a; \ + \ + BUILD_BUG_ON((sizeof(t) > 4) && !__TYPE_IS_L(t) && \ + !__TYPE_IS_UL(t) && !__TYPE_IS_PTR(t)); \ + if (__TYPE_IS_L(t)) \ + __ReS = (s32)a; \ + if (__TYPE_IS_UL(t)) \ + __ReS = (u32)a; \ + if (__TYPE_IS_PTR(t)) \ + __ReS = a & 0x7fffffff; \ + (t)__ReS; \ +}) + #define COMPAT_SYSCALL_WRAPx(x, name, ...) 
\ + asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ asmlinkage long compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ - return sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ + return sys##name(__MAP(x,__SC_COMPAT_CAST,__VA_ARGS__)); \ } COMPAT_SYSCALL_WRAP1(exit, int, error_code); @@ -40,22 +56,22 @@ COMPAT_SYSCALL_WRAP2(mkdir, const char __user *, pathname, umode_t, mode); COMPAT_SYSCALL_WRAP1(rmdir, const char __user *, pathname); COMPAT_SYSCALL_WRAP1(dup, unsigned int, fildes); COMPAT_SYSCALL_WRAP1(pipe, int __user *, fildes); -COMPAT_SYSCALL_WRAP1(brk, compat_ulong_t, brk); +COMPAT_SYSCALL_WRAP1(brk, unsigned long, brk); COMPAT_SYSCALL_WRAP2(signal, int, sig, __sighandler_t, handler); COMPAT_SYSCALL_WRAP1(acct, const char __user *, name); COMPAT_SYSCALL_WRAP2(umount, char __user *, name, int, flags); -COMPAT_SYSCALL_WRAP2(setpgid, compat_pid_t, pid, compat_pid_t, pgid); +COMPAT_SYSCALL_WRAP2(setpgid, pid_t, pid, pid_t, pgid); COMPAT_SYSCALL_WRAP1(umask, int, mask); COMPAT_SYSCALL_WRAP1(chroot, const char __user *, filename); COMPAT_SYSCALL_WRAP2(dup2, unsigned int, oldfd, unsigned int, newfd); -COMPAT_SYSCALL_WRAP3(sigsuspend, int, unused1, int, unused2, compat_old_sigset_t, mask); +COMPAT_SYSCALL_WRAP3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask); COMPAT_SYSCALL_WRAP2(sethostname, char __user *, name, int, len); COMPAT_SYSCALL_WRAP2(symlink, const char __user *, old, const char __user *, new); COMPAT_SYSCALL_WRAP3(readlink, const char __user *, path, char __user *, buf, int, bufsiz); COMPAT_SYSCALL_WRAP1(uselib, const char __user *, library); COMPAT_SYSCALL_WRAP2(swapon, const char __user *, specialfile, int, swap_flags); COMPAT_SYSCALL_WRAP4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg); -COMPAT_SYSCALL_WRAP2(munmap, compat_ulong_t, addr, compat_size_t, len); +COMPAT_SYSCALL_WRAP2(munmap, unsigned long, addr, size_t, len); COMPAT_SYSCALL_WRAP2(fchmod, unsigned int, fd, umode_t, mode); COMPAT_SYSCALL_WRAP2(getpriority, int, which, int, who); COMPAT_SYSCALL_WRAP3(setpriority, int, which, int, who, int, niceval); @@ -64,63 +80,63 @@ COMPAT_SYSCALL_WRAP1(swapoff, const char __user *, specialfile); COMPAT_SYSCALL_WRAP1(fsync, unsigned int, fd); COMPAT_SYSCALL_WRAP2(setdomainname, char __user *, name, int, len); COMPAT_SYSCALL_WRAP1(newuname, struct new_utsname __user *, name); -COMPAT_SYSCALL_WRAP3(mprotect, compat_ulong_t, start, compat_size_t, len, compat_ulong_t, prot); -COMPAT_SYSCALL_WRAP3(init_module, void __user *, umod, compat_ulong_t, len, const char __user *, uargs); +COMPAT_SYSCALL_WRAP3(mprotect, unsigned long, start, size_t, len, unsigned long, prot); +COMPAT_SYSCALL_WRAP3(init_module, void __user *, umod, unsigned long, len, const char __user *, uargs); COMPAT_SYSCALL_WRAP2(delete_module, const char __user *, name_user, unsigned int, flags); COMPAT_SYSCALL_WRAP4(quotactl, unsigned int, cmd, const char __user *, special, qid_t, id, void __user *, addr); -COMPAT_SYSCALL_WRAP1(getpgid, compat_pid_t, pid); +COMPAT_SYSCALL_WRAP1(getpgid, pid_t, pid); COMPAT_SYSCALL_WRAP1(fchdir, unsigned int, fd); -COMPAT_SYSCALL_WRAP2(bdflush, int, func, compat_long_t, data); -COMPAT_SYSCALL_WRAP3(sysfs, int, option, compat_ulong_t, arg1, compat_ulong_t, arg2); +COMPAT_SYSCALL_WRAP2(bdflush, int, func, long, data); +COMPAT_SYSCALL_WRAP3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2); COMPAT_SYSCALL_WRAP1(s390_personality, unsigned int, personality); 
-COMPAT_SYSCALL_WRAP5(llseek, unsigned int, fd, u32, high, u32, low, loff_t __user *, result, unsigned int, whence); +COMPAT_SYSCALL_WRAP5(llseek, unsigned int, fd, unsigned long, high, unsigned long, low, loff_t __user *, result, unsigned int, whence); COMPAT_SYSCALL_WRAP2(flock, unsigned int, fd, unsigned int, cmd); -COMPAT_SYSCALL_WRAP3(msync, compat_ulong_t, start, compat_size_t, len, int, flags); -COMPAT_SYSCALL_WRAP1(getsid, compat_pid_t, pid); +COMPAT_SYSCALL_WRAP3(msync, unsigned long, start, size_t, len, int, flags); +COMPAT_SYSCALL_WRAP1(getsid, pid_t, pid); COMPAT_SYSCALL_WRAP1(fdatasync, unsigned int, fd); -COMPAT_SYSCALL_WRAP2(mlock, compat_ulong_t, start, compat_size_t, len); -COMPAT_SYSCALL_WRAP2(munlock, compat_ulong_t, start, compat_size_t, len); +COMPAT_SYSCALL_WRAP2(mlock, unsigned long, start, size_t, len); +COMPAT_SYSCALL_WRAP2(munlock, unsigned long, start, size_t, len); COMPAT_SYSCALL_WRAP1(mlockall, int, flags); -COMPAT_SYSCALL_WRAP2(sched_setparam, compat_pid_t, pid, struct sched_param __user *, param); -COMPAT_SYSCALL_WRAP2(sched_getparam, compat_pid_t, pid, struct sched_param __user *, param); -COMPAT_SYSCALL_WRAP3(sched_setscheduler, compat_pid_t, pid, int, policy, struct sched_param __user *, param); -COMPAT_SYSCALL_WRAP1(sched_getscheduler, compat_pid_t, pid); +COMPAT_SYSCALL_WRAP2(sched_setparam, pid_t, pid, struct sched_param __user *, param); +COMPAT_SYSCALL_WRAP2(sched_getparam, pid_t, pid, struct sched_param __user *, param); +COMPAT_SYSCALL_WRAP3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param); +COMPAT_SYSCALL_WRAP1(sched_getscheduler, pid_t, pid); COMPAT_SYSCALL_WRAP1(sched_get_priority_max, int, policy); COMPAT_SYSCALL_WRAP1(sched_get_priority_min, int, policy); -COMPAT_SYSCALL_WRAP5(mremap, u32, addr, u32, old_len, u32, new_len, u32, flags, u32, new_addr); +COMPAT_SYSCALL_WRAP5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long, new_len, unsigned long, flags, unsigned long, new_addr); COMPAT_SYSCALL_WRAP3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout); -COMPAT_SYSCALL_WRAP5(prctl, int, option, u32, arg2, u32, arg3, u32, arg4, u32, arg5); -COMPAT_SYSCALL_WRAP2(getcwd, char __user *, buf, u32, size); +COMPAT_SYSCALL_WRAP5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5); +COMPAT_SYSCALL_WRAP2(getcwd, char __user *, buf, unsigned long, size); COMPAT_SYSCALL_WRAP2(capget, cap_user_header_t, header, cap_user_data_t, dataptr); COMPAT_SYSCALL_WRAP2(capset, cap_user_header_t, header, const cap_user_data_t, data); -COMPAT_SYSCALL_WRAP3(lchown, const char __user *, filename, compat_uid_t, user, compat_gid_t, group); -COMPAT_SYSCALL_WRAP2(setreuid, compat_uid_t, ruid, compat_uid_t, euid); -COMPAT_SYSCALL_WRAP2(setregid, compat_gid_t, rgid, compat_gid_t, egid); -COMPAT_SYSCALL_WRAP2(getgroups, int, gidsetsize, compat_gid_t __user *, grouplist); -COMPAT_SYSCALL_WRAP2(setgroups, int, gidsetsize, compat_gid_t __user *, grouplist); -COMPAT_SYSCALL_WRAP3(fchown, unsigned int, fd, compat_uid_t, user, compat_gid_t, group); -COMPAT_SYSCALL_WRAP3(setresuid, compat_uid_t, ruid, compat_uid_t, euid, compat_uid_t, suid); -COMPAT_SYSCALL_WRAP3(getresuid, compat_uid_t __user *, ruid, compat_uid_t __user *, euid, compat_uid_t __user *, suid); -COMPAT_SYSCALL_WRAP3(setresgid, compat_gid_t, rgid, compat_gid_t, egid, compat_gid_t, sgid); -COMPAT_SYSCALL_WRAP3(getresgid, compat_gid_t __user *, rgid, compat_gid_t __user *, egid, compat_gid_t __user *, sgid); 
-COMPAT_SYSCALL_WRAP3(chown, const char __user *, filename, compat_uid_t, user, compat_gid_t, group); -COMPAT_SYSCALL_WRAP1(setuid, compat_uid_t, uid); -COMPAT_SYSCALL_WRAP1(setgid, compat_gid_t, gid); -COMPAT_SYSCALL_WRAP1(setfsuid, compat_uid_t, uid); -COMPAT_SYSCALL_WRAP1(setfsgid, compat_gid_t, gid); +COMPAT_SYSCALL_WRAP3(lchown, const char __user *, filename, uid_t, user, gid_t, group); +COMPAT_SYSCALL_WRAP2(setreuid, uid_t, ruid, uid_t, euid); +COMPAT_SYSCALL_WRAP2(setregid, gid_t, rgid, gid_t, egid); +COMPAT_SYSCALL_WRAP2(getgroups, int, gidsetsize, gid_t __user *, grouplist); +COMPAT_SYSCALL_WRAP2(setgroups, int, gidsetsize, gid_t __user *, grouplist); +COMPAT_SYSCALL_WRAP3(fchown, unsigned int, fd, uid_t, user, gid_t, group); +COMPAT_SYSCALL_WRAP3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid); +COMPAT_SYSCALL_WRAP3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid); +COMPAT_SYSCALL_WRAP3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid); +COMPAT_SYSCALL_WRAP3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid); +COMPAT_SYSCALL_WRAP3(chown, const char __user *, filename, uid_t, user, gid_t, group); +COMPAT_SYSCALL_WRAP1(setuid, uid_t, uid); +COMPAT_SYSCALL_WRAP1(setgid, gid_t, gid); +COMPAT_SYSCALL_WRAP1(setfsuid, uid_t, uid); +COMPAT_SYSCALL_WRAP1(setfsgid, gid_t, gid); COMPAT_SYSCALL_WRAP2(pivot_root, const char __user *, new_root, const char __user *, put_old); -COMPAT_SYSCALL_WRAP3(mincore, compat_ulong_t, start, compat_size_t, len, unsigned char __user *, vec); -COMPAT_SYSCALL_WRAP3(madvise, compat_ulong_t, start, compat_size_t, len, int, behavior); -COMPAT_SYSCALL_WRAP5(setxattr, const char __user *, path, const char __user *, name, const void __user *, value, compat_size_t, size, int, flags); -COMPAT_SYSCALL_WRAP5(lsetxattr, const char __user *, path, const char __user *, name, const void __user *, value, compat_size_t, size, int, flags); -COMPAT_SYSCALL_WRAP5(fsetxattr, int, fd, const char __user *, name, const void __user *, value, compat_size_t, size, int, flags); +COMPAT_SYSCALL_WRAP3(mincore, unsigned long, start, size_t, len, unsigned char __user *, vec); +COMPAT_SYSCALL_WRAP3(madvise, unsigned long, start, size_t, len, int, behavior); +COMPAT_SYSCALL_WRAP5(setxattr, const char __user *, path, const char __user *, name, const void __user *, value, size_t, size, int, flags); +COMPAT_SYSCALL_WRAP5(lsetxattr, const char __user *, path, const char __user *, name, const void __user *, value, size_t, size, int, flags); +COMPAT_SYSCALL_WRAP5(fsetxattr, int, fd, const char __user *, name, const void __user *, value, size_t, size, int, flags); COMPAT_SYSCALL_WRAP3(getdents64, unsigned int, fd, struct linux_dirent64 __user *, dirent, unsigned int, count); -COMPAT_SYSCALL_WRAP4(getxattr, const char __user *, path, const char __user *, name, void __user *, value, compat_size_t, size); -COMPAT_SYSCALL_WRAP4(lgetxattr, const char __user *, path, const char __user *, name, void __user *, value, compat_size_t, size); -COMPAT_SYSCALL_WRAP4(fgetxattr, int, fd, const char __user *, name, void __user *, value, compat_size_t, size); -COMPAT_SYSCALL_WRAP3(listxattr, const char __user *, path, char __user *, list, compat_size_t, size); -COMPAT_SYSCALL_WRAP3(llistxattr, const char __user *, path, char __user *, list, compat_size_t, size); -COMPAT_SYSCALL_WRAP3(flistxattr, int, fd, char __user *, list, compat_size_t, size); +COMPAT_SYSCALL_WRAP4(getxattr, const char __user *, path, const char __user *, name, void __user *, value, size_t, 
size); +COMPAT_SYSCALL_WRAP4(lgetxattr, const char __user *, path, const char __user *, name, void __user *, value, size_t, size); +COMPAT_SYSCALL_WRAP4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size); +COMPAT_SYSCALL_WRAP3(listxattr, const char __user *, path, char __user *, list, size_t, size); +COMPAT_SYSCALL_WRAP3(llistxattr, const char __user *, path, char __user *, list, size_t, size); +COMPAT_SYSCALL_WRAP3(flistxattr, int, fd, char __user *, list, size_t, size); COMPAT_SYSCALL_WRAP2(removexattr, const char __user *, path, const char __user *, name); COMPAT_SYSCALL_WRAP2(lremovexattr, const char __user *, path, const char __user *, name); COMPAT_SYSCALL_WRAP2(fremovexattr, int, fd, const char __user *, name); @@ -131,19 +147,19 @@ COMPAT_SYSCALL_WRAP4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event COMPAT_SYSCALL_WRAP4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout); COMPAT_SYSCALL_WRAP1(timer_getoverrun, timer_t, timer_id); COMPAT_SYSCALL_WRAP1(timer_delete, compat_timer_t, compat_timer_id); -COMPAT_SYSCALL_WRAP1(io_destroy, compat_aio_context_t, ctx); -COMPAT_SYSCALL_WRAP3(io_cancel, compat_aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result); +COMPAT_SYSCALL_WRAP1(io_destroy, aio_context_t, ctx); +COMPAT_SYSCALL_WRAP3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result); COMPAT_SYSCALL_WRAP1(mq_unlink, const char __user *, name); -COMPAT_SYSCALL_WRAP5(add_key, const char __user *, tp, const char __user *, dsc, const void __user *, pld, compat_size_t, len, key_serial_t, id); +COMPAT_SYSCALL_WRAP5(add_key, const char __user *, tp, const char __user *, dsc, const void __user *, pld, size_t, len, key_serial_t, id); COMPAT_SYSCALL_WRAP4(request_key, const char __user *, tp, const char __user *, dsc, const char __user *, info, key_serial_t, id); -COMPAT_SYSCALL_WRAP5(remap_file_pages, u32, start, u32, size, u32, prot, u32, pgoff, u32, flags); +COMPAT_SYSCALL_WRAP5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long, prot, unsigned long, pgoff, unsigned long, flags); COMPAT_SYSCALL_WRAP3(ioprio_set, int, which, int, who, int, ioprio); COMPAT_SYSCALL_WRAP2(ioprio_get, int, which, int, who); COMPAT_SYSCALL_WRAP3(inotify_add_watch, int, fd, const char __user *, path, u32, mask); COMPAT_SYSCALL_WRAP2(inotify_rm_watch, int, fd, __s32, wd); COMPAT_SYSCALL_WRAP3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode); COMPAT_SYSCALL_WRAP4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned, dev); -COMPAT_SYSCALL_WRAP5(fchownat, int, dfd, const char __user *, filename, compat_uid_t, user, compat_gid_t, group, int, flag); +COMPAT_SYSCALL_WRAP5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag); COMPAT_SYSCALL_WRAP3(unlinkat, int, dfd, const char __user *, pathname, int, flag); COMPAT_SYSCALL_WRAP4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname); COMPAT_SYSCALL_WRAP5(linkat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, int, flags); @@ -151,9 +167,9 @@ COMPAT_SYSCALL_WRAP3(symlinkat, const char __user *, oldname, int, newdfd, const COMPAT_SYSCALL_WRAP4(readlinkat, int, dfd, const char __user *, path, char __user *, buf, int, bufsiz); COMPAT_SYSCALL_WRAP3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode); COMPAT_SYSCALL_WRAP3(faccessat, int, dfd, 
const char __user *, filename, int, mode); -COMPAT_SYSCALL_WRAP1(unshare, compat_ulong_t, unshare_flags); -COMPAT_SYSCALL_WRAP6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, compat_size_t, len, unsigned int, flags); -COMPAT_SYSCALL_WRAP4(tee, int, fdin, int, fdout, compat_size_t, len, unsigned int, flags); +COMPAT_SYSCALL_WRAP1(unshare, unsigned long, unshare_flags); +COMPAT_SYSCALL_WRAP6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags); +COMPAT_SYSCALL_WRAP4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags); COMPAT_SYSCALL_WRAP3(getcpu, unsigned __user *, cpu, unsigned __user *, node, struct getcpu_cache __user *, cache); COMPAT_SYSCALL_WRAP1(eventfd, unsigned int, count); COMPAT_SYSCALL_WRAP2(timerfd_create, int, clockid, int, flags); @@ -164,15 +180,15 @@ COMPAT_SYSCALL_WRAP3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) COMPAT_SYSCALL_WRAP1(epoll_create1, int, flags); COMPAT_SYSCALL_WRAP2(tkill, int, pid, int, sig); COMPAT_SYSCALL_WRAP3(tgkill, int, tgid, int, pid, int, sig); -COMPAT_SYSCALL_WRAP5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, u32, flags); -COMPAT_SYSCALL_WRAP5(clone, u32, newsp, u32, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val); +COMPAT_SYSCALL_WRAP5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags); +COMPAT_SYSCALL_WRAP5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val); COMPAT_SYSCALL_WRAP2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags); COMPAT_SYSCALL_WRAP4(prlimit64, pid_t, pid, unsigned int, resource, const struct rlimit64 __user *, new_rlim, struct rlimit64 __user *, old_rlim); COMPAT_SYSCALL_WRAP5(name_to_handle_at, int, dfd, const char __user *, name, struct file_handle __user *, handle, int __user *, mnt_id, int, flag); COMPAT_SYSCALL_WRAP1(syncfs, int, fd); COMPAT_SYSCALL_WRAP2(setns, int, fd, int, nstype); COMPAT_SYSCALL_WRAP2(s390_runtime_instr, int, command, int, signum); -COMPAT_SYSCALL_WRAP5(kcmp, pid_t, pid1, pid_t, pid2, int, type, u32, idx1, u32, idx2); +COMPAT_SYSCALL_WRAP5(kcmp, pid_t, pid1, pid_t, pid2, int, type, unsigned long, idx1, unsigned long, idx2); COMPAT_SYSCALL_WRAP3(finit_module, int, fd, const char __user *, uargs, int, flags); COMPAT_SYSCALL_WRAP3(sched_setattr, pid_t, pid, struct sched_attr __user *, attr, unsigned int, flags); COMPAT_SYSCALL_WRAP4(sched_getattr, pid_t, pid, struct sched_attr __user *, attr, unsigned int, size, unsigned int, flags); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a747a77ea584..1e67b7a5968c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -98,6 +98,8 @@ struct sigaltstack; #define __MAP(n,...) 
__MAP##n(__VA_ARGS__) #define __SC_DECL(t, a) t a +#define __TYPE_IS_L(t) (__same_type((t)0, 0L)) +#define __TYPE_IS_UL(t) (__same_type((t)0, 0UL)) #define __TYPE_IS_LL(t) (__same_type((t)0, 0LL) || __same_type((t)0, 0ULL)) #define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a #define __SC_CAST(t, a) (t) a -- cgit v1.2.3 From 668f9abbd4334e6c29fa8acd71635c4f9101caa7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 3 Mar 2014 15:38:18 -0800 Subject: mm: close PageTail race Commit bf6bddf1924e ("mm: introduce compaction and migration for ballooned pages") introduces page_count(page) into memory compaction which dereferences page->first_page if PageTail(page). This results in a very rare NULL pointer dereference on the aforementioned page_count(page). Indeed, anything that does compound_head(), including page_count() is susceptible to racing with prep_compound_page() and seeing a NULL or dangling page->first_page pointer. This patch uses Andrea's implementation of compound_trans_head() that deals with such a race and makes it the default compound_head() implementation. This includes a read memory barrier that ensures that if PageTail(head) is true that we return a head page that is neither NULL nor dangling. The patch then adds a store memory barrier to prep_compound_page() to ensure page->first_page is set. This is the safest way to ensure we see the head page that we are expecting, PageTail(page) is already in the unlikely() path and the memory barriers are unfortunately required. Hugetlbfs is the exception, we don't enforce a store memory barrier during init since no race is possible. Signed-off-by: David Rientjes Cc: Holger Kiehl Cc: Christoph Lameter Cc: Rafael Aquini Cc: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Rik van Riel Cc: "Kirill A. Shutemov" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoecmd.c | 4 ++-- drivers/vfio/vfio_iommu_type1.c | 4 ++-- fs/proc/page.c | 5 ++--- include/linux/huge_mm.h | 41 ----------------------------------------- include/linux/mm.h | 14 ++++++++++++-- mm/ksm.c | 2 +- mm/memory-failure.c | 2 +- mm/page_alloc.c | 4 +++- mm/swap.c | 4 ++-- 9 files changed, 25 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 8184451b57c0..422b7d84f686 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -874,7 +874,7 @@ bio_pageinc(struct bio *bio) /* Non-zero page count for non-head members of * compound pages is no longer allowed by the kernel. 
*/ - page = compound_trans_head(bv.bv_page); + page = compound_head(bv.bv_page); atomic_inc(&page->_count); } } @@ -887,7 +887,7 @@ bio_pagedec(struct bio *bio) struct bvec_iter iter; bio_for_each_segment(bv, bio, iter) { - page = compound_trans_head(bv.bv_page); + page = compound_head(bv.bv_page); atomic_dec(&page->_count); } } diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 4fb7a8f83c8a..54af4e933695 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -186,12 +186,12 @@ static bool is_invalid_reserved_pfn(unsigned long pfn) if (pfn_valid(pfn)) { bool reserved; struct page *tail = pfn_to_page(pfn); - struct page *head = compound_trans_head(tail); + struct page *head = compound_head(tail); reserved = !!(PageReserved(head)); if (head != tail) { /* * "head" is not a dangling pointer - * (compound_trans_head takes care of that) + * (compound_head takes care of that) * but the hugepage may have been split * from under us (and we may not hold a * reference count on the head page so it can diff --git a/fs/proc/page.c b/fs/proc/page.c index 02174a610315..e647c55275d9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -121,9 +121,8 @@ u64 stable_page_flags(struct page *page) * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page) && - (PageLRU(compound_trans_head(page)) || - PageAnon(compound_trans_head(page)))) + else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || + PageAnon(compound_head(page)))) u |= 1 << KPF_THP; /* diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index db512014e061..b826239bdce0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -157,46 +157,6 @@ static inline int hpage_nr_pages(struct page *page) return HPAGE_PMD_NR; return 1; } -/* - * compound_trans_head() should be used instead of compound_head(), - * whenever the "page" passed as parameter could be the tail of a - * transparent hugepage that could be undergoing a - * __split_huge_page_refcount(). The page structure layout often - * changes across releases and it makes extensive use of unions. So if - * the page structure layout will change in a way that - * page->first_page gets clobbered by __split_huge_page_refcount, the - * implementation making use of smp_rmb() will be required. - * - * Currently we define compound_trans_head as compound_head, because - * page->private is in the same union with page->first_page, and - * page->private isn't clobbered. However this also means we're - * currently leaving dirt into the page->private field of anonymous - * pages resulting from a THP split, instead of setting page->private - * to zero like for every other page that has PG_private not set. But - * anonymous pages don't use page->private so this is not a problem. - */ -#if 0 -/* This will be needed if page->private will be clobbered in split_huge_page */ -static inline struct page *compound_trans_head(struct page *page) -{ - if (PageTail(page)) { - struct page *head; - head = page->first_page; - smp_rmb(); - /* - * head may be a dangling pointer. - * __split_huge_page_refcount clears PageTail before - * overwriting first_page, so if PageTail is still - * there it means the head pointer isn't dangling. 
- */ - if (PageTail(page)) - return head; - } - return page; -} -#else -#define compound_trans_head(page) compound_head(page) -#endif extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp); @@ -226,7 +186,6 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define split_huge_page_pmd_mm(__mm, __address, __pmd) \ do { } while (0) -#define compound_trans_head(page) compound_head(page) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { diff --git a/include/linux/mm.h b/include/linux/mm.h index f28f46eade6a..03ab3e58f511 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -399,8 +399,18 @@ static inline void compound_unlock_irqrestore(struct page *page, static inline struct page *compound_head(struct page *page) { - if (unlikely(PageTail(page))) - return page->first_page; + if (unlikely(PageTail(page))) { + struct page *head = page->first_page; + + /* + * page->first_page may be a dangling pointer to an old + * compound page, so recheck that it is still a tail + * page before returning. + */ + smp_rmb(); + if (likely(PageTail(page))) + return head; + } return page; } diff --git a/mm/ksm.c b/mm/ksm.c index aa4c7c7250c1..68710e80994a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item) static struct page *page_trans_compound_anon(struct page *page) { if (PageTransCompound(page)) { - struct page *head = compound_trans_head(page); + struct page *head = compound_head(page); /* * head may actually be splitted and freed from under * us but it's ok here. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2f2f34a4e77d..90002ea43638 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1651,7 +1651,7 @@ int soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_trans_head(page); + struct page *hpage = compound_head(page); if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3758a09a009..3d1bf889465a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageTail(p); set_page_count(p, 0); p->first_page = page; + /* Make sure p->first_page is always valid for PageTail() */ + smp_wmb(); + __SetPageTail(p); } } diff --git a/mm/swap.c b/mm/swap.c index b31ba67d440a..0092097b3f4c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -98,7 +98,7 @@ static void put_compound_page(struct page *page) } /* __split_huge_page_refcount can run under us */ - page_head = compound_trans_head(page); + page_head = compound_head(page); /* * THP can not break up slab pages so avoid taking @@ -253,7 +253,7 @@ bool __get_page_tail(struct page *page) */ unsigned long flags; bool got; - struct page *page_head = compound_trans_head(page); + struct page *page_head = compound_head(page); /* Ref to put_compound_page() comment. 
*/ if (!__compound_tail_refcounted(page_head)) { -- cgit v1.2.3 From 9050d7eba40b3d79551668f54e68fd6f51945ef3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Mar 2014 15:38:27 -0800 Subject: mm: include VM_MIXEDMAP flag in the VM_SPECIAL list to avoid m(un)locking Daniel Borkmann reported a VM_BUG_ON assertion failing: ------------[ cut here ]------------ kernel BUG at mm/mlock.c:528! invalid opcode: 0000 [#1] SMP Modules linked in: ccm arc4 iwldvm [...] video CPU: 3 PID: 2266 Comm: netsniff-ng Not tainted 3.14.0-rc2+ #8 Hardware name: LENOVO 2429BP3/2429BP3, BIOS G4ET37WW (1.12 ) 05/29/2012 task: ffff8801f87f9820 ti: ffff88002cb44000 task.ti: ffff88002cb44000 RIP: 0010:[] [] munlock_vma_pages_range+0x2e0/0x2f0 Call Trace: do_munmap+0x18f/0x3b0 vm_munmap+0x41/0x60 SyS_munmap+0x22/0x30 system_call_fastpath+0x1a/0x1f RIP munlock_vma_pages_range+0x2e0/0x2f0 ---[ end trace a0088dcf07ae10f2 ]--- because munlock_vma_pages_range() thinks it's unexpectedly in the middle of a THP page. This can be reproduced with default config since 3.11 kernels. A reproducer can be found in the kernel's selftest directory for networking by running ./psock_tpacket. The problem is that an order=2 compound page (allocated by alloc_one_pg_vec_page() is part of the munlocked VM_MIXEDMAP vma (mapped by packet_mmap()) and mistaken for a THP page and assumed to be order=9. The checks for THP in munlock came with commit ff6a6da60b89 ("mm: accelerate munlock() treatment of THP pages"), i.e. since 3.9, but did not trigger a bug. It just makes munlock_vma_pages_range() skip such compound pages until the next 512-pages-aligned page, when it encounters a head page. This is however not a problem for vma's where mlocking has no effect anyway, but it can distort the accounting. Since commit 7225522bb429 ("mm: munlock: batch non-THP page isolation and munlock+putback using pagevec") this can trigger a VM_BUG_ON in PageTransHuge() check. This patch fixes the issue by adding VM_MIXEDMAP flag to VM_SPECIAL, a list of flags that make vma's non-mlockable and non-mergeable. The reasoning is that VM_MIXEDMAP vma's are similar to VM_PFNMAP, which is already on the VM_SPECIAL list, and both are intended for non-LRU pages where mlocking makes no sense anyway. Related Lkml discussion can be found in [2]. [1] tools/testing/selftests/net/psock_tpacket [2] https://lkml.org/lkml/2014/1/10/427 Signed-off-by: Vlastimil Babka Signed-off-by: Daniel Borkmann Reported-by: Daniel Borkmann Tested-by: Daniel Borkmann Cc: Thomas Hellstrom Cc: John David Anglin Cc: HATAYAMA Daisuke Cc: Konstantin Khlebnikov Cc: Carsten Otte Cc: Jared Hulbert Tested-by: Hannes Frederic Sowa Cc: Kirill A. Shutemov Acked-by: Rik van Riel Cc: Andrea Arcangeli Cc: [3.11.x+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/huge_memory.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 03ab3e58f511..a1fe25110f50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -175,7 +175,7 @@ extern unsigned int kobjsize(const void *objp); * Special vmas that are non-mergable, non-mlock()able. * Note: mm/huge_memory.c VM_NO_THP depends on this definition. 
*/ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) /* * mapping from the currently active vm_flags protection bits (the diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4df39b1bde91..1546655a2d78 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1961,7 +1961,7 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) +#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) -- cgit v1.2.3 From 1ae71d03194ea7424cbd14e449581f67c463d20d Mon Sep 17 00:00:00 2001 From: Liu Ping Fan Date: Mon, 3 Mar 2014 15:38:39 -0800 Subject: mm: numa: bugfix for LAST_CPUPID_NOT_IN_PAGE_FLAGS When doing some numa tests on powerpc, I triggered an oops bug. I find it is caused by using page->_last_cpupid. It should be initialized as "-1 & LAST_CPUPID_MASK", but not "-1". Otherwise, in task_numa_fault(), we will miss the checking (last_cpupid == (-1 & LAST_CPUPID_MASK)). And finally cause an oops bug in task_numa_group(), since the online cpu is less than possible cpu. This happen with CONFIG_SPARSE_VMEMMAP disabled Call trace: SMP NR_CPUS=64 NUMA PowerNV Modules linked in: CPU: 24 PID: 804 Comm: systemd-udevd Not tainted3.13.0-rc1+ #32 task: c000001e2746aa80 ti: c000001e32c50000 task.ti:c000001e32c50000 REGS: c000001e32c53510 TRAP: 0300 Not tainted(3.13.0-rc1+) MSR: 9000000000009032 CR:28024424 XER: 20000000 CFAR: c000000000009324 DAR: 7265717569726857 DSISR:40000000 SOFTE: 1 NIP .task_numa_fault+0x1470/0x2370 LR .task_numa_fault+0x1468/0x2370 Call Trace: .task_numa_fault+0x1468/0x2370 (unreliable) .do_numa_page+0x480/0x4a0 .handle_mm_fault+0x4ec/0xc90 .do_page_fault+0x3a8/0x890 handle_page_fault+0x10/0x30 Instruction dump: 3c82fefb 3884b138 48d9cff1 60000000 48000574 3c62fefb3863af78 3c82fefb 3884b138 48d9cfd5 60000000 e93f0100 <812902e4> 7d2907b45529063e 7d2a07b4 ---[ end trace 15f2510da5ae07cf ]--- Signed-off-by: Liu Ping Fan Signed-off-by: Aneesh Kumar K.V Acked-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a1fe25110f50..c1b7414c7bef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -767,7 +767,7 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { - return xchg(&page->_last_cpupid, cpupid); + return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK); } static inline int page_cpupid_last(struct page *page) @@ -776,7 +776,7 @@ static inline int page_cpupid_last(struct page *page) } static inline void page_cpupid_reset_last(struct page *page) { - page->_last_cpupid = -1; + page->_last_cpupid = -1 & LAST_CPUPID_MASK; } #else static inline int page_cpupid_last(struct page *page) -- cgit v1.2.3 From 3e909599215456928e6b42a04f11c2517881570b Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 15 Jan 2014 13:21:22 +0000 Subject: efi: Move facility flags to struct efi As we grow support for more EFI architectures they're going to want the ability to query which EFI features are available on the running system. 
Instead of storing this information in an architecture-specific place, stick it in the global 'struct efi', which is already the central location for EFI state. While we're at it, let's change the return value of efi_enabled() to be bool and replace all references to 'facility' with 'feature', which is the usual word used to describe the attributes of the running system. Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 1 - arch/x86/kernel/setup.c | 6 +++--- arch/x86/platform/efi/efi.c | 21 +++++---------------- include/linux/efi.h | 18 +++++++++++++----- 4 files changed, 21 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 3d6b9f81cc68..a7a8bd89024e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -119,7 +119,6 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, #endif /* CONFIG_X86_32 */ extern int add_efi_memmap; -extern unsigned long x86_efi_facility; extern struct efi_scratch efi_scratch; extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 06853e670354..3b5d278b7d5b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -926,11 +926,11 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, "EL32", 4)) { - set_bit(EFI_BOOT, &x86_efi_facility); + set_bit(EFI_BOOT, &efi.flags); } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, "EL64", 4)) { - set_bit(EFI_BOOT, &x86_efi_facility); - set_bit(EFI_64BIT, &x86_efi_facility); + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); } if (efi_enabled(EFI_BOOT)) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1a201ac7cef8..821562984452 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -67,8 +67,6 @@ struct efi_memory_map memmap; static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; -unsigned long x86_efi_facility; - static __initdata efi_config_table_type_t arch_tables[] = { #ifdef CONFIG_X86_UV {UV_SYSTEM_TABLE_GUID, "UVsystab", &efi.uv_systab}, @@ -78,15 +76,6 @@ static __initdata efi_config_table_type_t arch_tables[] = { u64 efi_setup; /* efi setup_data physical address */ -/* - * Returns 1 if 'facility' is enabled, 0 otherwise. 
- */ -int efi_enabled(int facility) -{ - return test_bit(facility, &x86_efi_facility) != 0; -} -EXPORT_SYMBOL(efi_enabled); - static bool __initdata disable_runtime = false; static int __init setup_noefi(char *arg) { @@ -455,7 +444,7 @@ void __init efi_reserve_boot_services(void) void __init efi_unmap_memmap(void) { - clear_bit(EFI_MEMMAP, &x86_efi_facility); + clear_bit(EFI_MEMMAP, &efi.flags); if (memmap.map) { early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; @@ -722,7 +711,7 @@ void __init efi_init(void) if (efi_systab_init(efi_phys.systab)) return; - set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility); + set_bit(EFI_SYSTEM_TABLES, &efi.flags); efi.config_table = (unsigned long)efi.systab->tables; efi.fw_vendor = (unsigned long)efi.systab->fw_vendor; @@ -750,7 +739,7 @@ void __init efi_init(void) if (efi_config_init(arch_tables)) return; - set_bit(EFI_CONFIG_TABLES, &x86_efi_facility); + set_bit(EFI_CONFIG_TABLES, &efi.flags); /* * Note: We currently don't support runtime services on an EFI @@ -762,12 +751,12 @@ void __init efi_init(void) else { if (disable_runtime || efi_runtime_init()) return; - set_bit(EFI_RUNTIME_SERVICES, &x86_efi_facility); + set_bit(EFI_RUNTIME_SERVICES, &efi.flags); } if (efi_memmap_init()) return; - set_bit(EFI_MEMMAP, &x86_efi_facility); + set_bit(EFI_MEMMAP, &efi.flags); print_efi_memmap(); } diff --git a/include/linux/efi.h b/include/linux/efi.h index 0a819e7a60c9..214833b4a97d 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -573,6 +573,7 @@ extern struct efi { efi_reset_system_t *reset_system; efi_set_virtual_address_map_t *set_virtual_address_map; struct efi_memory_map *memmap; + unsigned long flags; } efi; static inline int @@ -660,17 +661,24 @@ extern int __init efi_setup_pcdp_console(char *); #ifdef CONFIG_EFI # ifdef CONFIG_X86 -extern int efi_enabled(int facility); + +/* + * Test whether the above EFI_* bits are enabled. + */ +static inline bool efi_enabled(int feature) +{ + return test_bit(feature, &efi.flags) != 0; +} # else -static inline int efi_enabled(int facility) +static inline bool efi_enabled(int feature) { - return 1; + return true; } # endif #else -static inline int efi_enabled(int facility) +static inline bool efi_enabled(int feature) { - return 0; + return false; } #endif -- cgit v1.2.3 From 092063808c498eccac8e891973bf143e7b60d723 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 15 Jan 2014 13:49:51 +0000 Subject: ia64/efi: Implement efi_enabled() There's no good reason to keep efi_enabled() under CONFIG_X86 anymore, since nothing about the implementation is specific to x86. Set EFI feature flags in the ia64 boot path instead of claiming to support all features. The old behaviour was actually buggy since efi.memmap never points to a valid memory map, so we shouldn't be claiming to support EFI_MEMMAP. Fortunately, this bug was never triggered because EFI_MEMMAP isn't used outside of arch/x86 currently, but that may not always be the case. 
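
Taken together with the previous patch, the end state is one shared flag word: each architecture's boot path records what the firmware provided with set_bit(), and every other consumer asks questions through efi_enabled(). The standalone userspace sketch below models that flow only; the bit values and the local set_bit()/test_bit() helpers are simplified stand-ins, not the kernel definitions.

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative bit numbers only; the kernel defines its own EFI_* constants. */
    #define EFI_BOOT             0
    #define EFI_RUNTIME_SERVICES 1
    #define EFI_64BIT            2

    static unsigned long efi_flags;                 /* models efi.flags */

    static void set_bit(int nr, unsigned long *word)        { *word |= 1UL << nr; }
    static bool test_bit(int nr, const unsigned long *word) { return (*word >> nr) & 1UL; }

    static bool efi_enabled(int feature)
    {
            return test_bit(feature, &efi_flags);
    }

    int main(void)
    {
            /* architecture boot path: record what the firmware handed over */
            set_bit(EFI_BOOT, &efi_flags);
            set_bit(EFI_RUNTIME_SERVICES, &efi_flags);

            /* generic code: query capabilities instead of arch-specific globals */
            if (efi_enabled(EFI_RUNTIME_SERVICES))
                    printf("runtime services available\n");
            if (!efi_enabled(EFI_64BIT))
                    printf("firmware is not 64-bit\n");
            return 0;
    }
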
Reviewed-and-tested-by: Tony Luck Signed-off-by: Matt Fleming --- arch/ia64/kernel/efi.c | 7 +++++++ include/linux/efi.h | 8 -------- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index da5b462e6de6..741b99c1a0b1 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -477,6 +477,9 @@ efi_init (void) char *cp, vendor[100] = "unknown"; int i; + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + /* * It's too early to be able to use the standard kernel command line * support... @@ -529,6 +532,8 @@ efi_init (void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); + set_bit(EFI_SYSTEM_TABLES, &efi.flags); + palo_phys = EFI_INVALID_TABLE_ADDR; if (efi_config_init(arch_tables) != 0) @@ -657,6 +662,8 @@ efi_enter_virtual_mode (void) return; } + set_bit(EFI_RUNTIME_SERVICES, &efi.flags); + /* * Now that EFI is in virtual mode, we call the EFI functions more * efficiently: diff --git a/include/linux/efi.h b/include/linux/efi.h index 214833b4a97d..64d532ca890a 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -660,8 +660,6 @@ extern int __init efi_setup_pcdp_console(char *); #define EFI_ARCH_1 6 /* First arch-specific bit */ #ifdef CONFIG_EFI -# ifdef CONFIG_X86 - /* * Test whether the above EFI_* bits are enabled. */ @@ -669,12 +667,6 @@ static inline bool efi_enabled(int feature) { return test_bit(feature, &efi.flags) != 0; } -# else -static inline bool efi_enabled(int feature) -{ - return true; -} -# endif #else static inline bool efi_enabled(int feature) { -- cgit v1.2.3 From 792d0018a5fe31ef8ef9d07a7a02081d4abdf6b7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Feb 2014 21:40:14 +0000 Subject: genirq: Add a kstat helper to increment irq stats There is a common pattern all over the place: kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); This results in a call to core code anyway. So provide a function which does the same thing in core. While at it, replace the butt ugly macro with an inline. 
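
At a call site the change looks like the sketch below: a made-up interrupt routine that accounts a per-cpu statistic before doing its device-specific work. Only kstat_incr_irq_this_cpu() comes from this patch; the surrounding function is hypothetical and is meant to live in in-tree code rather than build on its own. The follow-up patch below then removes the open-coded form from the public header entirely.

    #include <linux/irqdesc.h>
    #include <linux/kernel_stat.h>

    /* old pattern, open-coded at each site */
    static void my_timer_interrupt_old(unsigned int irq)
    {
            kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
            /* ... device-specific handling ... */
    }

    /* with the new helper, the descriptor lookup stays in core code */
    static void my_timer_interrupt_new(unsigned int irq)
    {
            kstat_incr_irq_this_cpu(irq);
            /* ... device-specific handling ... */
    }
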
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140223212737.422068876@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/kernel_stat.h | 12 +++++++----- kernel/irq/irqdesc.c | 5 +++++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 51c72be4a7c3..54ec7e0a7d72 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -54,11 +54,13 @@ extern unsigned long long nr_context_switches(void); #include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#define kstat_incr_irqs_this_cpu(irqno, DESC) \ -do { \ - __this_cpu_inc(*(DESC)->kstat_irqs); \ - __this_cpu_inc(kstat.irqs_sum); \ -} while (0) +static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) +{ + __this_cpu_inc(*desc->kstat_irqs); + __this_cpu_inc(kstat.irqs_sum); +} + +extern void kstat_incr_irq_this_cpu(unsigned int irq); static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) { diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8ab8e9390297..a7174617616b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -489,6 +489,11 @@ void dynamic_irq_cleanup(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } +void kstat_incr_irq_this_cpu(unsigned int irq) +{ + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); +} + unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); -- cgit v1.2.3 From 8f945a3325bbe0dd651e2f496a53df9b06fc6d07 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Feb 2014 21:40:23 +0000 Subject: genirq: Move kstat_incr_irqs_this_cpu() to core No more users outside the core code. Put it into the poison cabinet. That also gets rid of the linux/irq.h include in kernel_stat.h Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140223212739.124207133@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/kernel_stat.h | 8 -------- kernel/irq/internals.h | 7 +++++++ 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 54ec7e0a7d72..c3fbda7690d6 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -51,15 +51,7 @@ DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat); extern unsigned long long nr_context_switches(void); -#include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); - -static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) -{ - __this_cpu_inc(*desc->kstat_irqs); - __this_cpu_inc(kstat.irqs_sum); -} - extern void kstat_incr_irq_this_cpu(unsigned int irq); static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index d61ac29e32d0..17b671713d5f 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -6,6 +6,7 @@ * of this file for your non core code. 
*/ #include +#include #ifdef CONFIG_SPARSE_IRQ # define IRQ_BITMAP_BITS (NR_IRQS + 8196) @@ -180,3 +181,9 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) { return d->state_use_accessors & mask; } + +static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) +{ + __this_cpu_inc(*desc->kstat_irqs); + __this_cpu_inc(kstat.irqs_sum); +} -- cgit v1.2.3 From 677703cef0a148ba07d37ced649ad25b1cda2f78 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 10 Jan 2014 13:47:37 +0000 Subject: efi: Add separate 32-bit/64-bit definitions The traditional approach of using machine-specific types such as 'unsigned long' does not allow the kernel to interact with firmware running in a different CPU mode, e.g. 64-bit kernel with 32-bit EFI. Add distinct EFI structure definitions for both 32-bit and 64-bit so that we can use them in the 32-bit and 64-bit code paths. Acked-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.h | 44 +++++++ include/linux/efi.h | 252 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 296 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index 81b6b652b46a..d487e727f1ec 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -37,6 +37,24 @@ struct efi_graphics_output_mode_info { u32 pixels_per_scan_line; } __packed; +struct efi_graphics_output_protocol_mode_32 { + u32 max_mode; + u32 mode; + u32 info; + u32 size_of_info; + u64 frame_buffer_base; + u32 frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_mode_64 { + u32 max_mode; + u32 mode; + u64 info; + u64 size_of_info; + u64 frame_buffer_base; + u64 frame_buffer_size; +} __packed; + struct efi_graphics_output_protocol_mode { u32 max_mode; u32 mode; @@ -46,6 +64,20 @@ struct efi_graphics_output_protocol_mode { unsigned long frame_buffer_size; } __packed; +struct efi_graphics_output_protocol_32 { + u32 query_mode; + u32 set_mode; + u32 blt; + u32 mode; +}; + +struct efi_graphics_output_protocol_64 { + u64 query_mode; + u64 set_mode; + u64 blt; + u64 mode; +}; + struct efi_graphics_output_protocol { void *query_mode; unsigned long set_mode; @@ -53,6 +85,18 @@ struct efi_graphics_output_protocol { struct efi_graphics_output_protocol_mode *mode; }; +struct efi_uga_draw_protocol_32 { + u32 get_mode; + u32 set_mode; + u32 blt; +}; + +struct efi_uga_draw_protocol_64 { + u64 get_mode; + u64 set_mode; + u64 blt; +}; + struct efi_uga_draw_protocol { void *get_mode; void *set_mode; diff --git a/include/linux/efi.h b/include/linux/efi.h index 0a819e7a60c9..42c662729e13 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -153,6 +153,102 @@ typedef struct { u8 sets_to_zero; } efi_time_cap_t; +typedef struct { + efi_table_hdr_t hdr; + u32 raise_tpl; + u32 restore_tpl; + u32 allocate_pages; + u32 free_pages; + u32 get_memory_map; + u32 allocate_pool; + u32 free_pool; + u32 create_event; + u32 set_timer; + u32 wait_for_event; + u32 signal_event; + u32 close_event; + u32 check_event; + u32 install_protocol_interface; + u32 reinstall_protocol_interface; + u32 uninstall_protocol_interface; + u32 handle_protocol; + u32 __reserved; + u32 register_protocol_notify; + u32 locate_handle; + u32 locate_device_path; + u32 install_configuration_table; + u32 load_image; + u32 start_image; + u32 exit; + u32 unload_image; + u32 exit_boot_services; + u32 get_next_monotonic_count; + u32 stall; + u32 set_watchdog_timer; + u32 
connect_controller; + u32 disconnect_controller; + u32 open_protocol; + u32 close_protocol; + u32 open_protocol_information; + u32 protocols_per_handle; + u32 locate_handle_buffer; + u32 locate_protocol; + u32 install_multiple_protocol_interfaces; + u32 uninstall_multiple_protocol_interfaces; + u32 calculate_crc32; + u32 copy_mem; + u32 set_mem; + u32 create_event_ex; +} __packed efi_boot_services_32_t; + +typedef struct { + efi_table_hdr_t hdr; + u64 raise_tpl; + u64 restore_tpl; + u64 allocate_pages; + u64 free_pages; + u64 get_memory_map; + u64 allocate_pool; + u64 free_pool; + u64 create_event; + u64 set_timer; + u64 wait_for_event; + u64 signal_event; + u64 close_event; + u64 check_event; + u64 install_protocol_interface; + u64 reinstall_protocol_interface; + u64 uninstall_protocol_interface; + u64 handle_protocol; + u64 __reserved; + u64 register_protocol_notify; + u64 locate_handle; + u64 locate_device_path; + u64 install_configuration_table; + u64 load_image; + u64 start_image; + u64 exit; + u64 unload_image; + u64 exit_boot_services; + u64 get_next_monotonic_count; + u64 stall; + u64 set_watchdog_timer; + u64 connect_controller; + u64 disconnect_controller; + u64 open_protocol; + u64 close_protocol; + u64 open_protocol_information; + u64 protocols_per_handle; + u64 locate_handle_buffer; + u64 locate_protocol; + u64 install_multiple_protocol_interfaces; + u64 uninstall_multiple_protocol_interfaces; + u64 calculate_crc32; + u64 copy_mem; + u64 set_mem; + u64 create_event_ex; +} __packed efi_boot_services_64_t; + /* * EFI Boot Services table */ @@ -231,12 +327,61 @@ typedef enum { EfiPciIoAttributeOperationMaximum } EFI_PCI_IO_PROTOCOL_ATTRIBUTE_OPERATION; +typedef struct { + u32 read; + u32 write; +} efi_pci_io_protocol_access_32_t; + +typedef struct { + u64 read; + u64 write; +} efi_pci_io_protocol_access_64_t; typedef struct { void *read; void *write; } efi_pci_io_protocol_access_t; +typedef struct { + u32 poll_mem; + u32 poll_io; + efi_pci_io_protocol_access_32_t mem; + efi_pci_io_protocol_access_32_t io; + efi_pci_io_protocol_access_32_t pci; + u32 copy_mem; + u32 map; + u32 unmap; + u32 allocate_buffer; + u32 free_buffer; + u32 flush; + u32 get_location; + u32 attributes; + u32 get_bar_attributes; + u32 set_bar_attributes; + uint64_t romsize; + void *romimage; +} efi_pci_io_protocol_32; + +typedef struct { + u64 poll_mem; + u64 poll_io; + efi_pci_io_protocol_access_64_t mem; + efi_pci_io_protocol_access_64_t io; + efi_pci_io_protocol_access_64_t pci; + u64 copy_mem; + u64 map; + u64 unmap; + u64 allocate_buffer; + u64 free_buffer; + u64 flush; + u64 get_location; + u64 attributes; + u64 get_bar_attributes; + u64 set_bar_attributes; + uint64_t romsize; + void *romimage; +} efi_pci_io_protocol_64; + typedef struct { void *poll_mem; void *poll_io; @@ -290,6 +435,42 @@ typedef struct { #define EFI_RUNTIME_SERVICES_SIGNATURE ((u64)0x5652453544e5552ULL) #define EFI_RUNTIME_SERVICES_REVISION 0x00010000 +typedef struct { + efi_table_hdr_t hdr; + u32 get_time; + u32 set_time; + u32 get_wakeup_time; + u32 set_wakeup_time; + u32 set_virtual_address_map; + u32 convert_pointer; + u32 get_variable; + u32 get_next_variable; + u32 set_variable; + u32 get_next_high_mono_count; + u32 reset_system; + u32 update_capsule; + u32 query_capsule_caps; + u32 query_variable_info; +} efi_runtime_services_32_t; + +typedef struct { + efi_table_hdr_t hdr; + u64 get_time; + u64 set_time; + u64 get_wakeup_time; + u64 set_wakeup_time; + u64 set_virtual_address_map; + u64 convert_pointer; + u64 get_variable; + 
u64 get_next_variable; + u64 set_variable; + u64 get_next_high_mono_count; + u64 reset_system; + u64 update_capsule; + u64 query_capsule_caps; + u64 query_variable_info; +} efi_runtime_services_64_t; + typedef struct { efi_table_hdr_t hdr; void *get_time; @@ -483,6 +664,38 @@ struct efi_memory_map { unsigned long desc_size; }; +typedef struct { + u32 revision; + u32 parent_handle; + u32 system_table; + u32 device_handle; + u32 file_path; + u32 reserved; + u32 load_options_size; + u32 load_options; + u32 image_base; + __aligned_u64 image_size; + unsigned int image_code_type; + unsigned int image_data_type; + unsigned long unload; +} efi_loaded_image_32_t; + +typedef struct { + u32 revision; + u64 parent_handle; + u64 system_table; + u64 device_handle; + u64 file_path; + u64 reserved; + u32 load_options_size; + u64 load_options; + u64 image_base; + __aligned_u64 image_size; + unsigned int image_code_type; + unsigned int image_data_type; + unsigned long unload; +} efi_loaded_image_64_t; + typedef struct { u32 revision; void *parent_handle; @@ -511,6 +724,34 @@ typedef struct { efi_char16_t filename[1]; } efi_file_info_t; +typedef struct { + u64 revision; + u32 open; + u32 close; + u32 delete; + u32 read; + u32 write; + u32 get_position; + u32 set_position; + u32 get_info; + u32 set_info; + u32 flush; +} efi_file_handle_32_t; + +typedef struct { + u64 revision; + u64 open; + u64 close; + u64 delete; + u64 read; + u64 write; + u64 get_position; + u64 set_position; + u64 get_info; + u64 set_info; + u64 flush; +} efi_file_handle_64_t; + typedef struct _efi_file_handle { u64 revision; efi_status_t (*open)(struct _efi_file_handle *, @@ -809,6 +1050,17 @@ struct efivar_entry { bool deleting; }; +struct efi_simple_text_output_protocol_32 { + u32 reset; + u32 output_string; + u32 test_string; +}; + +struct efi_simple_text_output_protocol_64 { + u64 reset; + u64 output_string; + u64 test_string; +}; struct efi_simple_text_output_protocol { void *reset; -- cgit v1.2.3 From 291fdb0bcebd5e8db6af767c1fdc522167dad73d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Mar 2014 12:39:03 +0100 Subject: ipc/compat_sys_msgrcv: change msgtyp type from long to compat_long_t Change the type of compat_sys_msgrcv's msgtyp parameter from long to compat_long_t, since compat user space passes only a 32 bit signed value. Let the compat wrapper do proper sign extension to 64 bit of this parameter. 
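
The effect of that sign extension is easy to demonstrate in isolation. The standalone program below is only a model, assuming a 64-bit build where long is 64 bits wide, as on the kernels this patch targets.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* A 32-bit task passes msgtyp = -3; only the low 32 bits of the
             * syscall argument register are meaningful. */
            uint64_t reg = (uint32_t)-3;            /* 0x00000000fffffffd */

            long as_long        = (long)reg;        /* old prototype: a huge positive type */
            long as_compat_long = (int32_t)reg;     /* compat_long_t view: back to -3 */

            printf("interpreted as long:         %ld\n", as_long);
            printf("interpreted as compat_long:  %ld\n", as_compat_long);
            return 0;
    }
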
Signed-off-by: Heiko Carstens --- include/linux/compat.h | 2 +- ipc/compat.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 4c42df3fce37..179b1da9e19f 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -321,7 +321,7 @@ asmlinkage long compat_sys_semctl(int semid, int semnum, int cmd, int arg); asmlinkage long compat_sys_msgsnd(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, int msgflg); asmlinkage long compat_sys_msgrcv(int msqid, compat_uptr_t msgp, - compat_ssize_t msgsz, long msgtyp, int msgflg); + compat_ssize_t msgsz, compat_long_t msgtyp, int msgflg); long compat_sys_msgctl(int first, int second, void __user *uptr); long compat_sys_shmctl(int first, int second, void __user *uptr); long compat_sys_semtimedop(int semid, struct sembuf __user *tsems, diff --git a/ipc/compat.c b/ipc/compat.c index f486b0096a67..e1f4ab65660c 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -430,9 +430,9 @@ COMPAT_SYSCALL_DEFINE4(msgsnd, int, msqid, compat_uptr_t, msgp, } COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, - compat_ssize_t, msgsz, long, msgtyp, int, msgflg) + compat_ssize_t, msgsz, compat_long_t, msgtyp, int, msgflg) { - return do_msgrcv(msqid, compat_ptr(msgp), (ssize_t)msgsz, msgtyp, + return do_msgrcv(msqid, compat_ptr(msgp), (ssize_t)msgsz, (long)msgtyp, msgflg, compat_do_msg_fill); } -- cgit v1.2.3 From 378a10f3ae2e8a67ecc8f548c8e5ff25881bd88a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 5 Mar 2014 10:43:51 +0100 Subject: fs/compat: optional preadv64/pwrite64 compat system calls The preadv64/pwrite64 have been implemented for the x32 ABI, in order to allow passing 64 bit arguments from user space without splitting them into two 32 bit parameters, like it would be necessary for usual compat tasks. Howevert these two system calls are only being used for the x32 ABI, so add __ARCH_WANT_COMPAT defines for these two compat syscalls and make these two only visible for x86. 
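
The difference between the two entry points is simply how a 64-bit file position crosses the 32-bit ABI. The standalone sketch below models the split-and-reassemble step that the regular compat_sys_preadv()/compat_sys_pwritev() paths perform (visible in the diff below), with int64_t standing in for loff_t; x32 avoids the split by passing the full 64-bit value to preadv64/pwritev64.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int64_t pos = 0x123456789aLL;              /* an offset beyond 4 GiB */

            /* ordinary compat ABI: the offset arrives as two 32-bit values ... */
            uint32_t pos_low  = (uint32_t)pos;
            uint32_t pos_high = (uint32_t)(pos >> 32);

            /* ... and is reassembled in the kernel */
            int64_t rebuilt = ((int64_t)pos_high << 32) | pos_low;

            printf("pos     = %#llx\n", (long long)pos);
            printf("rebuilt = %#llx\n", (long long)rebuilt);
            return 0;
    }
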
Signed-off-by: Heiko Carstens --- arch/x86/include/asm/unistd.h | 2 ++ fs/read_write.c | 36 ++++++++++++++++++++++++++++-------- include/linux/compat.h | 13 +++++++++++++ 3 files changed, 43 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index f4b5795d7e34..3f556c6a0157 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -24,6 +24,8 @@ # include # define __ARCH_WANT_COMPAT_SYS_TIME # define __ARCH_WANT_COMPAT_SYS_GETDENTS64 +# define __ARCH_WANT_COMPAT_SYS_PREADV64 +# define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # endif diff --git a/fs/read_write.c b/fs/read_write.c index edc5746a902a..72c09b4a01a4 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -982,9 +982,9 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, return ret; } -COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, - const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos) +static long __compat_sys_preadv64(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos) { struct fd f; ssize_t ret; @@ -1001,12 +1001,22 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, return ret; } +#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 +COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + return __compat_sys_preadv64(fd, vec, vlen, pos); +} +#endif + COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_preadv64(fd, vec, vlen, pos); + + return __compat_sys_preadv64(fd, vec, vlen, pos); } static size_t compat_writev(struct file *file, @@ -1049,9 +1059,9 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, return ret; } -COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, - const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos) +static long __compat_sys_pwritev64(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos) { struct fd f; ssize_t ret; @@ -1068,12 +1078,22 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, return ret; } +#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 +COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + return __compat_sys_pwritev64(fd, vec, vlen, pos); +} +#endif + COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_pwritev64(fd, vec, vlen, pos); + + return __compat_sys_pwritev64(fd, vec, vlen, pos); } #endif diff --git a/include/linux/compat.h b/include/linux/compat.h index 179b1da9e19f..1c457428ec0a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -340,6 +340,19 @@ asmlinkage ssize_t compat_sys_preadv(compat_ulong_t fd, asmlinkage ssize_t compat_sys_pwritev(compat_ulong_t fd, const struct compat_iovec __user *vec, compat_ulong_t vlen, u32 pos_low, u32 pos_high); + +#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 +asmlinkage long compat_sys_preadv64(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos); +#endif + +#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 +asmlinkage long compat_sys_pwritev64(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long 
vlen, loff_t pos); +#endif + asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int); asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, -- cgit v1.2.3 From 932602e238329da99f8482c1b721549531fbfe7f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Mar 2014 16:07:52 +0100 Subject: fs/compat: convert to COMPAT_SYSCALL_DEFINE with changing parameter types Some fs compat system calls have unsigned long parameters instead of compat_ulong_t. In order to allow the COMPAT_SYSCALL_DEFINE macro generate code that performs proper zero and sign extension convert all 64 bit parameters their corresponding 32 bit counterparts. compat_sys_io_getevents() is a bit different: the non-compat version has signed parameters for the "min_nr" and "nr" parameters while the compat version has unsigned parameters. So change this as well. For all practical purposes this shouldn't make any difference (doesn't fix a real bug). Also introduce a generic compat_aio_context_t type which can be used everywhere. The access_ok() check within compat_sys_io_getevents() got also removed since the non-compat sys_io_getevents() should be able to handle everything anyway. Signed-off-by: Heiko Carstens --- arch/s390/include/asm/compat.h | 1 - fs/compat.c | 44 +++++++++++++++++------------------------- fs/compat_ioctl.c | 5 +++-- include/linux/compat.h | 18 +++++++++-------- 4 files changed, 31 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 1174ea2b5e7c..5d7e8cf83bd6 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -65,7 +65,6 @@ typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; typedef s32 compat_key_t; typedef s32 compat_timer_t; -typedef u32 compat_aio_context_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/fs/compat.c b/fs/compat.c index 6d8312b7a51d..19252b97f0cc 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -399,8 +399,8 @@ static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *u } #endif -asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, - unsigned long arg) +COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) { mm_segment_t old_fs; struct flock f; @@ -468,8 +468,8 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, return ret; } -asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, - unsigned long arg) +COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) { if ((cmd == F_GETLK64) || (cmd == F_SETLK64) || (cmd == F_SETLKW64)) return -EINVAL; @@ -495,32 +495,24 @@ COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p) return ret; } -asmlinkage long -compat_sys_io_getevents(aio_context_t ctx_id, - unsigned long min_nr, - unsigned long nr, - struct io_event __user *events, - struct compat_timespec __user *timeout) +COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout) { - long ret; struct timespec t; struct timespec __user *ut = NULL; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_WRITE, events, - nr * sizeof(struct io_event)))) - goto out; if (timeout) { if (get_compat_timespec(&t, timeout)) - goto out; + return -EFAULT; ut = compat_alloc_user_space(sizeof(*ut)); if 
(copy_to_user(ut, &t, sizeof(t)) ) - goto out; + return -EFAULT; } - ret = sys_io_getevents(ctx_id, min_nr, nr, events, ut); -out: - return ret; + return sys_io_getevents(ctx_id, min_nr, nr, events, ut); } /* A write operation does a read from user space and vice versa */ @@ -616,8 +608,8 @@ copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) #define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) -asmlinkage long -compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb) +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, u32 __user *, iocb) { struct iocb __user * __user *iocb64; long ret; @@ -769,10 +761,10 @@ static int do_nfs4_super_data_conv(void *raw_data) #define NCPFS_NAME "ncpfs" #define NFS4_NAME "nfs4" -asmlinkage long compat_sys_mount(const char __user * dev_name, - const char __user * dir_name, - const char __user * type, unsigned long flags, - const void __user * data) +COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, + const char __user *, dir_name, + const char __user *, type, compat_ulong_t, flags, + const void __user *, data) { char *kernel_type; unsigned long data_page; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 3881610b6438..e82289047272 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1538,9 +1538,10 @@ static int compat_ioctl_check_table(unsigned int xcmd) return ioctl_pointer[i] == xcmd; } -asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, - unsigned long arg) +COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg32) { + unsigned long arg = arg32; struct fd f = fdget(fd); int error = -EBADF; if (!f.file) diff --git a/include/linux/compat.h b/include/linux/compat.h index 1c457428ec0a..fea8ee9afe22 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -71,6 +71,8 @@ typedef struct compat_sigaltstack { typedef __compat_uid32_t compat_uid_t; typedef __compat_gid32_t compat_gid_t; +typedef compat_ulong_t compat_aio_context_t; + struct compat_sel_arg_struct; struct rusage; @@ -497,20 +499,20 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf); asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, - unsigned long arg); + compat_ulong_t arg); asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, - unsigned long arg); + compat_ulong_t arg); asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p); -asmlinkage long compat_sys_io_getevents(aio_context_t ctx_id, - unsigned long min_nr, - unsigned long nr, +asmlinkage long compat_sys_io_getevents(compat_aio_context_t ctx_id, + compat_long_t min_nr, + compat_long_t nr, struct io_event __user *events, struct compat_timespec __user *timeout); -asmlinkage long compat_sys_io_submit(aio_context_t ctx_id, int nr, +asmlinkage long compat_sys_io_submit(compat_aio_context_t ctx_id, int nr, u32 __user *iocb); asmlinkage long compat_sys_mount(const char __user *dev_name, const char __user *dir_name, - const char __user *type, unsigned long flags, + const char __user *type, compat_ulong_t flags, const void __user *data); asmlinkage long compat_sys_old_readdir(unsigned int fd, struct compat_old_linux_dirent __user *, @@ -633,7 +635,7 @@ asmlinkage long compat_sys_rt_sigqueueinfo(compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); asmlinkage long compat_sys_sysinfo(struct compat_sysinfo __user *info); 
asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, - unsigned long arg); + compat_ulong_t arg); asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, struct compat_timespec __user *utime, u32 __user *uaddr2, u32 val3); -- cgit v1.2.3 From 8eee9093cdbeb2aa89d67dc1a3fd118acabaea52 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Mar 2014 16:19:16 +0100 Subject: ipc/compat: convert to COMPAT_SYSCALL_DEFINE with changing parameter types In order to allow the COMPAT_SYSCALL_DEFINE macro generate code that performs proper zero and sign extension convert all 64 bit parameters to their corresponding 32 bit compat counterparts. Signed-off-by: Heiko Carstens --- include/linux/compat.h | 4 ++-- ipc/compat_mq.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index fea8ee9afe22..f670e591aff8 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -655,11 +655,11 @@ asmlinkage long compat_sys_mq_open(const char __user *u_name, struct compat_mq_attr __user *u_attr); asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, - size_t msg_len, unsigned int msg_prio, + compat_size_t msg_len, unsigned int msg_prio, const struct compat_timespec __user *u_abs_timeout); asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, - size_t msg_len, unsigned int __user *u_msg_prio, + compat_size_t msg_len, unsigned int __user *u_msg_prio, const struct compat_timespec __user *u_abs_timeout); asmlinkage long compat_sys_socketcall(int call, u32 __user *args); asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args); diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c index af087fb40bf7..d58747293772 100644 --- a/ipc/compat_mq.c +++ b/ipc/compat_mq.c @@ -78,10 +78,10 @@ static int compat_prepare_timeout(struct timespec __user **p, return 0; } -asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, - const char __user *u_msg_ptr, - size_t msg_len, unsigned int msg_prio, - const struct compat_timespec __user *u_abs_timeout) +COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, + const char __user *, u_msg_ptr, + compat_size_t, msg_len, unsigned int, msg_prio, + const struct compat_timespec __user *, u_abs_timeout) { struct timespec __user *u_ts; @@ -92,10 +92,10 @@ asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, msg_prio, u_ts); } -asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes, - char __user *u_msg_ptr, - size_t msg_len, unsigned int __user *u_msg_prio, - const struct compat_timespec __user *u_abs_timeout) +COMPAT_SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, + char __user *, u_msg_ptr, + compat_size_t, msg_len, unsigned int __user *, u_msg_prio, + const struct compat_timespec __user *, u_abs_timeout) { struct timespec __user *u_ts; if (compat_prepare_timeout(&u_ts, u_abs_timeout)) -- cgit v1.2.3 From 3a49a0f7181c243aa04e6c5e44ca70a90ead8f9a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Mar 2014 17:09:57 +0100 Subject: net/compat: convert to COMPAT_SYSCALL_DEFINE with changing parameter types In order to allow the COMPAT_SYSCALL_DEFINE macro generate code that performs proper zero and sign extension convert all 64 bit parameters to their corresponding 32 bit compat counterparts. 
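
For the recv()/recvfrom() buffer length converted below, it is the zero-extension half of that statement that matters: a 32-bit caller only sets the low half of the argument register, and declaring the parameter as compat_size_t is what guarantees the stale upper half is discarded. A standalone model, assuming a 64-bit build and approximating compat_size_t with uint32_t:

    #include <stdint.h>
    #include <stdio.h>

    /* what a 64-bit register may hold when 32-bit userspace set only its low half */
    static const uint64_t reg = 0xdeadbeef00000080ULL;

    static void len_as_size_t(size_t len)   { printf("size_t        len = %#zx\n", len); }
    static void len_as_compat(uint32_t len) { printf("compat_size_t len = %#x\n",  len); }

    int main(void)
    {
            len_as_size_t(reg);             /* stale upper bits leak into the length */
            len_as_compat((uint32_t)reg);   /* forced through the 32-bit compat type */
            return 0;
    }
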
Signed-off-by: Heiko Carstens --- include/linux/compat.h | 4 ++-- net/compat.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index f670e591aff8..8e636211f334 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -569,9 +569,9 @@ asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags); asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags); -asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, +asmlinkage long compat_sys_recv(int fd, void __user *buf, compat_size_t len, unsigned flags); -asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len, +asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, compat_size_t len, unsigned flags, struct sockaddr __user *addr, int __user *addrlen); asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, diff --git a/net/compat.c b/net/compat.c index 706b78be9a23..9a76eaf63184 100644 --- a/net/compat.c +++ b/net/compat.c @@ -758,14 +758,14 @@ COMPAT_SYSCALL_DEFINE3(recvmsg, int, fd, struct compat_msghdr __user *, msg, uns return __sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } -asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, unsigned int flags) +COMPAT_SYSCALL_DEFINE4(recv, int, fd, void __user *, buf, compat_size_t, len, unsigned int, flags) { return sys_recv(fd, buf, len, flags | MSG_CMSG_COMPAT); } -asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len, - unsigned int flags, struct sockaddr __user *addr, - int __user *addrlen) +COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len, + unsigned int, flags, struct sockaddr __user *, addr, + int __user *, addrlen) { return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen); } -- cgit v1.2.3 From ca2c405ab90591dcb1bc3765467cbdf2b99a0f6a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Mar 2014 17:13:42 +0100 Subject: kexec/compat: convert to COMPAT_SYSCALL_DEFINE with changing parameter types In order to allow the COMPAT_SYSCALL_DEFINE macro generate code that performs proper zero and sign extension convert all 64 bit parameters to their corresponding 32 bit compat counterparts. 
Signed-off-by: Heiko Carstens --- include/linux/compat.h | 6 +++--- include/linux/kexec.h | 6 ------ kernel/kexec.c | 8 ++++---- 3 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 8e636211f334..ef4834c5bcab 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -641,10 +641,10 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, u32 val3); asmlinkage long compat_sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen); -asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, +asmlinkage long compat_sys_kexec_load(compat_ulong_t entry, + compat_ulong_t nr_segments, struct compat_kexec_segment __user *, - unsigned long flags); + compat_ulong_t flags); asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes, const struct compat_mq_attr __user *u_mqstat, struct compat_mq_attr __user *u_omqstat); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 6d4066cdb5b5..a75641930049 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -127,12 +127,6 @@ extern asmlinkage long sys_kexec_load(unsigned long entry, struct kexec_segment __user *segments, unsigned long flags); extern int kernel_kexec(void); -#ifdef CONFIG_COMPAT -extern asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, - struct compat_kexec_segment __user *segments, - unsigned long flags); -#endif extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); extern void crash_kexec(struct pt_regs *); diff --git a/kernel/kexec.c b/kernel/kexec.c index 60bafbed06ab..45601cf41bee 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1039,10 +1039,10 @@ void __weak crash_unmap_reserved_pages(void) {} #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, - struct compat_kexec_segment __user *segments, - unsigned long flags) +COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, + compat_ulong_t, nr_segments, + struct compat_kexec_segment __user *, segments, + compat_ulong_t, flags) { struct compat_kexec_segment in; struct kexec_segment out, __user *ksegments; -- cgit v1.2.3 From 2f2728f6de9837abe4b354443a45be578fbbf942 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Mar 2014 17:18:23 +0100 Subject: mm/compat: convert to COMPAT_SYSCALL_DEFINE with changing parameter types In order to allow the COMPAT_SYSCALL_DEFINE macro generate code that performs proper zero and sign extension convert all 64 bit parameters to their corresponding 32 bit compat counterparts. 
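
What this group of conversions relies on can be modelled in a few lines: the handler's parameters are declared with 32-bit compat types, and the wrapper in front of it forces each raw register value through those types before the handler runs. The program below is a hand-written userspace approximation of that effect only, assuming a 64-bit build; the real COMPAT_SYSCALL_DEFINE machinery is more involved and architecture-dependent, and all names here are made up.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t compat_ulong_model_t;   /* stand-in for compat_ulong_t */

    /* the "real" handler only ever sees properly narrowed 32-bit values */
    static long do_load_model(compat_ulong_model_t entry,
                              compat_ulong_model_t nr_segments,
                              compat_ulong_model_t flags)
    {
            printf("entry=%#x nr_segments=%u flags=%#x\n", entry, nr_segments, flags);
            return 0;
    }

    /* rough shape of the generated wrapper: raw register values are pushed
     * through the declared compat types, so stale upper halves cannot leak */
    static long compat_sys_load_model(unsigned long entry, unsigned long nr_segments,
                                      unsigned long flags)
    {
            return do_load_model((compat_ulong_model_t)entry,
                                 (compat_ulong_model_t)nr_segments,
                                 (compat_ulong_model_t)flags);
    }

    int main(void)
    {
            /* upper halves deliberately polluted, as stale register contents might be */
            return (int)compat_sys_load_model(0xabcdef0000100000UL,
                                              0xffffffff00000004UL,
                                              0x1111111100000001UL);
    }
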
Signed-off-by: Heiko Carstens --- include/linux/compat.h | 10 +++++----- kernel/compat.c | 10 +++++----- mm/process_vm_access.c | 26 ++++++++++++-------------- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index ef4834c5bcab..7c765624b7ef 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -469,7 +469,7 @@ asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, asmlinkage long compat_sys_timerfd_gettime(int ufd, struct compat_itimerspec __user *otmr); -asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, +asmlinkage long compat_sys_move_pages(pid_t pid, compat_ulong_t nr_pages, __u32 __user *pages, const int __user *nodes, int __user *status, @@ -674,12 +674,12 @@ extern void __user *compat_alloc_user_space(unsigned long len); asmlinkage ssize_t compat_sys_process_vm_readv(compat_pid_t pid, const struct compat_iovec __user *lvec, - unsigned long liovcnt, const struct compat_iovec __user *rvec, - unsigned long riovcnt, unsigned long flags); + compat_ulong_t liovcnt, const struct compat_iovec __user *rvec, + compat_ulong_t riovcnt, compat_ulong_t flags); asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid, const struct compat_iovec __user *lvec, - unsigned long liovcnt, const struct compat_iovec __user *rvec, - unsigned long riovcnt, unsigned long flags); + compat_ulong_t liovcnt, const struct compat_iovec __user *rvec, + compat_ulong_t riovcnt, compat_ulong_t flags); asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, compat_size_t count); diff --git a/kernel/compat.c b/kernel/compat.c index 2622011a44c9..488ff8c4cf48 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1065,11 +1065,11 @@ COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) } #ifdef CONFIG_NUMA -asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, - compat_uptr_t __user *pages32, - const int __user *nodes, - int __user *status, - int flags) +COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, + compat_uptr_t __user *, pages32, + const int __user *, nodes, + int __user *, status, + int, flags) { const void __user * __user *pages; int i; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index fd26d0433509..3c5cf68566ec 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -456,25 +456,23 @@ free_iovecs: return rc; } -asmlinkage ssize_t -compat_sys_process_vm_readv(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); } -asmlinkage ssize_t -compat_sys_process_vm_writev(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); -- 
cgit v1.2.3 From 70044d71d31d6973665ced5be04ef39ac1c09a48 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 7 Mar 2014 10:19:57 -0500 Subject: firewire: don't use PREPARE_DELAYED_WORK PREPARE_[DELAYED_]WORK() are being phased out. They have few users and a nasty surprise in terms of reentrancy guarantee as workqueue considers work items to be different if they don't have the same work function. firewire core-device and sbp2 have been been multiplexing work items with multiple work functions. Introduce fw_device_workfn() and sbp2_lu_workfn() which invoke fw_device->workfn and sbp2_logical_unit->workfn respectively and always use the two functions as the work functions and update the users to set the ->workfn fields instead of overriding work functions using PREPARE_DELAYED_WORK(). This fixes a variety of possible regressions since a2c1c57be8d9 "workqueue: consider work function when searching for busy work items" due to which fw_workqueue lost its required non-reentrancy property. Signed-off-by: Tejun Heo Acked-by: Stefan Richter Cc: linux1394-devel@lists.sourceforge.net Cc: stable@vger.kernel.org # v3.9+ Cc: stable@vger.kernel.org # v3.8.2+ Cc: stable@vger.kernel.org # v3.4.60+ Cc: stable@vger.kernel.org # v3.2.40+ --- drivers/firewire/core-device.c | 22 +++++++++++++++------- drivers/firewire/sbp2.c | 17 +++++++++++++---- include/linux/firewire.h | 1 + 3 files changed, 29 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index de4aa409abe2..2c6d5e118ac1 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -916,7 +916,7 @@ static int lookup_existing_device(struct device *dev, void *data) old->config_rom_retries = 0; fw_notice(card, "rediscovered device %s\n", dev_name(dev)); - PREPARE_DELAYED_WORK(&old->work, fw_device_update); + old->workfn = fw_device_update; fw_schedule_device_work(old, 0); if (current_node == card->root_node) @@ -1075,7 +1075,7 @@ static void fw_device_init(struct work_struct *work) if (atomic_cmpxchg(&device->state, FW_DEVICE_INITIALIZING, FW_DEVICE_RUNNING) == FW_DEVICE_GONE) { - PREPARE_DELAYED_WORK(&device->work, fw_device_shutdown); + device->workfn = fw_device_shutdown; fw_schedule_device_work(device, SHUTDOWN_DELAY); } else { fw_notice(card, "created device %s: GUID %08x%08x, S%d00\n", @@ -1196,13 +1196,20 @@ static void fw_device_refresh(struct work_struct *work) dev_name(&device->device), fw_rcode_string(ret)); gone: atomic_set(&device->state, FW_DEVICE_GONE); - PREPARE_DELAYED_WORK(&device->work, fw_device_shutdown); + device->workfn = fw_device_shutdown; fw_schedule_device_work(device, SHUTDOWN_DELAY); out: if (node_id == card->root_node->node_id) fw_schedule_bm_work(card, 0); } +static void fw_device_workfn(struct work_struct *work) +{ + struct fw_device *device = container_of(to_delayed_work(work), + struct fw_device, work); + device->workfn(work); +} + void fw_node_event(struct fw_card *card, struct fw_node *node, int event) { struct fw_device *device; @@ -1252,7 +1259,8 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) * power-up after getting plugged in. We schedule the * first config rom scan half a second after bus reset. 
*/ - INIT_DELAYED_WORK(&device->work, fw_device_init); + device->workfn = fw_device_init; + INIT_DELAYED_WORK(&device->work, fw_device_workfn); fw_schedule_device_work(device, INITIAL_DELAY); break; @@ -1268,7 +1276,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) if (atomic_cmpxchg(&device->state, FW_DEVICE_RUNNING, FW_DEVICE_INITIALIZING) == FW_DEVICE_RUNNING) { - PREPARE_DELAYED_WORK(&device->work, fw_device_refresh); + device->workfn = fw_device_refresh; fw_schedule_device_work(device, device->is_local ? 0 : INITIAL_DELAY); } @@ -1283,7 +1291,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) smp_wmb(); /* update node_id before generation */ device->generation = card->generation; if (atomic_read(&device->state) == FW_DEVICE_RUNNING) { - PREPARE_DELAYED_WORK(&device->work, fw_device_update); + device->workfn = fw_device_update; fw_schedule_device_work(device, 0); } break; @@ -1308,7 +1316,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) device = node->data; if (atomic_xchg(&device->state, FW_DEVICE_GONE) == FW_DEVICE_RUNNING) { - PREPARE_DELAYED_WORK(&device->work, fw_device_shutdown); + device->workfn = fw_device_shutdown; fw_schedule_device_work(device, list_empty(&card->link) ? 0 : SHUTDOWN_DELAY); } diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c index 281029daf98c..7aef911fdc71 100644 --- a/drivers/firewire/sbp2.c +++ b/drivers/firewire/sbp2.c @@ -146,6 +146,7 @@ struct sbp2_logical_unit { */ int generation; int retries; + work_func_t workfn; struct delayed_work work; bool has_sdev; bool blocked; @@ -864,7 +865,7 @@ static void sbp2_login(struct work_struct *work) /* set appropriate retry limit(s) in BUSY_TIMEOUT register */ sbp2_set_busy_timeout(lu); - PREPARE_DELAYED_WORK(&lu->work, sbp2_reconnect); + lu->workfn = sbp2_reconnect; sbp2_agent_reset(lu); /* This was a re-login. */ @@ -918,7 +919,7 @@ static void sbp2_login(struct work_struct *work) * If a bus reset happened, sbp2_update will have requeued * lu->work already. Reset the work from reconnect to login. 
*/ - PREPARE_DELAYED_WORK(&lu->work, sbp2_login); + lu->workfn = sbp2_login; } static void sbp2_reconnect(struct work_struct *work) @@ -952,7 +953,7 @@ static void sbp2_reconnect(struct work_struct *work) lu->retries++ >= 5) { dev_err(tgt_dev(tgt), "failed to reconnect\n"); lu->retries = 0; - PREPARE_DELAYED_WORK(&lu->work, sbp2_login); + lu->workfn = sbp2_login; } sbp2_queue_work(lu, DIV_ROUND_UP(HZ, 5)); @@ -972,6 +973,13 @@ static void sbp2_reconnect(struct work_struct *work) sbp2_conditionally_unblock(lu); } +static void sbp2_lu_workfn(struct work_struct *work) +{ + struct sbp2_logical_unit *lu = container_of(to_delayed_work(work), + struct sbp2_logical_unit, work); + lu->workfn(work); +} + static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry) { struct sbp2_logical_unit *lu; @@ -998,7 +1006,8 @@ static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry) lu->blocked = false; ++tgt->dont_block; INIT_LIST_HEAD(&lu->orb_list); - INIT_DELAYED_WORK(&lu->work, sbp2_login); + lu->workfn = sbp2_login; + INIT_DELAYED_WORK(&lu->work, sbp2_lu_workfn); list_add_tail(&lu->link, &tgt->lu_list); return 0; diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 5d7782e42b8f..c3683bdf28fe 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -200,6 +200,7 @@ struct fw_device { unsigned irmc:1; unsigned bc_implemented:2; + work_func_t workfn; struct delayed_work work; struct fw_attribute_group attribute_group; }; -- cgit v1.2.3 From 9ca9737444f1a8602f74b85018d881e7e54b5bd1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 7 Mar 2014 10:24:49 -0500 Subject: nvme: don't use PREPARE_WORK PREPARE_[DELAYED_]WORK() are being phased out. They have few users and a nasty surprise in terms of reentrancy guarantee as workqueue considers work items to be different if they don't have the same work function. nvme_dev->reset_work is multiplexed with multiple work functions. Introduce nvme_reset_workfn() which invokes nvme_dev->reset_workfn and always use it as the work function and update the users to set the ->reset_workfn field instead of overriding the work function using PREPARE_WORK(). It would probably be best to route this with other related updates through the workqueue tree. Compile tested. 
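The recipe is the same one used for firewire above and generalizes to any driver that still multiplexes one work item across several handlers: register a single, fixed work function with the workqueue and have it dispatch through a function pointer stored next to the work item, so the workqueue's non-reentrancy check always sees the same function. A generic sketch of the pattern (the my_* names are illustrative, not taken from the patch):

struct my_dev {
	work_func_t workfn;		/* handler to run on the next execution */
	struct work_struct work;	/* always initialized with my_dev_workfn */
};

static void my_dev_workfn(struct work_struct *work)
{
	struct my_dev *dev = container_of(work, struct my_dev, work);

	dev->workfn(work);		/* dispatch to the currently selected handler */
}

/* setup:    INIT_WORK(&dev->work, my_dev_workfn); dev->workfn = my_default_handler; */
/* re-queue: dev->workfn = my_other_handler; queue_work(my_wq, &dev->work);          */
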
Signed-off-by: Tejun Heo Cc: Matthew Wilcox Cc: linux-nvme@lists.infradead.org --- drivers/block/nvme-core.c | 18 ++++++++++++------ include/linux/nvme.h | 1 + 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 51824d1f23ea..8459e4e7c719 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -993,7 +993,7 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) dev_warn(&dev->pci_dev->dev, "I/O %d QID %d timeout, reset controller\n", cmdid, nvmeq->qid); - PREPARE_WORK(&dev->reset_work, nvme_reset_failed_dev); + dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); return; } @@ -1696,8 +1696,7 @@ static int nvme_kthread(void *data) list_del_init(&dev->node); dev_warn(&dev->pci_dev->dev, "Failed status, reset controller\n"); - PREPARE_WORK(&dev->reset_work, - nvme_reset_failed_dev); + dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); continue; } @@ -2406,7 +2405,7 @@ static int nvme_dev_resume(struct nvme_dev *dev) return ret; if (ret == -EBUSY) { spin_lock(&dev_list_lock); - PREPARE_WORK(&dev->reset_work, nvme_remove_disks); + dev->reset_workfn = nvme_remove_disks; queue_work(nvme_workq, &dev->reset_work); spin_unlock(&dev_list_lock); } @@ -2435,6 +2434,12 @@ static void nvme_reset_failed_dev(struct work_struct *ws) nvme_dev_reset(dev); } +static void nvme_reset_workfn(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + dev->reset_workfn(work); +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int result = -ENOMEM; @@ -2453,7 +2458,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto free; INIT_LIST_HEAD(&dev->namespaces); - INIT_WORK(&dev->reset_work, nvme_reset_failed_dev); + dev->reset_workfn = nvme_reset_failed_dev; + INIT_WORK(&dev->reset_work, nvme_reset_workfn); dev->pci_dev = pdev; pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); @@ -2553,7 +2559,7 @@ static int nvme_resume(struct device *dev) struct nvme_dev *ndev = pci_get_drvdata(pdev); if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) { - PREPARE_WORK(&ndev->reset_work, nvme_reset_failed_dev); + ndev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &ndev->reset_work); } return 0; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 69ae03f6eb15..6b9aafed225f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -87,6 +87,7 @@ struct nvme_dev { struct list_head namespaces; struct kref kref; struct miscdevice miscdev; + work_func_t reset_workfn; struct work_struct reset_work; char name[12]; char serial[20]; -- cgit v1.2.3 From f073f9229ff1137d3be20558bec3bfb77e3af2a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 7 Mar 2014 10:24:50 -0500 Subject: workqueue: remove PREPARE_[DELAYED_]WORK() Peter Hurley noticed that since a2c1c57be8d9 ("workqueue: consider work function when searching for busy work items"), a work item which gets assigned a different work function would break out of the non-reentrancy guarantee as workqueue would consider it a different work item. This is fragile and extremely subtle. PREPARE_[DELAYED_]WORK() have never been used widely and its semantics has always been somewhat iffy. If the work item is known not to be on queue when PREPARE_WORK() is called, there's no difference from using INIT_WORK(). 
If the work item may be queued at the time of PREPARE_WORK(), we can't really tell whether the old or new function will be executed the next time. We really don't want this level of subtlety in workqueue interface for such marginal use cases. The previous patches converted all existing users away from PREPARE_[DELAYED_]WORK(). Let's remove them. Signed-off-by: Tejun Heo Cc: Peter Hurley Link: http://lkml.kernel.org/g/1392493119-9277-1-git-send-email-peter@hurleysoftware.com --- include/linux/workqueue.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 8059334a6b02..29da9e77c3bb 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -177,17 +177,6 @@ struct execute_work { #define DECLARE_DEFERRABLE_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE) -/* - * initialize a work item's function pointer - */ -#define PREPARE_WORK(_work, _func) \ - do { \ - (_work)->func = (_func); \ - } while (0) - -#define PREPARE_DELAYED_WORK(_work, _func) \ - PREPARE_WORK(&(_work)->work, (_func)) - #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); @@ -217,7 +206,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ lockdep_init_map(&(_work)->lockdep_map, #_work, &__key, 0); \ INIT_LIST_HEAD(&(_work)->entry); \ - PREPARE_WORK((_work), (_func)); \ + (_work)->func = (_func); \ } while (0) #else #define __INIT_WORK(_work, _func, _onstack) \ @@ -225,7 +214,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ - PREPARE_WORK((_work), (_func)); \ + (_work)->func = (_func); \ } while (0) #endif -- cgit v1.2.3 From 52a4c6404f91f2d2c5592ee6365a8418c4565f53 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 7 Mar 2014 12:44:19 +0100 Subject: selinux: add gfp argument to security_xfrm_policy_alloc and fix callers security_xfrm_policy_alloc can be called in atomic context so the allocation should be done with GFP_ATOMIC. Add an argument to let the callers choose the appropriate way. In order to do so a gfp argument needs to be added to the method xfrm_policy_alloc_security in struct security_operations and to the internal function selinux_xfrm_alloc_user. After that switch to GFP_ATOMIC in the atomic callers and leave GFP_KERNEL as before for the rest. The path that needed the gfp argument addition is: security_xfrm_policy_alloc -> security_ops.xfrm_policy_alloc_security -> all users of xfrm_policy_alloc_security (e.g. selinux_xfrm_policy_alloc) -> selinux_xfrm_alloc_user (here the allocation used to be GFP_KERNEL only) Now adding a gfp argument to selinux_xfrm_alloc_user requires us to also add it to security_context_to_sid which is used inside and prior to this patch did only GFP_KERNEL allocation. So add gfp argument to security_context_to_sid and adjust all of its callers as well. CC: Paul Moore CC: Dave Jones CC: Steffen Klassert CC: Fan Du CC: David S. 
Miller CC: LSM list CC: SELinux list Signed-off-by: Nikolay Aleksandrov Acked-by: Paul Moore Signed-off-by: Steffen Klassert --- include/linux/security.h | 10 +++++++--- net/key/af_key.c | 6 +++--- net/xfrm/xfrm_user.c | 6 +++--- security/capability.c | 3 ++- security/security.c | 6 ++++-- security/selinux/hooks.c | 13 +++++++------ security/selinux/include/security.h | 2 +- security/selinux/include/xfrm.h | 3 ++- security/selinux/selinuxfs.c | 28 ++++++++++++++++++---------- security/selinux/ss/services.c | 6 ++++-- security/selinux/xfrm.c | 14 ++++++++------ 11 files changed, 59 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 5623a7f965b7..2fc42d191f79 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1040,6 +1040,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Allocate a security structure to the xp->security field; the security * field is initialized to NULL when the xfrm_policy is allocated. * Return 0 if operation was successful (memory to allocate, legal context) + * @gfp is to specify the context for the allocation * @xfrm_policy_clone_security: * @old_ctx contains an existing xfrm_sec_ctx. * @new_ctxp contains a new xfrm_sec_ctx being cloned from old. @@ -1683,7 +1684,7 @@ struct security_operations { #ifdef CONFIG_SECURITY_NETWORK_XFRM int (*xfrm_policy_alloc_security) (struct xfrm_sec_ctx **ctxp, - struct xfrm_user_sec_ctx *sec_ctx); + struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp); int (*xfrm_policy_clone_security) (struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctx); void (*xfrm_policy_free_security) (struct xfrm_sec_ctx *ctx); int (*xfrm_policy_delete_security) (struct xfrm_sec_ctx *ctx); @@ -2859,7 +2860,8 @@ static inline void security_skb_owned_by(struct sk_buff *skb, struct sock *sk) #ifdef CONFIG_SECURITY_NETWORK_XFRM -int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx); +int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, + struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp); int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctxp); void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx); int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx); @@ -2877,7 +2879,9 @@ void security_skb_classify_flow(struct sk_buff *skb, struct flowi *fl); #else /* CONFIG_SECURITY_NETWORK_XFRM */ -static inline int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx) +static inline int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, + struct xfrm_user_sec_ctx *sec_ctx, + gfp_t gfp) { return 0; } diff --git a/net/key/af_key.c b/net/key/af_key.c index 1526023f99ed..79326978517a 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2239,7 +2239,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_ goto out; } - err = security_xfrm_policy_alloc(&xp->security, uctx); + err = security_xfrm_policy_alloc(&xp->security, uctx, GFP_KERNEL); kfree(uctx); if (err) @@ -2341,7 +2341,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa if (!uctx) return -ENOMEM; - err = security_xfrm_policy_alloc(&pol_ctx, uctx); + err = security_xfrm_policy_alloc(&pol_ctx, uctx, GFP_KERNEL); kfree(uctx); if (err) return err; @@ -3241,7 +3241,7 @@ static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, if ((*dir = verify_sec_ctx_len(p))) goto out; uctx = 
pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_ATOMIC); - *dir = security_xfrm_policy_alloc(&xp->security, uctx); + *dir = security_xfrm_policy_alloc(&xp->security, uctx, GFP_ATOMIC); kfree(uctx); if (*dir) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c274179d60a2..2f7ddc3a59b4 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1221,7 +1221,7 @@ static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct nlattr **attrs return 0; uctx = nla_data(rt); - return security_xfrm_policy_alloc(&pol->security, uctx); + return security_xfrm_policy_alloc(&pol->security, uctx, GFP_KERNEL); } static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, @@ -1626,7 +1626,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (rt) { struct xfrm_user_sec_ctx *uctx = nla_data(rt); - err = security_xfrm_policy_alloc(&ctx, uctx); + err = security_xfrm_policy_alloc(&ctx, uctx, GFP_KERNEL); if (err) return err; } @@ -1928,7 +1928,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, if (rt) { struct xfrm_user_sec_ctx *uctx = nla_data(rt); - err = security_xfrm_policy_alloc(&ctx, uctx); + err = security_xfrm_policy_alloc(&ctx, uctx, GFP_KERNEL); if (err) return err; } diff --git a/security/capability.c b/security/capability.c index 8b4f24ae4338..21e2b9cae685 100644 --- a/security/capability.c +++ b/security/capability.c @@ -757,7 +757,8 @@ static void cap_skb_owned_by(struct sk_buff *skb, struct sock *sk) #ifdef CONFIG_SECURITY_NETWORK_XFRM static int cap_xfrm_policy_alloc_security(struct xfrm_sec_ctx **ctxp, - struct xfrm_user_sec_ctx *sec_ctx) + struct xfrm_user_sec_ctx *sec_ctx, + gfp_t gfp) { return 0; } diff --git a/security/security.c b/security/security.c index 15b6928592ef..919cad93ac82 100644 --- a/security/security.c +++ b/security/security.c @@ -1317,9 +1317,11 @@ void security_skb_owned_by(struct sk_buff *skb, struct sock *sk) #ifdef CONFIG_SECURITY_NETWORK_XFRM -int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx) +int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, + struct xfrm_user_sec_ctx *sec_ctx, + gfp_t gfp) { - return security_ops->xfrm_policy_alloc_security(ctxp, sec_ctx); + return security_ops->xfrm_policy_alloc_security(ctxp, sec_ctx, gfp); } EXPORT_SYMBOL(security_xfrm_policy_alloc); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4b34847208cc..b332e2cc0954 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -668,7 +668,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, if (flags[i] == SBLABEL_MNT) continue; rc = security_context_to_sid(mount_options[i], - strlen(mount_options[i]), &sid); + strlen(mount_options[i]), &sid, GFP_KERNEL); if (rc) { printk(KERN_WARNING "SELinux: security_context_to_sid" "(%s) failed for (dev %s, type %s) errno=%d\n", @@ -2489,7 +2489,8 @@ static int selinux_sb_remount(struct super_block *sb, void *data) if (flags[i] == SBLABEL_MNT) continue; len = strlen(mount_options[i]); - rc = security_context_to_sid(mount_options[i], len, &sid); + rc = security_context_to_sid(mount_options[i], len, &sid, + GFP_KERNEL); if (rc) { printk(KERN_WARNING "SELinux: security_context_to_sid" "(%s) failed for (dev %s, type %s) errno=%d\n", @@ -2893,7 +2894,7 @@ static int selinux_inode_setxattr(struct dentry *dentry, const char *name, if (rc) return rc; - rc = security_context_to_sid(value, size, &newsid); + rc = security_context_to_sid(value, size, &newsid, GFP_KERNEL); if (rc == 
-EINVAL) { if (!capable(CAP_MAC_ADMIN)) { struct audit_buffer *ab; @@ -3050,7 +3051,7 @@ static int selinux_inode_setsecurity(struct inode *inode, const char *name, if (!value || !size) return -EACCES; - rc = security_context_to_sid((void *)value, size, &newsid); + rc = security_context_to_sid((void *)value, size, &newsid, GFP_KERNEL); if (rc) return rc; @@ -5529,7 +5530,7 @@ static int selinux_setprocattr(struct task_struct *p, str[size-1] = 0; size--; } - error = security_context_to_sid(value, size, &sid); + error = security_context_to_sid(value, size, &sid, GFP_KERNEL); if (error == -EINVAL && !strcmp(name, "fscreate")) { if (!capable(CAP_MAC_ADMIN)) { struct audit_buffer *ab; @@ -5638,7 +5639,7 @@ static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { - return security_context_to_sid(secdata, seclen, secid); + return security_context_to_sid(secdata, seclen, secid, GFP_KERNEL); } static void selinux_release_secctx(char *secdata, u32 seclen) diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 8ed8daf7f1ee..ce7852cf526b 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -134,7 +134,7 @@ int security_sid_to_context(u32 sid, char **scontext, int security_sid_to_context_force(u32 sid, char **scontext, u32 *scontext_len); int security_context_to_sid(const char *scontext, u32 scontext_len, - u32 *out_sid); + u32 *out_sid, gfp_t gfp); int security_context_to_sid_default(const char *scontext, u32 scontext_len, u32 *out_sid, u32 def_sid, gfp_t gfp_flags); diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 48c3cc94c168..9f0584710c85 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -10,7 +10,8 @@ #include int selinux_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, - struct xfrm_user_sec_ctx *uctx); + struct xfrm_user_sec_ctx *uctx, + gfp_t gfp); int selinux_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctxp); void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx); diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 5122affe06a8..d60c0ee66387 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -576,7 +576,7 @@ static ssize_t sel_write_context(struct file *file, char *buf, size_t size) if (length) goto out; - length = security_context_to_sid(buf, size, &sid); + length = security_context_to_sid(buf, size, &sid, GFP_KERNEL); if (length) goto out; @@ -731,11 +731,13 @@ static ssize_t sel_write_access(struct file *file, char *buf, size_t size) if (sscanf(buf, "%s %s %hu", scon, tcon, &tclass) != 3) goto out; - length = security_context_to_sid(scon, strlen(scon) + 1, &ssid); + length = security_context_to_sid(scon, strlen(scon) + 1, &ssid, + GFP_KERNEL); if (length) goto out; - length = security_context_to_sid(tcon, strlen(tcon) + 1, &tsid); + length = security_context_to_sid(tcon, strlen(tcon) + 1, &tsid, + GFP_KERNEL); if (length) goto out; @@ -817,11 +819,13 @@ static ssize_t sel_write_create(struct file *file, char *buf, size_t size) objname = namebuf; } - length = security_context_to_sid(scon, strlen(scon) + 1, &ssid); + length = security_context_to_sid(scon, strlen(scon) + 1, &ssid, + GFP_KERNEL); if (length) goto out; - length = security_context_to_sid(tcon, strlen(tcon) + 1, &tsid); + length = security_context_to_sid(tcon, strlen(tcon) + 1, &tsid, + GFP_KERNEL); if 
(length) goto out; @@ -878,11 +882,13 @@ static ssize_t sel_write_relabel(struct file *file, char *buf, size_t size) if (sscanf(buf, "%s %s %hu", scon, tcon, &tclass) != 3) goto out; - length = security_context_to_sid(scon, strlen(scon) + 1, &ssid); + length = security_context_to_sid(scon, strlen(scon) + 1, &ssid, + GFP_KERNEL); if (length) goto out; - length = security_context_to_sid(tcon, strlen(tcon) + 1, &tsid); + length = security_context_to_sid(tcon, strlen(tcon) + 1, &tsid, + GFP_KERNEL); if (length) goto out; @@ -934,7 +940,7 @@ static ssize_t sel_write_user(struct file *file, char *buf, size_t size) if (sscanf(buf, "%s %s", con, user) != 2) goto out; - length = security_context_to_sid(con, strlen(con) + 1, &sid); + length = security_context_to_sid(con, strlen(con) + 1, &sid, GFP_KERNEL); if (length) goto out; @@ -994,11 +1000,13 @@ static ssize_t sel_write_member(struct file *file, char *buf, size_t size) if (sscanf(buf, "%s %s %hu", scon, tcon, &tclass) != 3) goto out; - length = security_context_to_sid(scon, strlen(scon) + 1, &ssid); + length = security_context_to_sid(scon, strlen(scon) + 1, &ssid, + GFP_KERNEL); if (length) goto out; - length = security_context_to_sid(tcon, strlen(tcon) + 1, &tsid); + length = security_context_to_sid(tcon, strlen(tcon) + 1, &tsid, + GFP_KERNEL); if (length) goto out; diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 5d0144ee8ed6..4bca49414a40 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -1289,16 +1289,18 @@ out: * @scontext: security context * @scontext_len: length in bytes * @sid: security identifier, SID + * @gfp: context for the allocation * * Obtains a SID associated with the security context that * has the string representation specified by @scontext. * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient * memory is available, or 0 on success. */ -int security_context_to_sid(const char *scontext, u32 scontext_len, u32 *sid) +int security_context_to_sid(const char *scontext, u32 scontext_len, u32 *sid, + gfp_t gfp) { return security_context_to_sid_core(scontext, scontext_len, - sid, SECSID_NULL, GFP_KERNEL, 0); + sid, SECSID_NULL, gfp, 0); } /** diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 0462cb3ff0a7..98b042630a9e 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -78,7 +78,8 @@ static inline int selinux_authorizable_xfrm(struct xfrm_state *x) * xfrm_user_sec_ctx context. */ static int selinux_xfrm_alloc_user(struct xfrm_sec_ctx **ctxp, - struct xfrm_user_sec_ctx *uctx) + struct xfrm_user_sec_ctx *uctx, + gfp_t gfp) { int rc; const struct task_security_struct *tsec = current_security(); @@ -94,7 +95,7 @@ static int selinux_xfrm_alloc_user(struct xfrm_sec_ctx **ctxp, if (str_len >= PAGE_SIZE) return -ENOMEM; - ctx = kmalloc(sizeof(*ctx) + str_len + 1, GFP_KERNEL); + ctx = kmalloc(sizeof(*ctx) + str_len + 1, gfp); if (!ctx) return -ENOMEM; @@ -103,7 +104,7 @@ static int selinux_xfrm_alloc_user(struct xfrm_sec_ctx **ctxp, ctx->ctx_len = str_len; memcpy(ctx->ctx_str, &uctx[1], str_len); ctx->ctx_str[str_len] = '\0'; - rc = security_context_to_sid(ctx->ctx_str, str_len, &ctx->ctx_sid); + rc = security_context_to_sid(ctx->ctx_str, str_len, &ctx->ctx_sid, gfp); if (rc) goto err; @@ -282,9 +283,10 @@ int selinux_xfrm_skb_sid(struct sk_buff *skb, u32 *sid) * LSM hook implementation that allocs and transfers uctx spec to xfrm_policy. 
*/ int selinux_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, - struct xfrm_user_sec_ctx *uctx) + struct xfrm_user_sec_ctx *uctx, + gfp_t gfp) { - return selinux_xfrm_alloc_user(ctxp, uctx); + return selinux_xfrm_alloc_user(ctxp, uctx, gfp); } /* @@ -332,7 +334,7 @@ int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uctx) { - return selinux_xfrm_alloc_user(&x->security, uctx); + return selinux_xfrm_alloc_user(&x->security, uctx, GFP_KERNEL); } /* -- cgit v1.2.3 From 9c225f2655e36a470c4f58dbbc99244c5fc7f2d4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Mar 2014 09:36:58 -0800 Subject: vfs: atomic f_pos accesses as per POSIX Our write() system call has always been atomic in the sense that you get the expected thread-safe contiguous write, but we haven't actually guaranteed that concurrent writes are serialized wrt f_pos accesses, so threads (or processes) that share a file descriptor and use "write()" concurrently would quite likely overwrite each others data. This violates POSIX.1-2008/SUSv4 Section XSI 2.9.7 that says: "2.9.7 Thread Interactions with Regular File Operations All of the following functions shall be atomic with respect to each other in the effects specified in POSIX.1-2008 when they operate on regular files or symbolic links: [...]" and one of the effects is the file position update. This unprotected file position behavior is not new behavior, and nobody has ever cared. Until now. Yongzhi Pan reported unexpected behavior to Michael Kerrisk that was due to this. This resolves the issue with a f_pos-specific lock that is taken by read/write/lseek on file descriptors that may be shared across threads or processes. Reported-by: Yongzhi Pan Reported-by: Michael Kerrisk Cc: Al Viro Signed-off-by: Linus Torvalds Signed-off-by: Al Viro --- fs/file_table.c | 1 + fs/namei.c | 2 +- fs/open.c | 4 ++++ fs/read_write.c | 54 ++++++++++++++++++++++++++++++++++++++-------------- include/linux/file.h | 6 ++++-- include/linux/fs.h | 6 +++++- 6 files changed, 55 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/fs/file_table.c b/fs/file_table.c index 5fff9030be34..5b24008ea4f6 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -135,6 +135,7 @@ struct file *get_empty_filp(void) atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); + mutex_init(&f->f_pos_lock); eventpoll_init_file(f); /* f->f_version: 0 */ return f; diff --git a/fs/namei.c b/fs/namei.c index 385f7817bfcc..2f730ef9b4b3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1884,7 +1884,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { - if (f.need_put) + if (f.flags & FDPUT_FPUT) *fp = f.file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); rcu_read_lock(); diff --git a/fs/open.c b/fs/open.c index 4b3e1edf2fe4..b9ed8b25c108 100644 --- a/fs/open.c +++ b/fs/open.c @@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f, return 0; } + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ + if (S_ISREG(inode->i_mode)) + f->f_mode |= FMODE_ATOMIC_POS; + f->f_op = fops_get(inode->i_fop); if (unlikely(WARN_ON(!f->f_op))) { error = -ENODEV; diff --git a/fs/read_write.c b/fs/read_write.c index edc5746a902a..932bb3414a96 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(vfs_llseek); +/* + * We only lock f_pos if we 
have threads or if the file might be + * shared with another process. In both cases we'll have an elevated + * file count (done either by fdget() or by fork()). + */ +static inline struct fd fdget_pos(int fd) +{ + struct fd f = fdget(fd); + struct file *file = f.file; + + if (file && (file->f_mode & FMODE_ATOMIC_POS)) { + if (file_count(file) > 1) { + f.flags |= FDPUT_POS_UNLOCK; + mutex_lock(&file->f_pos_lock); + } + } + return f; +} + +static inline void fdput_pos(struct fd f) +{ + if (f.flags & FDPUT_POS_UNLOCK) + mutex_unlock(&f.file->f_pos_lock); + fdput(f); +} + SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { off_t retval; - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } - fdput(f); + fdput_pos(f); return retval; } @@ -498,7 +524,7 @@ static inline void file_pos_write(struct file *file, loff_t pos) SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -506,7 +532,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) ret = vfs_read(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } return ret; } @@ -514,7 +540,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -522,7 +548,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, ret = vfs_write(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } return ret; @@ -797,7 +823,7 @@ EXPORT_SYMBOL(vfs_writev); SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -805,7 +831,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, ret = vfs_readv(f.file, vec, vlen, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } if (ret > 0) @@ -817,7 +843,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -825,7 +851,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, ret = vfs_writev(f.file, vec, vlen, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } if (ret > 0) @@ -968,7 +994,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret; loff_t pos; @@ -978,7 +1004,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, ret = compat_readv(f.file, vec, vlen, &pos); if (ret >= 0) f.file->f_pos = pos; - fdput(f); + fdput_pos(f); return ret; } @@ -1035,7 +1061,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, const struct compat_iovec __user *, vec, compat_ulong_t, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); 
ssize_t ret; loff_t pos; @@ -1045,7 +1071,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, ret = compat_writev(f.file, vec, vlen, &pos); if (ret >= 0) f.file->f_pos = pos; - fdput(f); + fdput_pos(f); return ret; } diff --git a/include/linux/file.h b/include/linux/file.h index cbacf4faf447..f2517fa2d610 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -28,12 +28,14 @@ static inline void fput_light(struct file *file, int fput_needed) struct fd { struct file *file; - int need_put; + unsigned int flags; }; +#define FDPUT_FPUT 1 +#define FDPUT_POS_UNLOCK 2 static inline void fdput(struct fd fd) { - if (fd.need_put) + if (fd.flags & FDPUT_FPUT) fput(fd.file); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 60829565e552..ebfde04bca06 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)0x4000) +/* File needs atomic accesses to f_pos */ +#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) @@ -780,13 +783,14 @@ struct file { const struct file_operations *f_op; /* - * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. + * Protects f_ep_links, f_flags. * Must not be taken from IRQ context. */ spinlock_t f_lock; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; + struct mutex f_pos_lock; loff_t f_pos; struct fown_struct f_owner; const struct cred *f_cred; -- cgit v1.2.3 From bd2a31d522344b3ac2fb680bd2366e77a9bd8209 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 4 Mar 2014 14:54:22 -0500 Subject: get rid of fget_light() instead of returning the flags by reference, we can just have the low-level primitive return those in lower bits of unsigned long, with struct file * derived from the rest. Signed-off-by: Al Viro --- fs/file.c | 56 ++++++++++++++++++++++++++++++++++++++++------------ fs/read_write.c | 16 +-------------- include/linux/file.h | 21 ++++++++++---------- include/linux/fs.h | 2 +- 4 files changed, 56 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index db25c2bdfe46..60a45e9f5323 100644 --- a/fs/file.c +++ b/fs/file.c @@ -683,35 +683,65 @@ EXPORT_SYMBOL(fget_raw); * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. 
*/ -struct file *__fget_light(unsigned int fd, fmode_t mask, int *fput_needed) +static unsigned long __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; - *fput_needed = 0; if (atomic_read(&files->count) == 1) { file = __fcheck_files(files, fd); - if (file && (file->f_mode & mask)) - file = NULL; + if (!file || unlikely(file->f_mode & mask)) + return 0; + return (unsigned long)file; } else { file = __fget(fd, mask); - if (file) - *fput_needed = 1; + if (!file) + return 0; + return FDPUT_FPUT | (unsigned long)file; } - - return file; } -struct file *fget_light(unsigned int fd, int *fput_needed) +unsigned long __fdget(unsigned int fd) { - return __fget_light(fd, FMODE_PATH, fput_needed); + return __fget_light(fd, FMODE_PATH); } -EXPORT_SYMBOL(fget_light); +EXPORT_SYMBOL(__fdget); -struct file *fget_raw_light(unsigned int fd, int *fput_needed) +unsigned long __fdget_raw(unsigned int fd) { - return __fget_light(fd, 0, fput_needed); + return __fget_light(fd, 0); +} + +unsigned long __fdget_pos(unsigned int fd) +{ + struct files_struct *files = current->files; + struct file *file; + unsigned long v; + + if (atomic_read(&files->count) == 1) { + file = __fcheck_files(files, fd); + v = 0; + } else { + file = __fget(fd, 0); + v = FDPUT_FPUT; + } + if (!file) + return 0; + + if (file->f_mode & FMODE_ATOMIC_POS) { + if (file_count(file) > 1) { + v |= FDPUT_POS_UNLOCK; + mutex_lock(&file->f_pos_lock); + } + } + return v | (unsigned long)file; } +/* + * We only lock f_pos if we have threads or if the file might be + * shared with another process. In both cases we'll have an elevated + * file count (done either by fdget() or by fork()). + */ + void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; diff --git a/fs/read_write.c b/fs/read_write.c index 932bb3414a96..54e19b9392dc 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -264,23 +264,9 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(vfs_llseek); -/* - * We only lock f_pos if we have threads or if the file might be - * shared with another process. In both cases we'll have an elevated - * file count (done either by fdget() or by fork()). 
- */ static inline struct fd fdget_pos(int fd) { - struct fd f = fdget(fd); - struct file *file = f.file; - - if (file && (file->f_mode & FMODE_ATOMIC_POS)) { - if (file_count(file) > 1) { - f.flags |= FDPUT_POS_UNLOCK; - mutex_lock(&file->f_pos_lock); - } - } - return f; + return __to_fd(__fdget_pos(fd)); } static inline void fdput_pos(struct fd f) diff --git a/include/linux/file.h b/include/linux/file.h index f2517fa2d610..4d69123377a2 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -40,23 +40,24 @@ static inline void fdput(struct fd fd) } extern struct file *fget(unsigned int fd); -extern struct file *fget_light(unsigned int fd, int *fput_needed); +extern struct file *fget_raw(unsigned int fd); +extern unsigned long __fdget(unsigned int fd); +extern unsigned long __fdget_raw(unsigned int fd); +extern unsigned long __fdget_pos(unsigned int fd); -static inline struct fd fdget(unsigned int fd) +static inline struct fd __to_fd(unsigned long v) { - int b; - struct file *f = fget_light(fd, &b); - return (struct fd){f,b}; + return (struct fd){(struct file *)(v & ~3),v & 3}; } -extern struct file *fget_raw(unsigned int fd); -extern struct file *fget_raw_light(unsigned int fd, int *fput_needed); +static inline struct fd fdget(unsigned int fd) +{ + return __to_fd(__fdget(fd)); +} static inline struct fd fdget_raw(unsigned int fd) { - int b; - struct file *f = fget_raw_light(fd, &b); - return (struct fd){f,b}; + return __to_fd(__fdget_raw(fd)); } extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); diff --git a/include/linux/fs.h b/include/linux/fs.h index ebfde04bca06..23b2a35d712e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -812,7 +812,7 @@ struct file { #ifdef CONFIG_DEBUG_WRITECOUNT unsigned long f_mnt_write_state; #endif -}; +} __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; -- cgit v1.2.3 From e97ca8e5b864f88b028c1759ba8536fa827d6d96 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 10 Mar 2014 15:49:43 -0700 Subject: mm: fix GFP_THISNODE callers and clarify GFP_THISNODE is for callers that implement their own clever fallback to remote nodes. It restricts the allocation to the specified node and does not invoke reclaim, assuming that the caller will take care of it when the fallback fails, e.g. through a subsequent allocation request without GFP_THISNODE set. However, many current GFP_THISNODE users only want the node exclusive aspect of the flag, without actually implementing their own fallback or triggering reclaim if necessary. This results in things like page migration failing prematurely even when there is easily reclaimable memory available, unless kswapd happens to be running already or a concurrent allocation attempt triggers the necessary reclaim. Convert all callsites that don't implement their own fallback strategy to __GFP_THISNODE. This restricts the allocation a single node too, but at the same time allows the allocator to enter the slowpath, wake kswapd, and invoke direct reclaim if necessary, to make the allocation happen when memory is full. 
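For a caller, the visible difference is whether the page allocator is allowed to reclaim on the target node. A short illustration of the conversion (grounded in the gfp.h definition visible in the diff below; nid stands for whatever node the caller selected):

	/*
	 * Old form: GFP_THISNODE expands to __GFP_THISNODE | __GFP_NOWARN |
	 * __GFP_NORETRY on NUMA, i.e. node-exclusive *and* no reclaim, so it
	 * fails prematurely unless the caller implements its own fallback:
	 *
	 *	page = alloc_pages_exact_node(nid, GFP_KERNEL | GFP_THISNODE, 0);
	 *
	 * New form: still restricted to @nid, but kswapd can be woken and
	 * direct reclaim can run, so the request can succeed once reclaimable
	 * memory on that node has been freed.
	 */
	page = alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_THISNODE, 0);
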
Signed-off-by: Johannes Weiner Acked-by: Rik van Riel Cc: Jan Stancek Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/uncached.c | 2 +- arch/powerpc/platforms/cell/ras.c | 3 ++- drivers/misc/sgi-xp/xpc_uv.c | 2 +- include/linux/gfp.h | 4 ++++ include/linux/mmzone.h | 4 ++-- include/linux/slab.h | 2 +- kernel/profile.c | 4 ++-- mm/migrate.c | 11 ++++++----- 8 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index a96bcf83a735..20e8a9b21d75 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -98,7 +98,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) /* attempt to allocate a granule's worth of cached memory pages */ page = alloc_pages_exact_node(nid, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, IA64_GRANULE_SHIFT-PAGE_SHIFT); if (!page) { mutex_unlock(&uc_pool->add_chunk_mutex); diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 5ec1e47a0d77..e865d748179b 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -123,7 +123,8 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order) area->nid = nid; area->order = order; - area->pages = alloc_pages_exact_node(area->nid, GFP_KERNEL|GFP_THISNODE, + area->pages = alloc_pages_exact_node(area->nid, + GFP_KERNEL|__GFP_THISNODE, area->order); if (!area->pages) { diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index b9e2000969f0..95c894482fdd 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -240,7 +240,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, nid = cpu_to_node(cpu); page = alloc_pages_exact_node(nid, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, pg_order); if (page == NULL) { dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0437439bc047..39b81dc7d01a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -123,6 +123,10 @@ struct vm_area_struct; __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ __GFP_NO_KSWAPD) +/* + * GFP_THISNODE does not perform any reclaim, you most likely want to + * use __GFP_THISNODE to allocate from a given node without fallback! + */ #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f2052c83154..9b61b9bf81ac 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -590,10 +590,10 @@ static inline bool zone_is_empty(struct zone *zone) /* * The NUMA zonelists are doubled because we need zonelists that restrict the - * allocations to a single node for GFP_THISNODE. + * allocations to a single node for __GFP_THISNODE. * * [0] : Zonelist with fallback - * [1] : No fallback (GFP_THISNODE) + * [1] : No fallback (__GFP_THISNODE) */ #define MAX_ZONELISTS 2 diff --git a/include/linux/slab.h b/include/linux/slab.h index 9260abdd67df..b5b2df60299e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -410,7 +410,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * * %GFP_NOWAIT - Allocation will not sleep. * - * %GFP_THISNODE - Allocate node-local memory only. + * %__GFP_THISNODE - Allocate node-local memory only. * * %GFP_DMA - Allocation suitable for DMA. 
* Should only be used for kmalloc() caches. Otherwise, use a diff --git a/kernel/profile.c b/kernel/profile.c index 6631e1ef55ab..ebdd9c1a86b4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -549,14 +549,14 @@ static int create_hash_tables(void) struct page *page; page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; diff --git a/mm/migrate.c b/mm/migrate.c index 482a33d89134..b494fdb9a636 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1158,7 +1158,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, pm->node); else return alloc_pages_exact_node(pm->node, - GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); + GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } /* @@ -1544,9 +1544,9 @@ static struct page *alloc_misplaced_dst_page(struct page *page, struct page *newpage; newpage = alloc_pages_exact_node(nid, - (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | - __GFP_NOMEMALLOC | __GFP_NORETRY | - __GFP_NOWARN) & + (GFP_HIGHUSER_MOVABLE | + __GFP_THISNODE | __GFP_NOMEMALLOC | + __GFP_NORETRY | __GFP_NOWARN) & ~GFP_IOFS, 0); return newpage; @@ -1747,7 +1747,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + HPAGE_PMD_ORDER); if (!new_page) goto out_fail; -- cgit v1.2.3 From c9122da1e2d29bd6a1475a0d1ce2aa6ac6ea25fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 13:32:16 +0100 Subject: locking: Move mcs_spinlock.h into kernel/locking/ The mcs_spinlock code is not meant (or suitable) as a generic locking primitive, therefore take it away from the normal includes and place it in kernel/locking/. This way the locking primitives implemented there can use it as part of their implementation but we do not risk it getting used inapropriately. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-byirmpamgr7h25m5kyavwpzx@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/mcs_spinlock.h | 114 ------------------------------------------ kernel/locking/mcs_spinlock.h | 114 ++++++++++++++++++++++++++++++++++++++++++ kernel/locking/mutex.c | 2 +- 3 files changed, 115 insertions(+), 115 deletions(-) delete mode 100644 include/linux/mcs_spinlock.h create mode 100644 kernel/locking/mcs_spinlock.h (limited to 'include/linux') diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h deleted file mode 100644 index f2a5c6360083..000000000000 --- a/include/linux/mcs_spinlock.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * MCS lock defines - * - * This file contains the main data structure and API definitions of MCS lock. - * - * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock - * with the desirable properties of being fair, and with each cpu trying - * to acquire the lock spinning on a local variable. - * It avoids expensive cache bouncings that common test-and-set spin-lock - * implementations incur. 
- */ -#ifndef __LINUX_MCS_SPINLOCK_H -#define __LINUX_MCS_SPINLOCK_H - -#include - -struct mcs_spinlock { - struct mcs_spinlock *next; - int locked; /* 1 if lock acquired */ -}; - -#ifndef arch_mcs_spin_lock_contended -/* - * Using smp_load_acquire() provides a memory barrier that ensures - * subsequent operations happen after the lock is acquired. - */ -#define arch_mcs_spin_lock_contended(l) \ -do { \ - while (!(smp_load_acquire(l))) \ - arch_mutex_cpu_relax(); \ -} while (0) -#endif - -#ifndef arch_mcs_spin_unlock_contended -/* - * smp_store_release() provides a memory barrier to ensure all - * operations in the critical section has been completed before - * unlocking. - */ -#define arch_mcs_spin_unlock_contended(l) \ - smp_store_release((l), 1) -#endif - -/* - * Note: the smp_load_acquire/smp_store_release pair is not - * sufficient to form a full memory barrier across - * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. - * For applications that need a full barrier across multiple cpus - * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be - * used after mcs_lock. - */ - -/* - * In order to acquire the lock, the caller should declare a local node and - * pass a reference of the node to this function in addition to the lock. - * If the lock has already been acquired, then this will proceed to spin - * on this node->locked until the previous lock holder sets the node->locked - * in mcs_spin_unlock(). - * - * We don't inline mcs_spin_lock() so that perf can correctly account for the - * time spent in this lock function. - */ -static inline -void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) -{ - struct mcs_spinlock *prev; - - /* Init node */ - node->locked = 0; - node->next = NULL; - - prev = xchg(lock, node); - if (likely(prev == NULL)) { - /* - * Lock acquired, don't need to set node->locked to 1. Threads - * only spin on its own node->locked value for lock acquisition. - * However, since this thread can immediately acquire the lock - * and does not proceed to spin on its own node->locked, this - * value won't be used. If a debug mode is needed to - * audit lock status, then set node->locked value here. - */ - return; - } - ACCESS_ONCE(prev->next) = node; - - /* Wait until the lock holder passes the lock down. */ - arch_mcs_spin_lock_contended(&node->locked); -} - -/* - * Releases the lock. The caller should pass in the corresponding node that - * was used to acquire the lock. - */ -static inline -void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) -{ - struct mcs_spinlock *next = ACCESS_ONCE(node->next); - - if (likely(!next)) { - /* - * Release the lock by setting it to NULL - */ - if (likely(cmpxchg(lock, node, NULL) == node)) - return; - /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); - } - - /* Pass lock to next waiter. */ - arch_mcs_spin_unlock_contended(&next->locked); -} - -#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h new file mode 100644 index 000000000000..f2a5c6360083 --- /dev/null +++ b/kernel/locking/mcs_spinlock.h @@ -0,0 +1,114 @@ +/* + * MCS lock defines + * + * This file contains the main data structure and API definitions of MCS lock. + * + * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock + * with the desirable properties of being fair, and with each cpu trying + * to acquire the lock spinning on a local variable. 
+ * It avoids expensive cache bouncings that common test-and-set spin-lock + * implementations incur. + */ +#ifndef __LINUX_MCS_SPINLOCK_H +#define __LINUX_MCS_SPINLOCK_H + +#include + +struct mcs_spinlock { + struct mcs_spinlock *next; + int locked; /* 1 if lock acquired */ +}; + +#ifndef arch_mcs_spin_lock_contended +/* + * Using smp_load_acquire() provides a memory barrier that ensures + * subsequent operations happen after the lock is acquired. + */ +#define arch_mcs_spin_lock_contended(l) \ +do { \ + while (!(smp_load_acquire(l))) \ + arch_mutex_cpu_relax(); \ +} while (0) +#endif + +#ifndef arch_mcs_spin_unlock_contended +/* + * smp_store_release() provides a memory barrier to ensure all + * operations in the critical section has been completed before + * unlocking. + */ +#define arch_mcs_spin_unlock_contended(l) \ + smp_store_release((l), 1) +#endif + +/* + * Note: the smp_load_acquire/smp_store_release pair is not + * sufficient to form a full memory barrier across + * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. + * For applications that need a full barrier across multiple cpus + * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be + * used after mcs_lock. + */ + +/* + * In order to acquire the lock, the caller should declare a local node and + * pass a reference of the node to this function in addition to the lock. + * If the lock has already been acquired, then this will proceed to spin + * on this node->locked until the previous lock holder sets the node->locked + * in mcs_spin_unlock(). + * + * We don't inline mcs_spin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +static inline +void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* + * Lock acquired, don't need to set node->locked to 1. Threads + * only spin on its own node->locked value for lock acquisition. + * However, since this thread can immediately acquire the lock + * and does not proceed to spin on its own node->locked, this + * value won't be used. If a debug mode is needed to + * audit lock status, then set node->locked value here. + */ + return; + } + ACCESS_ONCE(prev->next) = node; + + /* Wait until the lock holder passes the lock down. */ + arch_mcs_spin_lock_contended(&node->locked); +} + +/* + * Releases the lock. The caller should pass in the corresponding node that + * was used to acquire the lock. + */ +static inline +void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + + /* Pass lock to next waiter. 
*/ + arch_mcs_spin_unlock_contended(&next->locked); +} + +#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 45fe1b5293d6..4f408be39a07 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include "mcs_spinlock.h" /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, -- cgit v1.2.3 From fb0527bd5ea99bfeb2dd91e3c1433ecf745d6b99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Jan 2014 12:51:42 +0100 Subject: locking/mutexes: Introduce cancelable MCS lock for adaptive spinning Since we want a task waiting for a mutex_lock() to go to sleep and reschedule on need_resched() we must be able to abort the mcs_spin_lock() around the adaptive spin. Therefore implement a cancelable mcs lock. Signed-off-by: Peter Zijlstra Cc: chegu_vinod@hp.com Cc: paulmck@linux.vnet.ibm.com Cc: Waiman.Long@hp.com Cc: torvalds@linux-foundation.org Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Cc: Jason Low Link: http://lkml.kernel.org/n/tip-62hcl5wxydmjzd182zhvk89m@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 4 +- kernel/locking/Makefile | 2 +- kernel/locking/mcs_spinlock.c | 178 ++++++++++++++++++++++++++++++++++++++++++ kernel/locking/mcs_spinlock.h | 15 ++++ kernel/locking/mutex.c | 10 ++- 5 files changed, 202 insertions(+), 7 deletions(-) create mode 100644 kernel/locking/mcs_spinlock.c (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index c482e1d2cc49..11692dea18aa 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -46,7 +46,7 @@ * - detects multi-task circular deadlocks and prints out all affected * locks and tasks (and only those tasks) */ -struct mcs_spinlock; +struct optimistic_spin_queue; struct mutex { /* 1: unlocked, 0: locked, negative: locked, possible waiters */ atomic_t count; @@ -56,7 +56,7 @@ struct mutex { struct task_struct *owner; #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - struct mcs_spinlock *mcs_lock; /* Spinner MCS lock */ + struct optimistic_spin_queue *osq; /* Spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index baab8e5e7f66..2a9ee96ecf00 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o lglock.o +obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c new file mode 100644 index 000000000000..838dc9e00669 --- /dev/null +++ b/kernel/locking/mcs_spinlock.c @@ -0,0 +1,178 @@ + +#include +#include +#include +#include "mcs_spinlock.h" + +#ifdef CONFIG_SMP + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. 
+ */ +static inline struct optimistic_spin_queue * +osq_wait_next(struct optimistic_spin_queue **lock, + struct optimistic_spin_queue *node, + struct optimistic_spin_queue *prev) +{ + struct optimistic_spin_queue *next = NULL; + + for (;;) { + if (*lock == node && cmpxchg(lock, node, prev) == node) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. + */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + arch_mutex_cpu_relax(); + } + + return next; +} + +bool osq_lock(struct optimistic_spin_queue **lock) +{ + struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_queue *prev, *next; + + node->locked = 0; + node->next = NULL; + + node->prev = prev = xchg(lock, node); + if (likely(prev == NULL)) + return true; + + ACCESS_ONCE(prev->next) = node; + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. + */ + + while (!smp_load_acquire(&node->locked)) { + /* + * If we need to reschedule bail... so we can block. + */ + if (need_resched()) + goto unqueue; + + arch_mutex_cpu_relax(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + arch_mutex_cpu_relax(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = ACCESS_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + ACCESS_ONCE(next->prev) = prev; + ACCESS_ONCE(prev->next) = next; + + return false; +} + +void osq_unlock(struct optimistic_spin_queue **lock) +{ + struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_queue *next; + + /* + * Fast path for the uncontended case. + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + + /* + * Second most likely case. 
+ */ + next = xchg(&node->next, NULL); + if (next) { + ACCESS_ONCE(next->locked) = 1; + return; + } + + next = osq_wait_next(lock, node, NULL); + if (next) + ACCESS_ONCE(next->locked) = 1; +} + +#endif + diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index f2a5c6360083..a2dbac4aca6b 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -111,4 +111,19 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) arch_mcs_spin_unlock_contended(&next->locked); } +/* + * Cancellable version of the MCS lock above. + * + * Intended for adaptive spinning of sleeping locks: + * mutex_lock()/rwsem_down_{read,write}() etc. + */ + +struct optimistic_spin_queue { + struct optimistic_spin_queue *next, *prev; + int locked; /* 1 if lock acquired */ +}; + +extern bool osq_lock(struct optimistic_spin_queue **lock); +extern void osq_unlock(struct optimistic_spin_queue **lock); + #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index dc3d6f2bbe2a..2670b84067d6 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -53,7 +53,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->mcs_lock = NULL; + lock->osq = NULL; #endif debug_mutex_init(lock, name, key); @@ -403,7 +403,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!mutex_can_spin_on_owner(lock)) goto slowpath; - mcs_spin_lock(&lock->mcs_lock, &node); + if (!osq_lock(&lock->osq)) + goto slowpath; + for (;;) { struct task_struct *owner; @@ -442,7 +444,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } mutex_set_owner(lock); - mcs_spin_unlock(&lock->mcs_lock, &node); + osq_unlock(&lock->osq); preempt_enable(); return 0; } @@ -464,7 +466,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } - mcs_spin_unlock(&lock->mcs_lock, &node); + osq_unlock(&lock->osq); slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From c1bacbae8192dd2a9ebadd22d793b68054f6c6e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 8 Mar 2014 08:59:58 +0100 Subject: genirq: Provide irq_request/release_resources chip callbacks For certain irq types, e.g. gpios, it's necessary to request resources before starting up the irq. This might fail so we cannot use the irq_startup() callback because we might call the irq_set_type() callback before that which does not make sense when the resource is not available. Calling irq_startup() before irq_set_type() can lead to spurious interrupts which is not desired either. 
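As a rough illustration of how a driver might hook these callbacks (an editor's sketch, not part of this patch; the foo_* names are invented, and gpiochip_lock_as_irq()/gpiochip_unlock_as_irq() are just one plausible resource a GPIO driver could claim here):

static int foo_gpio_irq_request_resources(struct irq_data *d)
{
	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);

	/* claim the line before irq_set_type()/irq_startup() can run */
	return gpiochip_lock_as_irq(gc, irqd_to_hwirq(d));
}

static void foo_gpio_irq_release_resources(struct irq_data *d)
{
	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);

	gpiochip_unlock_as_irq(gc, irqd_to_hwirq(d));
}

static struct irq_chip foo_gpio_irq_chip = {
	.name			= "foo-gpio",
	/* ... the usual .irq_mask/.irq_unmask/.irq_set_type handlers ... */
	.irq_request_resources	= foo_gpio_irq_request_resources,
	.irq_release_resources	= foo_gpio_irq_release_resources,
};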
Signed-off-by: Thomas Gleixner Cc: Jean-Jacques Hiblot Cc: Grant Likely Cc: linux-arm-kernel@lists.infradead.org Reviewed-by: Linus Walleij Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1403080857160.18573@ionos.tec.linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 6 ++++++ kernel/irq/manage.c | 28 +++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 7dc10036eff5..e675971bdc3f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -303,6 +303,10 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_pm_shutdown: function called from core code on shutdown once per chip * @irq_calc_mask: Optional function to set irq_data.mask for special cases * @irq_print_chip: optional to print special chip info in show_interrupts + * @irq_request_resources: optional to request resources before calling + * any other callback related to this irq + * @irq_release_resources: optional to release resources acquired with + * irq_request_resources * @flags: chip specific flags */ struct irq_chip { @@ -336,6 +340,8 @@ struct irq_chip { void (*irq_calc_mask)(struct irq_data *data); void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); + int (*irq_request_resources)(struct irq_data *data); + void (*irq_release_resources)(struct irq_data *data); unsigned long flags; }; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d3bf660cb57f..5d35cbe22896 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -896,6 +896,23 @@ static void irq_setup_forced_threading(struct irqaction *new) } } +static int irq_request_resources(struct irq_desc *desc) +{ + struct irq_data *d = &desc->irq_data; + struct irq_chip *c = d->chip; + + return c->irq_request_resources ? c->irq_request_resources(d) : 0; +} + +static void irq_release_resources(struct irq_desc *desc) +{ + struct irq_data *d = &desc->irq_data; + struct irq_chip *c = d->chip; + + if (c->irq_release_resources) + c->irq_release_resources(d); +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -1091,6 +1108,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { + ret = irq_request_resources(desc); + if (ret) { + pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", + new->name, irq, desc->irq_data.chip->name); + goto out_mask; + } + init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -1261,8 +1285,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) *action_ptr = action->next; /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) + if (!desc->action) { irq_shutdown(desc); + irq_release_resources(desc); + } #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ -- cgit v1.2.3 From bfc3f0281e08066fa8111c3972cff6edc1049864 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 5 Mar 2014 16:33:42 +0100 Subject: cputime: Default implementation of nsecs -> cputime conversion The architectures that override cputime_t (s390, ppc) don't provide any version of nsecs_to_cputime(). Indeed this cputime_t implementation by backend only happens when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y under which the core code doesn't make any use of nsecs_to_cputime(). At least for now. 
We are going to make a broader use of it so lets provide a default version with a per usecs granularity. It should be good enough for most usecases. Cc: Ingo Molnar Cc: Marcelo Tosatti Cc: Peter Zijlstra Cc: Thomas Gleixner Acked-by: Rik van Riel Signed-off-by: Frederic Weisbecker --- drivers/cpufreq/cpufreq_stats.c | 2 +- drivers/s390/cio/cio.c | 2 +- fs/proc/stat.c | 2 +- fs/proc/uptime.c | 2 +- include/linux/cputime.h | 11 +++++++++++ include/linux/kernel_stat.h | 2 +- include/linux/sched.h | 2 +- 7 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 include/linux/cputime.h (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 5793e1447fb1..79911a27a48a 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include static spinlock_t cpufreq_stats_lock; diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index 88e35d85d205..5154513de112 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 6f599c62f0cc..9d231e9e5f0e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #ifndef arch_irq_stat_cpu diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 7141b8d0ca9e..33de567c25af 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include static int uptime_proc_show(struct seq_file *m, void *v) { diff --git a/include/linux/cputime.h b/include/linux/cputime.h new file mode 100644 index 000000000000..2842ebe2844d --- /dev/null +++ b/include/linux/cputime.h @@ -0,0 +1,11 @@ +#ifndef __LINUX_CPUTIME_H +#define __LINUX_CPUTIME_H + +#include + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) \ + usecs_to_cputime((__nsecs) / NSEC_PER_USEC) +#endif + +#endif /* __LINUX_CPUTIME_H */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 51c72be4a7c3..d7c61317db86 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include /* * 'kernel_stat.h' contains the definitions needed for doing diff --git a/include/linux/sched.h b/include/linux/sched.h index 68a0e84463a0..1ac566c48d3d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -27,7 +27,7 @@ struct sched_param { #include #include -#include +#include #include #include -- cgit v1.2.3 From d8a9ce3f8ad2b546b9ebaf65de809da0793f11c5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 5 Mar 2014 16:22:37 +0100 Subject: cputime: Bring cputime -> nsecs conversion We already have nsecs_to_cputime(). Now we need to be able to convert the other way around in order to fix a bug on steal time accounting. 
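Purely as an illustrative sketch (editor's note, not from the patch; the helper name is made up), a caller that accounts time in nanoseconds can then consume cputime_t values directly in both directions:

static inline u64 steal_delta_ns(cputime_t prev, cputime_t cur)
{
	/* both values come from the same accounting source, cur >= prev */
	return cputime_to_nsecs(cur) - cputime_to_nsecs(prev);
}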
Cc: Ingo Molnar Cc: Marcelo Tosatti Cc: Peter Zijlstra Cc: Thomas Gleixner Acked-by: Rik van Riel Signed-off-by: Frederic Weisbecker --- include/asm-generic/cputime_jiffies.h | 4 +++- include/asm-generic/cputime_nsecs.h | 2 ++ include/linux/cputime.h | 5 +++++ 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h index 272ecba9f588..d5cb78f53986 100644 --- a/include/asm-generic/cputime_jiffies.h +++ b/include/asm-generic/cputime_jiffies.h @@ -15,8 +15,10 @@ typedef u64 __nocast cputime64_t; /* - * Convert nanoseconds to cputime + * Convert nanoseconds <-> cputime */ +#define cputime_to_nsecs(__ct) \ + jiffies_to_nsecs(cputime_to_jiffies(__ct)) #define nsecs_to_cputime64(__nsec) \ jiffies64_to_cputime64(nsecs_to_jiffies64(__nsec)) #define nsecs_to_cputime(__nsec) \ diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index 768294f8bf85..4e817606c549 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -44,6 +44,8 @@ typedef u64 __nocast cputime64_t; /* * Convert cputime <-> nanoseconds */ +#define cputime_to_nsecs(__ct) \ + (__force u64)(__ct) #define nsecs_to_cputime(__nsecs) \ (__force cputime_t)(__nsecs) diff --git a/include/linux/cputime.h b/include/linux/cputime.h index 2842ebe2844d..f2eb2ee535ca 100644 --- a/include/linux/cputime.h +++ b/include/linux/cputime.h @@ -3,6 +3,11 @@ #include +#ifndef cputime_to_nsecs +# define cputime_to_nsecs(__ct) \ + (cputime_to_usecs(__ct) * NSEC_PER_USEC) +#endif + #ifndef nsecs_to_cputime # define nsecs_to_cputime(__nsecs) \ usecs_to_cputime((__nsecs) / NSEC_PER_USEC) -- cgit v1.2.3 From 4f6e4f71c9d39cf49e0cb1be5b7721db5fbe92ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2014 15:32:47 +0100 Subject: genirq: Document IRQCHIP_ONESHOT_SAFE flag Add missing documentation of the flag. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index e675971bdc3f..67ace7aa7947 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -355,6 +355,7 @@ struct irq_chip { * IRQCHIP_ONOFFLINE_ENABLED: Only call irq_on/off_line callbacks * when irq enabled * IRQCHIP_SKIP_SET_WAKE: Skip chip.irq_set_wake(), for this irq chip + * IRQCHIP_ONESHOT_SAFE: One shot does not require mask/unmask */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), -- cgit v1.2.3 From 328a4978df833249b099c9875738d7b72042ffe1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2014 19:03:51 +0100 Subject: genirq: Add a new IRQCHIP_EOI_THREADED flag The flag is necessary for interrupt chips which require an ACK/EOI after the handler has run. In case of threaded handlers this needs to happen after the threaded handler has completed before the unmask of the interrupt. The flag is only unseful in combination with the handle_fasteoi_irq flow control handler. It can be combined with the flag IRQCHIP_EOI_IF_HANDLED, so the EOI is not issued when the interrupt is disabled or in progress. 
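An editor's sketch of a driver-side user (not part of the patch; the foo_* callbacks and virq are placeholders for whatever the hardware needs):

static struct irq_chip foo_irq_chip = {
	.name		= "foo",
	.irq_mask	= foo_irq_mask,
	.irq_unmask	= foo_irq_unmask,
	.irq_eoi	= foo_irq_eoi,
	.flags		= IRQCHIP_EOI_THREADED | IRQCHIP_EOI_IF_HANDLED,
};

/* each interrupt of the chip then uses the fasteoi flow handler */
irq_set_chip_and_handler(virq, &foo_irq_chip, handle_fasteoi_irq);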
Tested-by: Hans de Goede Reviewed-by: Hans de Goede Cc: linux-arm-kernel@lists.infradead.org Cc: linux-sunxi@googlegroups.com Cc: Maxime Ripard Link: http://lkml.kernel.org/r/1394733834-26839-2-git-send-email-hdegoede@redhat.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- kernel/irq/internals.h | 1 + kernel/irq/manage.c | 2 +- 4 files changed, 44 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 67ace7aa7947..d278838908cb 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -356,6 +356,7 @@ struct irq_chip { * when irq enabled * IRQCHIP_SKIP_SET_WAKE: Skip chip.irq_set_wake(), for this irq chip * IRQCHIP_ONESHOT_SAFE: One shot does not require mask/unmask + * IRQCHIP_EOI_THREADED: Chip requires eoi() on unmask in threaded mode */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -364,6 +365,7 @@ enum { IRQCHIP_ONOFFLINE_ENABLED = (1 << 3), IRQCHIP_SKIP_SET_WAKE = (1 << 4), IRQCHIP_ONESHOT_SAFE = (1 << 5), + IRQCHIP_EOI_THREADED = (1 << 6), }; /* This include will go away once we isolated irq_desc usage to core code */ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dc04c166c54d..6397df2d6945 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -281,6 +281,19 @@ void unmask_irq(struct irq_desc *desc) } } +void unmask_threaded_irq(struct irq_desc *desc) +{ + struct irq_chip *chip = desc->irq_data.chip; + + if (chip->flags & IRQCHIP_EOI_THREADED) + chip->irq_eoi(&desc->irq_data); + + if (chip->irq_unmask) { + chip->irq_unmask(&desc->irq_data); + irq_state_clr_masked(desc); + } +} + /* * handle_nested_irq - Handle a nested irq from a irq thread * @irq: the interrupt number @@ -435,6 +448,27 @@ static inline void preflow_handler(struct irq_desc *desc) static inline void preflow_handler(struct irq_desc *desc) { } #endif +static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) +{ + if (!(desc->istate & IRQS_ONESHOT)) { + chip->irq_eoi(&desc->irq_data); + return; + } + /* + * We need to unmask in the following cases: + * - Oneshot irq which did not wake the thread (caused by a + * spurious interrupt or a primary handler handling it + * completely). 
+ */ + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) { + chip->irq_eoi(&desc->irq_data); + unmask_irq(desc); + } else if (!(chip->flags & IRQCHIP_EOI_THREADED)) { + chip->irq_eoi(&desc->irq_data); + } +} + /** * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number @@ -448,6 +482,8 @@ static inline void preflow_handler(struct irq_desc *desc) { } void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { + struct irq_chip *chip = desc->irq_data.chip; + raw_spin_lock(&desc->lock); if (unlikely(irqd_irq_inprogress(&desc->irq_data))) @@ -473,18 +509,14 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) preflow_handler(desc); handle_irq_event(desc); - if (desc->istate & IRQS_ONESHOT) - cond_unmask_irq(desc); + cond_unmask_eoi_irq(desc, chip); -out_eoi: - desc->irq_data.chip->irq_eoi(&desc->irq_data); -out_unlock: raw_spin_unlock(&desc->lock); return; out: - if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) - goto out_eoi; - goto out_unlock; + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); } /** diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 17b671713d5f..ddf1ffeb79f1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -74,6 +74,7 @@ extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); +extern void unmask_threaded_irq(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index de1a8ed29b40..2486a4c1a710 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -748,7 +748,7 @@ again: if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) - unmask_irq(desc); + unmask_threaded_irq(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); -- cgit v1.2.3 From ff0992e9036e9810e7cd45234fa32ca1e79750e2 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Mon, 17 Mar 2014 16:25:18 +0100 Subject: net: cdc_ncm: fix control message ordering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a context modified revert of commit 6a9612e2cb22 ("net: cdc_ncm: remove ncm_parm field") which introduced a NCM specification violation, causing setup errors for some devices. These errors resulted in the device and host disagreeing about shared settings, with complete failure to communicate as the end result. The NCM specification require that many of the NCM specific control reuests are sent only while the NCM Data Interface is in alternate setting 0. Reverting the commit ensures that we follow this requirement. Fixes: 6a9612e2cb22 ("net: cdc_ncm: remove ncm_parm field") Reported-and-tested-by: Pasi Kärkkäinen Reported-by: Thomas Schäfer Signed-off-by: Bjørn Mork Signed-off-by: David S. 
Miller --- drivers/net/usb/cdc_ncm.c | 48 ++++++++++++++++++++++----------------------- include/linux/usb/cdc_ncm.h | 1 + 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index dbff290ed0e4..d350d2795e10 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -68,7 +68,6 @@ static struct usb_driver cdc_ncm_driver; static int cdc_ncm_setup(struct usbnet *dev) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; - struct usb_cdc_ncm_ntb_parameters ncm_parm; u32 val; u8 flags; u8 iface_no; @@ -82,22 +81,22 @@ static int cdc_ncm_setup(struct usbnet *dev) err = usbnet_read_cmd(dev, USB_CDC_GET_NTB_PARAMETERS, USB_TYPE_CLASS | USB_DIR_IN |USB_RECIP_INTERFACE, - 0, iface_no, &ncm_parm, - sizeof(ncm_parm)); + 0, iface_no, &ctx->ncm_parm, + sizeof(ctx->ncm_parm)); if (err < 0) { dev_err(&dev->intf->dev, "failed GET_NTB_PARAMETERS\n"); return err; /* GET_NTB_PARAMETERS is required */ } /* read correct set of parameters according to device mode */ - ctx->rx_max = le32_to_cpu(ncm_parm.dwNtbInMaxSize); - ctx->tx_max = le32_to_cpu(ncm_parm.dwNtbOutMaxSize); - ctx->tx_remainder = le16_to_cpu(ncm_parm.wNdpOutPayloadRemainder); - ctx->tx_modulus = le16_to_cpu(ncm_parm.wNdpOutDivisor); - ctx->tx_ndp_modulus = le16_to_cpu(ncm_parm.wNdpOutAlignment); + ctx->rx_max = le32_to_cpu(ctx->ncm_parm.dwNtbInMaxSize); + ctx->tx_max = le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize); + ctx->tx_remainder = le16_to_cpu(ctx->ncm_parm.wNdpOutPayloadRemainder); + ctx->tx_modulus = le16_to_cpu(ctx->ncm_parm.wNdpOutDivisor); + ctx->tx_ndp_modulus = le16_to_cpu(ctx->ncm_parm.wNdpOutAlignment); /* devices prior to NCM Errata shall set this field to zero */ - ctx->tx_max_datagrams = le16_to_cpu(ncm_parm.wNtbOutMaxDatagrams); - ntb_fmt_supported = le16_to_cpu(ncm_parm.bmNtbFormatsSupported); + ctx->tx_max_datagrams = le16_to_cpu(ctx->ncm_parm.wNtbOutMaxDatagrams); + ntb_fmt_supported = le16_to_cpu(ctx->ncm_parm.bmNtbFormatsSupported); /* there are some minor differences in NCM and MBIM defaults */ if (cdc_ncm_comm_intf_is_mbim(ctx->control->cur_altsetting)) { @@ -146,7 +145,7 @@ static int cdc_ncm_setup(struct usbnet *dev) } /* inform device about NTB input size changes */ - if (ctx->rx_max != le32_to_cpu(ncm_parm.dwNtbInMaxSize)) { + if (ctx->rx_max != le32_to_cpu(ctx->ncm_parm.dwNtbInMaxSize)) { __le32 dwNtbInMaxSize = cpu_to_le32(ctx->rx_max); err = usbnet_write_cmd(dev, USB_CDC_SET_NTB_INPUT_SIZE, @@ -162,14 +161,6 @@ static int cdc_ncm_setup(struct usbnet *dev) dev_dbg(&dev->intf->dev, "Using default maximum transmit length=%d\n", CDC_NCM_NTB_MAX_SIZE_TX); ctx->tx_max = CDC_NCM_NTB_MAX_SIZE_TX; - - /* Adding a pad byte here simplifies the handling in - * cdc_ncm_fill_tx_frame, by making tx_max always - * represent the real skb max size. 
- */ - if (ctx->tx_max % usb_maxpacket(dev->udev, dev->out, 1) == 0) - ctx->tx_max++; - } /* @@ -439,6 +430,10 @@ advance: goto error2; } + /* initialize data interface */ + if (cdc_ncm_setup(dev)) + goto error2; + /* configure data interface */ temp = usb_set_interface(dev->udev, iface_no, data_altsetting); if (temp) { @@ -453,12 +448,6 @@ advance: goto error2; } - /* initialize data interface */ - if (cdc_ncm_setup(dev)) { - dev_dbg(&intf->dev, "cdc_ncm_setup() failed\n"); - goto error2; - } - usb_set_intfdata(ctx->data, dev); usb_set_intfdata(ctx->control, dev); @@ -475,6 +464,15 @@ advance: dev->hard_mtu = ctx->tx_max; dev->rx_urb_size = ctx->rx_max; + /* cdc_ncm_setup will override dwNtbOutMaxSize if it is + * outside the sane range. Adding a pad byte here if necessary + * simplifies the handling in cdc_ncm_fill_tx_frame, making + * tx_max always represent the real skb max size. + */ + if (ctx->tx_max != le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize) && + ctx->tx_max % usb_maxpacket(dev->udev, dev->out, 1) == 0) + ctx->tx_max++; + return 0; error2: diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index c3fa80745996..2c14d9cdd57a 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -88,6 +88,7 @@ #define cdc_ncm_data_intf_is_mbim(x) ((x)->desc.bInterfaceProtocol == USB_CDC_MBIM_PROTO_NTB) struct cdc_ncm_ctx { + struct usb_cdc_ncm_ntb_parameters ncm_parm; struct hrtimer tx_timer; struct tasklet_struct bh; -- cgit v1.2.3 From bc6e7c4b0d1a1f742d96556f63d68f17f4e232c3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 14 Mar 2014 13:52:48 -0700 Subject: libata, libsas: kill pm_result and related cleanup Tejun says: "At least for libata, worrying about suspend/resume failures don't make whole lot of sense. If suspend failed, just proceed with suspend. If the device can't be woken up afterwards, that's that. There isn't anything we could have done differently anyway. The same for resume, if spinup fails, the device is dud and the following commands will invoke EH actions and will eventually fail. Again, there really isn't any *choice* to make. Just making sure the errors are handled gracefully (ie. don't crash) and the following commands are handled correctly should be enough." The only libata user that actually cares about the result from a suspend operation is libsas. However, it only cares about whether queuing a new operation collides with an in-flight one. All libsas does with the error is retry, but we can just let libata wait for the previous operation before continuing. Other cleanups include: 1/ Unifying all ata port pm operations on an ata_port_pm_ prefix 2/ Marking all ata port pm helper routines as returning void, only ata_port_pm_ entry points need to fake a 0 return value. 3/ Killing ata_port_{suspend|resume}_common() in favor of calling ata_port_request_pm() directly 4/ Killing the wrappers that just do a to_ata_port() conversion 5/ Clearly marking the entry points that do async operations with an _async suffix. 
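For illustration only (editor's sketch mirroring the libsas hunks below, not additional code): a suspend call site shrinks from threading a pm_result pointer through EH to

	ata_sas_port_suspend(sata->ap);	/* queue the async EH-based suspend */
	/* ... later, per device ... */
	sas_ata_wait_eh(dev);		/* just wait for EH; no result to check */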
Reference: http://marc.info/?l=linux-scsi&m=138995409532286&w=2 Cc: Phillip Susi Cc: Alan Stern Suggested-by: Tejun Heo Signed-off-by: Todd Brandt Signed-off-by: Dan Williams Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 135 +++++++++++++++++++----------------------- drivers/ata/libata-eh.c | 13 +--- drivers/scsi/libsas/sas_ata.c | 35 ++--------- include/linux/libata.h | 11 ++-- include/scsi/libsas.h | 1 - 5 files changed, 71 insertions(+), 124 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 1a3dbd1b196e..66110ed2c1c0 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5351,22 +5351,17 @@ bool ata_link_offline(struct ata_link *link) } #ifdef CONFIG_PM -static int ata_port_request_pm(struct ata_port *ap, pm_message_t mesg, - unsigned int action, unsigned int ehi_flags, - int *async) +static void ata_port_request_pm(struct ata_port *ap, pm_message_t mesg, + unsigned int action, unsigned int ehi_flags, + bool async) { struct ata_link *link; unsigned long flags; - int rc = 0; /* Previous resume operation might still be in * progress. Wait for PM_PENDING to clear. */ if (ap->pflags & ATA_PFLAG_PM_PENDING) { - if (async) { - *async = -EAGAIN; - return 0; - } ata_port_wait_eh(ap); WARN_ON(ap->pflags & ATA_PFLAG_PM_PENDING); } @@ -5375,11 +5370,6 @@ static int ata_port_request_pm(struct ata_port *ap, pm_message_t mesg, spin_lock_irqsave(ap->lock, flags); ap->pm_mesg = mesg; - if (async) - ap->pm_result = async; - else - ap->pm_result = &rc; - ap->pflags |= ATA_PFLAG_PM_PENDING; ata_for_each_link(link, ap, HOST_FIRST) { link->eh_info.action |= action; @@ -5390,87 +5380,81 @@ static int ata_port_request_pm(struct ata_port *ap, pm_message_t mesg, spin_unlock_irqrestore(ap->lock, flags); - /* wait and check result */ if (!async) { ata_port_wait_eh(ap); WARN_ON(ap->pflags & ATA_PFLAG_PM_PENDING); } - - return rc; } -static int __ata_port_suspend_common(struct ata_port *ap, pm_message_t mesg, int *async) +/* + * On some hardware, device fails to respond after spun down for suspend. As + * the device won't be used before being resumed, we don't need to touch the + * device. Ask EH to skip the usual stuff and proceed directly to suspend. + * + * http://thread.gmane.org/gmane.linux.ide/46764 + */ +static const unsigned int ata_port_suspend_ehi = ATA_EHI_QUIET + | ATA_EHI_NO_AUTOPSY + | ATA_EHI_NO_RECOVERY; + +static void ata_port_suspend(struct ata_port *ap, pm_message_t mesg) { - /* - * On some hardware, device fails to respond after spun down - * for suspend. As the device won't be used before being - * resumed, we don't need to touch the device. Ask EH to skip - * the usual stuff and proceed directly to suspend. 
- * - * http://thread.gmane.org/gmane.linux.ide/46764 - */ - unsigned int ehi_flags = ATA_EHI_QUIET | ATA_EHI_NO_AUTOPSY | - ATA_EHI_NO_RECOVERY; - return ata_port_request_pm(ap, mesg, 0, ehi_flags, async); + ata_port_request_pm(ap, mesg, 0, ata_port_suspend_ehi, false); } -static int ata_port_suspend_common(struct device *dev, pm_message_t mesg) +static void ata_port_suspend_async(struct ata_port *ap, pm_message_t mesg) { - struct ata_port *ap = to_ata_port(dev); - - return __ata_port_suspend_common(ap, mesg, NULL); + ata_port_request_pm(ap, mesg, 0, ata_port_suspend_ehi, true); } -static int ata_port_suspend(struct device *dev) +static int ata_port_pm_suspend(struct device *dev) { + struct ata_port *ap = to_ata_port(dev); + if (pm_runtime_suspended(dev)) return 0; - return ata_port_suspend_common(dev, PMSG_SUSPEND); + ata_port_suspend(ap, PMSG_SUSPEND); + return 0; } -static int ata_port_do_freeze(struct device *dev) +static int ata_port_pm_freeze(struct device *dev) { + struct ata_port *ap = to_ata_port(dev); + if (pm_runtime_suspended(dev)) return 0; - return ata_port_suspend_common(dev, PMSG_FREEZE); + ata_port_suspend(ap, PMSG_FREEZE); + return 0; } -static int ata_port_poweroff(struct device *dev) +static int ata_port_pm_poweroff(struct device *dev) { - return ata_port_suspend_common(dev, PMSG_HIBERNATE); + ata_port_suspend(to_ata_port(dev), PMSG_HIBERNATE); + return 0; } -static int __ata_port_resume_common(struct ata_port *ap, pm_message_t mesg, - int *async) -{ - int rc; +static const unsigned int ata_port_resume_ehi = ATA_EHI_NO_AUTOPSY + | ATA_EHI_QUIET; - rc = ata_port_request_pm(ap, mesg, ATA_EH_RESET, - ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET, async); - return rc; +static void ata_port_resume(struct ata_port *ap, pm_message_t mesg) +{ + ata_port_request_pm(ap, mesg, ATA_EH_RESET, ata_port_resume_ehi, false); } -static int ata_port_resume_common(struct device *dev, pm_message_t mesg) +static void ata_port_resume_async(struct ata_port *ap, pm_message_t mesg) { - struct ata_port *ap = to_ata_port(dev); - - return __ata_port_resume_common(ap, mesg, NULL); + ata_port_request_pm(ap, mesg, ATA_EH_RESET, ata_port_resume_ehi, true); } -static int ata_port_resume(struct device *dev) +static int ata_port_pm_resume(struct device *dev) { - int rc; - - rc = ata_port_resume_common(dev, PMSG_RESUME); - if (!rc) { - pm_runtime_disable(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - } - - return rc; + ata_port_resume(to_ata_port(dev), PMSG_RESUME); + pm_runtime_disable(dev); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + return 0; } /* @@ -5499,21 +5483,23 @@ static int ata_port_runtime_idle(struct device *dev) static int ata_port_runtime_suspend(struct device *dev) { - return ata_port_suspend_common(dev, PMSG_AUTO_SUSPEND); + ata_port_suspend(to_ata_port(dev), PMSG_AUTO_SUSPEND); + return 0; } static int ata_port_runtime_resume(struct device *dev) { - return ata_port_resume_common(dev, PMSG_AUTO_RESUME); + ata_port_resume(to_ata_port(dev), PMSG_AUTO_RESUME); + return 0; } static const struct dev_pm_ops ata_port_pm_ops = { - .suspend = ata_port_suspend, - .resume = ata_port_resume, - .freeze = ata_port_do_freeze, - .thaw = ata_port_resume, - .poweroff = ata_port_poweroff, - .restore = ata_port_resume, + .suspend = ata_port_pm_suspend, + .resume = ata_port_pm_resume, + .freeze = ata_port_pm_freeze, + .thaw = ata_port_pm_resume, + .poweroff = ata_port_pm_poweroff, + .restore = ata_port_pm_resume, .runtime_suspend = ata_port_runtime_suspend, .runtime_resume = 
ata_port_runtime_resume, @@ -5525,18 +5511,17 @@ static const struct dev_pm_ops ata_port_pm_ops = { * level. sas suspend/resume is async to allow parallel port recovery * since sas has multiple ata_port instances per Scsi_Host. */ -int ata_sas_port_async_suspend(struct ata_port *ap, int *async) +void ata_sas_port_suspend(struct ata_port *ap) { - return __ata_port_suspend_common(ap, PMSG_SUSPEND, async); + ata_port_suspend_async(ap, PMSG_SUSPEND); } -EXPORT_SYMBOL_GPL(ata_sas_port_async_suspend); +EXPORT_SYMBOL_GPL(ata_sas_port_suspend); -int ata_sas_port_async_resume(struct ata_port *ap, int *async) +void ata_sas_port_resume(struct ata_port *ap) { - return __ata_port_resume_common(ap, PMSG_RESUME, async); + ata_port_resume_async(ap, PMSG_RESUME); } -EXPORT_SYMBOL_GPL(ata_sas_port_async_resume); - +EXPORT_SYMBOL_GPL(ata_sas_port_resume); /** * ata_host_suspend - suspend host diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index c1d0170a6585..6760fc4e85b8 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -4070,7 +4070,7 @@ static void ata_eh_handle_port_suspend(struct ata_port *ap) ata_acpi_set_state(ap, ap->pm_mesg); out: - /* report result */ + /* update the flags */ spin_lock_irqsave(ap->lock, flags); ap->pflags &= ~ATA_PFLAG_PM_PENDING; @@ -4079,11 +4079,6 @@ static void ata_eh_handle_port_suspend(struct ata_port *ap) else if (ap->pflags & ATA_PFLAG_FROZEN) ata_port_schedule_eh(ap); - if (ap->pm_result) { - *ap->pm_result = rc; - ap->pm_result = NULL; - } - spin_unlock_irqrestore(ap->lock, flags); return; @@ -4135,13 +4130,9 @@ static void ata_eh_handle_port_resume(struct ata_port *ap) /* tell ACPI that we're resuming */ ata_acpi_on_resume(ap); - /* report result */ + /* update the flags */ spin_lock_irqsave(ap->lock, flags); ap->pflags &= ~(ATA_PFLAG_PM_PENDING | ATA_PFLAG_SUSPENDED); - if (ap->pm_result) { - *ap->pm_result = rc; - ap->pm_result = NULL; - } spin_unlock_irqrestore(ap->lock, flags); } #endif /* CONFIG_PM */ diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index d2895836f9fa..766098af4eb7 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -700,46 +700,26 @@ void sas_probe_sata(struct asd_sas_port *port) } -static bool sas_ata_flush_pm_eh(struct asd_sas_port *port, const char *func) +static void sas_ata_flush_pm_eh(struct asd_sas_port *port, const char *func) { struct domain_device *dev, *n; - bool retry = false; list_for_each_entry_safe(dev, n, &port->dev_list, dev_list_node) { - int rc; - if (!dev_is_sata(dev)) continue; sas_ata_wait_eh(dev); - rc = dev->sata_dev.pm_result; - if (rc == -EAGAIN) - retry = true; - else if (rc) { - /* since we don't have a - * ->port_{suspend|resume} routine in our - * ata_port ops, and no entanglements with - * acpi, suspend should just be mechanical trip - * through eh, catch cases where these - * assumptions are invalidated - */ - WARN_ONCE(1, "failed %s %s error: %d\n", func, - dev_name(&dev->rphy->dev), rc); - } /* if libata failed to power manage the device, tear it down */ if (ata_dev_disabled(sas_to_ata_dev(dev))) sas_fail_probe(dev, func, -ENODEV); } - - return retry; } void sas_suspend_sata(struct asd_sas_port *port) { struct domain_device *dev; - retry: mutex_lock(&port->ha->disco_mutex); list_for_each_entry(dev, &port->dev_list, dev_list_node) { struct sata_device *sata; @@ -751,20 +731,17 @@ void sas_suspend_sata(struct asd_sas_port *port) if (sata->ap->pm_mesg.event == PM_EVENT_SUSPEND) continue; - sata->pm_result = -EIO; - 
ata_sas_port_async_suspend(sata->ap, &sata->pm_result); + ata_sas_port_suspend(sata->ap); } mutex_unlock(&port->ha->disco_mutex); - if (sas_ata_flush_pm_eh(port, __func__)) - goto retry; + sas_ata_flush_pm_eh(port, __func__); } void sas_resume_sata(struct asd_sas_port *port) { struct domain_device *dev; - retry: mutex_lock(&port->ha->disco_mutex); list_for_each_entry(dev, &port->dev_list, dev_list_node) { struct sata_device *sata; @@ -776,13 +753,11 @@ void sas_resume_sata(struct asd_sas_port *port) if (sata->ap->pm_mesg.event == PM_EVENT_ON) continue; - sata->pm_result = -EIO; - ata_sas_port_async_resume(sata->ap, &sata->pm_result); + ata_sas_port_resume(sata->ap); } mutex_unlock(&port->ha->disco_mutex); - if (sas_ata_flush_pm_eh(port, __func__)) - goto retry; + sas_ata_flush_pm_eh(port, __func__); } /** diff --git a/include/linux/libata.h b/include/linux/libata.h index bec6dbe939a0..5c09e86982c9 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -848,7 +848,6 @@ struct ata_port { struct completion park_req_pending; pm_message_t pm_mesg; - int *pm_result; enum ata_lpm_policy target_lpm_policy; struct timer_list fastdrain_timer; @@ -1140,16 +1139,14 @@ extern bool ata_link_offline(struct ata_link *link); #ifdef CONFIG_PM extern int ata_host_suspend(struct ata_host *host, pm_message_t mesg); extern void ata_host_resume(struct ata_host *host); -extern int ata_sas_port_async_suspend(struct ata_port *ap, int *async); -extern int ata_sas_port_async_resume(struct ata_port *ap, int *async); +extern void ata_sas_port_suspend(struct ata_port *ap); +extern void ata_sas_port_resume(struct ata_port *ap); #else -static inline int ata_sas_port_async_suspend(struct ata_port *ap, int *async) +static inline void ata_sas_port_suspend(struct ata_port *ap) { - return 0; } -static inline int ata_sas_port_async_resume(struct ata_port *ap, int *async) +static inline void ata_sas_port_async_resume(struct ata_port *ap) { - return 0; } #endif extern int ata_ratelimit(void); diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index f843dd8722a9..ef7872c20da9 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -172,7 +172,6 @@ struct sata_device { enum ata_command_set command_set; struct smp_resp rps_resp; /* report_phy_sata_resp */ u8 port_no; /* port number, if this is a PM (Port) */ - int pm_result; struct ata_port *ap; struct ata_host ata_host; -- cgit v1.2.3 From a5a6569959fc55d4ebf1526f7855003596946c32 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2014 10:46:25 -0700 Subject: libata.h: add stub for ata_sas_port_resume Fix build error when CONFIG_PM is not enabled by adding a stub function in . 
drivers/scsi/libsas/sas_ata.c: In function 'sas_resume_sata': drivers/scsi/libsas/sas_ata.c:756:3: error: implicit declaration of function 'ata_sas_port_resume' [-Werror=implicit-function-declaration] Signed-off-by: Randy Dunlap Reported-by: Jim Davis Signed-off-by: Tejun Heo Cc: Dan Williams --- include/linux/libata.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 5c09e86982c9..52723789b991 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1148,6 +1148,9 @@ static inline void ata_sas_port_suspend(struct ata_port *ap) static inline void ata_sas_port_async_resume(struct ata_port *ap) { } +static inline void ata_sas_port_resume(struct ata_port *ap) +{ +} #endif extern int ata_ratelimit(void); extern void ata_msleep(struct ata_port *ap, unsigned int msecs); -- cgit v1.2.3 From 0dd5d6f0e8763ff09939adf3e5b1465a3a414fea Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 19 Mar 2014 11:14:15 -0700 Subject: libata: remove unused ata_sas_port_async_resume() stub Commit bc6e7c4b0d1a "libata, libsas: kill pm_result and related cleanup" renamed ata_sas_port_async_resume() to ata_sas_port_resume(), but missed a CONFIG_PM=n stub conversion. Randy fixed that up in commit a5a6569959fc "libata.h: add stub for ata_sas_port_resume", but missed the deletion of the now unused ata_sas_port_async_resume() routine. Cc: Randy Dunlap Signed-off-by: Dan Williams Signed-off-by: Tejun Heo --- include/linux/libata.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 52723789b991..1de36be64df4 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1145,9 +1145,6 @@ extern void ata_sas_port_resume(struct ata_port *ap); static inline void ata_sas_port_suspend(struct ata_port *ap) { } -static inline void ata_sas_port_async_resume(struct ata_port *ap) -{ -} static inline void ata_sas_port_resume(struct ata_port *ap) { } -- cgit v1.2.3 From e2e680fb7566880f7210cadf628c092c0433971c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 19 Mar 2014 16:21:33 +0530 Subject: hrtimer: Rearrange comments in the order struct members are declared Rearrange kernel doc comments in the order members of struct hrtimer are declared. 
Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: trivial@kernel.org Link: http://lkml.kernel.org/r/1db1a3cfbe8a9ea49396af75c6ac04a2e67e3ab0.1395226248.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index d19a5c2d2270..e7a8d3fa91d5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -96,12 +96,12 @@ enum hrtimer_restart { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) + * @start_pid: timer statistics field to store the pid of the task which + * started the timer * @start_site: timer statistics field to store the site where the timer * was started * @start_comm: timer statistics field to store the name of the process which * started the timer - * @start_pid: timer statistics field to store the pid of the task which - * started the timer * * The hrtimer structure must be initialized by hrtimer_init() */ -- cgit v1.2.3 From 6201b4d61fbf194df6371fb3376c5026cb8f5eec Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 18 Mar 2014 16:26:07 +0530 Subject: timer: Remove code redundancy while calling get_nohz_timer_target() There are only two users of get_nohz_timer_target(): timer and hrtimer. Both call it under same circumstances, i.e. #ifdef CONFIG_NO_HZ_COMMON if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) return get_nohz_timer_target(); #endif So, it makes more sense to get all this as part of get_nohz_timer_target() instead of duplicating code at two places. For this another parameter is required to be passed to this routine, pinned. 
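Editor's illustration (matching the kernel/timer.c change below): each call site then collapses to an unconditional call, e.g. in __mod_timer():

	cpu = get_nohz_timer_target(pinned);
	new_base = per_cpu(tvec_bases, cpu);

with the pinned/timer-migration/idle checks handled inside the helper, and the !SMP or !NO_HZ_COMMON stub simply returning smp_processor_id().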
Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1e1b53537217d58d48c2d7a222a9c3ac47d5b64c.1395140107.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 6 +++++- kernel/hrtimer.c | 15 +-------------- kernel/sched/core.c | 5 ++++- kernel/timer.c | 7 +------ 4 files changed, 11 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 68a0e84463a0..6f6c56f63c68 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -291,10 +291,14 @@ extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); -extern int get_nohz_timer_target(void); +extern int get_nohz_timer_target(int pinned); #else static inline void nohz_balance_enter_idle(int cpu) { } static inline void set_cpu_sd_state_idle(void) { } +static inline int get_nohz_timer_target(int pinned) +{ + return smp_processor_id(); +} #endif /* diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 09094361dce5..d55092ceee29 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -168,19 +168,6 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } } - -/* - * Get the preferred target CPU for NOHZ - */ -static int hrtimer_get_target(int this_cpu, int pinned) -{ -#ifdef CONFIG_NO_HZ_COMMON - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) - return get_nohz_timer_target(); -#endif - return this_cpu; -} - /* * With HIGHRES=y we do not migrate the timer when it is expiring * before the next event on the target cpu because we cannot reprogram @@ -214,7 +201,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); - int cpu = hrtimer_get_target(this_cpu, pinned); + int cpu = get_nohz_timer_target(pinned); int basenum = base->index; again: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..c0339e206cc2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -555,12 +555,15 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). 
*/ -int get_nohz_timer_target(void) +int get_nohz_timer_target(int pinned) { int cpu = smp_processor_id(); int i; struct sched_domain *sd; + if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + return cpu; + rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { diff --git a/kernel/timer.c b/kernel/timer.c index 8e503fec1fba..1d35ddadc045 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -760,12 +760,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - cpu = smp_processor_id(); - -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) - cpu = get_nohz_timer_target(); -#endif + cpu = get_nohz_timer_target(pinned); new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { -- cgit v1.2.3 From f5b972e9fbd2e99a2abc3221783d089799b69394 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 20 Mar 2014 15:30:14 +0100 Subject: compat: include linux/unistd.h within linux/compat.h linux/compat.h does not include linux/unistd.h but the compat.h header file contains various conditional #ifdef __ARCH_WANT_COMPAT_... asmlinkage long compat...() #endif compat system call function declarations. If linux/unistd.h isn't included it depends on previous includes if those __ARCH_WANT_COMPAT_... defines are defined or not. So add an additional linux/unistd.h include. Should fix this compile error on tile: include/uapi/asm-generic/unistd.h:195:1: error: 'compat_sys_getdents64' undeclared make[3]: *** [arch/tile/kernel/compat.o] Error 1 Reported-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Acked-by: Chris Metcalf Signed-off-by: Heiko Carstens --- include/linux/compat.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 7c765624b7ef..01c0aa57ccec 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -14,6 +14,7 @@ #include #include #include /* for aio_context_t */ +#include #include #include -- cgit v1.2.3 From 87291347c49dc40aa339f587b209618201c2e527 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Thu, 13 Feb 2014 19:51:48 -0800 Subject: tracing: Fix array size mismatch in format string In event format strings, the array size is reported in two locations. One in array subscript and then via the "size:" attribute. The values reported there have a mismatch. For e.g., in sched:sched_switch the prev_comm and next_comm character arrays have subscript values as [32] where as the actual field size is 16. name: sched_switch ID: 301 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1;signed:0; field:int common_pid; offset:4; size:4; signed:1; field:char prev_comm[32]; offset:8; size:16; signed:1; field:pid_t prev_pid; offset:24; size:4; signed:1; field:int prev_prio; offset:28; size:4; signed:1; field:long prev_state; offset:32; size:8; signed:1; field:char next_comm[32]; offset:40; size:16; signed:1; field:pid_t next_pid; offset:56; size:4; signed:1; field:int next_prio; offset:60; size:4; signed:1; After bisection, the following commit was blamed: 92edca0 tracing: Use direct field, type and system names This commit removes the duplication of strings for field->name and field->type assuming that all the strings passed in __trace_define_field() are immutable. 
This is not true for arrays, where the type string is created in event_storage variable and field->type for all array fields points to event_storage. Use __stringify() to create a string constant for the type string. Also, get rid of event_storage and event_storage_mutex that are not needed anymore. also, an added benefit is that this reduces the overhead of events a bit more: text data bss dec hex filename 8424787 2036472 1302528 11763787 b3804b vmlinux 8420814 2036408 1302528 11759750 b37086 vmlinux.patched Link: http://lkml.kernel.org/r/1392349908-29685-1-git-send-email-vnagarnaik@google.com Cc: Laurent Chavey Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 4 ---- include/trace/ftrace.h | 7 ++----- kernel/trace/trace_events.c | 6 ------ kernel/trace/trace_export.c | 7 ++----- 4 files changed, 4 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4e4cc28623ad..4cdb3a17bcb5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -495,10 +495,6 @@ enum { FILTER_TRACE_FN, }; -#define EVENT_STORAGE_SIZE 128 -extern struct mutex event_storage_mutex; -extern char event_storage[EVENT_STORAGE_SIZE]; - extern int trace_event_raw_init(struct ftrace_event_call *call); extern int trace_define_field(struct ftrace_event_call *call, const char *type, const char *name, int offset, int size, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1a8b28db3775..1ee19a24cc5f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -310,15 +310,12 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ #undef __array #define __array(type, item, len) \ do { \ - mutex_lock(&event_storage_mutex); \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), FILTER_OTHER); \ - mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f3989ceb5cd5..7b16d40bd64d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,12 +27,6 @@ DEFINE_MUTEX(event_mutex); -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); - -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); - LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7c3e3e72e2b6..ee0a5098ac43 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __array #define __array(type, item, len) \ do { \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), filter_type); \ - mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; 
\ } while (0); -- cgit v1.2.3 From 8c90487cdc64847b4fdd812ab3047f426fec4d13 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 26 Feb 2014 10:49:49 -0500 Subject: Rename TAINT_UNSAFE_SMP to TAINT_CPU_OUT_OF_SPEC Rename TAINT_UNSAFE_SMP to TAINT_CPU_OUT_OF_SPEC, so we can repurpose the flag to encompass a wider range of pushing the CPU beyond its warrany. Signed-off-by: Dave Jones Link: http://lkml.kernel.org/r/20140226154949.GA770@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 2 +- include/linux/kernel.h | 2 +- kernel/module.c | 2 +- kernel/panic.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b85e43a5a462..ce8b8ff0e0ef 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -218,7 +218,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c) */ WARN_ONCE(1, "WARNING: This combination of AMD" " processors is not suitable for SMP.\n"); - add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); } static void init_amd_k7(struct cpuinfo_x86 *c) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 196d1ea86df0..08fb02477641 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -458,7 +458,7 @@ extern enum system_states { #define TAINT_PROPRIETARY_MODULE 0 #define TAINT_FORCED_MODULE 1 -#define TAINT_UNSAFE_SMP 2 +#define TAINT_CPU_OUT_OF_SPEC 2 #define TAINT_FORCED_RMMOD 3 #define TAINT_MACHINE_CHECK 4 #define TAINT_BAD_PAGE 5 diff --git a/kernel/module.c b/kernel/module.c index d24fcf29cb64..ca2c1aded7ee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1015,7 +1015,7 @@ static size_t module_flags_taint(struct module *mod, char *buf) buf[l++] = 'C'; /* * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't * apply to modules. */ return l; diff --git a/kernel/panic.c b/kernel/panic.c index 6d6300375090..2270cfd1d6be 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -199,7 +199,7 @@ struct tnt { static const struct tnt tnts[] = { { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' }, { TAINT_FORCED_RMMOD, 'R', ' ' }, { TAINT_MACHINE_CHECK, 'M', ' ' }, { TAINT_BAD_PAGE, 'B', ' ' }, -- cgit v1.2.3 From 765a3f4fed708ae429ee095914a7897acb3a65bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Mar 2014 16:37:08 -0700 Subject: rcu: Provide grace-period piggybacking API The following pattern is currently not well supported by RCU: 1. Make data element inaccessible to RCU readers. 2. Do work that probably lasts for more than one grace period. 3. Do something to make sure RCU readers in flight before #1 above have completed. Here are some things that could currently be done: a. Do a synchronize_rcu() unconditionally at either #1 or #3 above. This works, but imposes needless work and latency. b. Post an RCU callback at #1 above that does a wakeup, then wait for the wakeup at #3. This works well, but likely results in an extra unneeded grace period. Open-coding this is also a bit more semi-tricky code than would be good. This commit therefore adds get_state_synchronize_rcu() and cond_synchronize_rcu() APIs. Call get_state_synchronize_rcu() at #1 above and pass its return value to cond_synchronize_rcu() at #3 above. 
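An illustrative (editor's) sketch of that pattern, with invented structure, list member and teardown helper names:

static void detach_and_free(struct foo *p)
{
	unsigned long gp_state;

	list_del_rcu(&p->node);                 /* #1: hide from new readers */
	gp_state = get_state_synchronize_rcu(); /* snapshot the GP counter */

	do_expensive_teardown(p);               /* #2: likely spans a grace period */

	cond_synchronize_rcu(gp_state);         /* #3: waits only if no GP elapsed */
	kfree(p);                               /* pre-#1 readers are now done */
}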
This results in a call to synchronize_rcu() if no grace period has elapsed between #1 and #3, but requires only a load, comparison, and memory barrier if a full grace period did elapse. Requested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra --- include/linux/rcutiny.h | 10 ++++++++ include/linux/rcutree.h | 2 ++ kernel/rcu/tree.c | 62 +++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 70 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index e8cb6e3b52a7..425c659d54e5 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,6 +27,16 @@ #include +static inline unsigned long get_state_synchronize_rcu(void) +{ + return 0; +} + +static inline void cond_synchronize_rcu(unsigned long oldstate) +{ + might_sleep(); +} + static inline void rcu_barrier_bh(void) { wait_rcu_gp(call_rcu_bh); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index e9c63884df0a..a59ca05fd4e3 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -76,6 +76,8 @@ static inline void synchronize_rcu_bh_expedited(void) void rcu_barrier(void); void rcu_barrier_bh(void); void rcu_barrier_sched(void); +unsigned long get_state_synchronize_rcu(void); +void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 351faba48b91..0c47e300210a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1421,13 +1421,14 @@ static int rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); - smp_wmb(); /* Record GP times before starting GP. */ - rsp->gpnum++; + /* Record GP times before starting GP, hence smp_store_release(). */ + smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ mutex_lock(&rsp->onoff_mutex); + smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1555,10 +1556,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); - rsp->completed = rsp->gpnum; /* Declare grace period done. */ + /* Declare grace period done. */ + ACCESS_ONCE(rsp->completed) = rsp->gpnum; trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); @@ -2637,6 +2639,58 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +/** + * get_state_synchronize_rcu - Snapshot current RCU state + * + * Returns a cookie that is used by a later call to cond_synchronize_rcu() + * to determine whether or not a full grace period has elapsed in the + * meantime. + */ +unsigned long get_state_synchronize_rcu(void) +{ + /* + * Any prior manipulation of RCU-protected data must happen + * before the load from ->gpnum. + */ + smp_mb(); /* ^^^ */ + + /* + * Make sure this load happens before the purportedly + * time-consuming work between get_state_synchronize_rcu() + * and cond_synchronize_rcu(). 
+ */ + return smp_load_acquire(&rcu_state->gpnum); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); + +/** + * cond_synchronize_rcu - Conditionally wait for an RCU grace period + * + * @oldstate: return value from earlier call to get_state_synchronize_rcu() + * + * If a full RCU grace period has elapsed since the earlier call to + * get_state_synchronize_rcu(), just return. Otherwise, invoke + * synchronize_rcu() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. But + * counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for one additional grace period should be just fine. + */ +void cond_synchronize_rcu(unsigned long oldstate) +{ + unsigned long newstate; + + /* + * Ensure that this load happens before any RCU-destructive + * actions the caller might carry out after we return. + */ + newstate = smp_load_acquire(&rcu_state->completed); + if (ULONG_CMP_GE(oldstate, newstate)) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu); + static int synchronize_sched_expedited_cpu_stop(void *data) { /* -- cgit v1.2.3 From 7e09e738afd21ef99f047425fc0b0c9be8b03254 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 20 Mar 2014 21:52:17 -0700 Subject: mm: fix swapops.h:131 bug if remap_file_pages raced migration Add remove_linear_migration_ptes_from_nonlinear(), to fix an interesting little include/linux/swapops.h:131 BUG_ON(!PageLocked) found by trinity: indicating that remove_migration_ptes() failed to find one of the migration entries that was temporarily inserted. The problem comes from remap_file_pages()'s switch from vma_interval_tree (good for inserting the migration entry) to i_mmap_nonlinear list (no good for locating it again); but can only be a problem if the remap_file_pages() range does not cover the whole of the vma (zap_pte() clears the range). remove_migration_ptes() needs a file_nonlinear method to go down the i_mmap_nonlinear list, applying linear location to look for migration entries in those vmas too, just in case there was this race. The file_nonlinear method does need rmap_walk_control.arg to do this; but it never needed vma passed in - vma comes from its own iteration. Reported-and-tested-by: Dave Jones Reported-and-tested-by: Sasha Levin Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 3 +-- mm/migrate.c | 32 ++++++++++++++++++++++++++++++++ mm/rmap.c | 5 +++-- 3 files changed, 36 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1da693d51255..b66c2110cb1f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -250,8 +250,7 @@ struct rmap_walk_control { int (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); - int (*file_nonlinear)(struct page *, struct address_space *, - struct vm_area_struct *vma); + int (*file_nonlinear)(struct page *, struct address_space *, void *arg); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; diff --git a/mm/migrate.c b/mm/migrate.c index b494fdb9a636..bed48809e5d0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -177,6 +177,37 @@ out: return SWAP_AGAIN; } +/* + * Congratulations to trinity for discovering this bug. 
+ * mm/fremap.c's remap_file_pages() accepts any range within a single vma to + * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then + * replace the specified range by file ptes throughout (maybe populated after). + * If page migration finds a page within that range, while it's still located + * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: + * zap_pte() clears the temporary migration entry before mmap_sem is dropped. + * But if the migrating page is in a part of the vma outside the range to be + * remapped, then it will not be cleared, and remove_migration_ptes() needs to + * deal with it. Fortunately, this part of the vma is of course still linear, + * so we just need to use linear location on the nonlinear list. + */ +static int remove_linear_migration_ptes_from_nonlinear(struct page *page, + struct address_space *mapping, void *arg) +{ + struct vm_area_struct *vma; + /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + unsigned long addr; + + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (addr >= vma->vm_start && addr < vma->vm_end) + remove_migration_pte(page, vma, addr, arg); + } + return SWAP_AGAIN; +} + /* * Get rid of all migration entries and replace them by * references to the indicated page. @@ -186,6 +217,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, + .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, }; rmap_walk(new, &rwc); diff --git a/mm/rmap.c b/mm/rmap.c index d9d42316a99a..8fc049f9a5a6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1360,8 +1360,9 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, } static int try_to_unmap_nonlinear(struct page *page, - struct address_space *mapping, struct vm_area_struct *vma) + struct address_space *mapping, void *arg) { + struct vm_area_struct *vma; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1663,7 +1664,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) if (list_empty(&mapping->i_mmap_nonlinear)) goto done; - ret = rwc->file_nonlinear(page, mapping, vma); + ret = rwc->file_nonlinear(page, mapping, rwc->arg); done: mutex_unlock(&mapping->i_mmap_mutex); -- cgit v1.2.3 From 41f50094b2ab4e37673e41a084ea61b907447159 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 24 Mar 2014 21:37:02 +0100 Subject: workqueue: Spelling s/instensive/intensive/ Signed-off-by: Geert Uytterhoeven Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 29da9e77c3bb..0523eab05f63 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -294,7 +294,7 @@ enum { WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ - WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ + WQ_CPU_INTENSIVE = 1 << 5, /* cpu intensive workqueue */ WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */ /* -- cgit v1.2.3 From ea2e64f280d2a34a8ed9ae3d783cd770d14b70ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Mar 2014 14:20:44 
+0000 Subject: workqueue: Provide destroy_delayed_work_on_stack() If a delayed or deferrable work is on stack we need to tell debug objects that we are destroying the timer and the work. Otherwise we leak the tracking object. Signed-off-by: Thomas Gleixner Cc: Vince Weaver Acked-by: Tejun Heo Link: http://lkml.kernel.org/r/20140323141939.911487677@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/workqueue.h | 2 ++ kernel/workqueue.c | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 594521ba0d43..abdbe5af119d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -191,6 +191,7 @@ struct execute_work { #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); +extern void destroy_delayed_work_on_stack(struct delayed_work *work); static inline unsigned int work_static(struct work_struct *work) { return *work_data_bits(work) & WORK_STRUCT_STATIC; @@ -198,6 +199,7 @@ static inline unsigned int work_static(struct work_struct *work) #else static inline void __init_work(struct work_struct *work, int onstack) { } static inline void destroy_work_on_stack(struct work_struct *work) { } +static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { } static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 82ef9f3b7473..5b690b5a9e74 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -516,6 +516,13 @@ void destroy_work_on_stack(struct work_struct *work) } EXPORT_SYMBOL_GPL(destroy_work_on_stack); +void destroy_delayed_work_on_stack(struct delayed_work *work) +{ + destroy_timer_on_stack(&work->timer); + debug_object_free(&work->work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); + #else static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } -- cgit v1.2.3 From 14a0d635d18d0fb552dcc979d6d25106e6541f2e Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 26 Mar 2014 14:32:51 +0100 Subject: usbnet: include wait queue head in device structure This fixes a race which happens by freeing an object on the stack. Quoting Julius: > The issue is > that it calls usbnet_terminate_urbs() before that, which temporarily > installs a waitqueue in dev->wait in order to be able to wait on the > tasklet to run and finish up some queues. The waiting itself looks > okay, but the access to 'dev->wait' is totally unprotected and can > race arbitrarily. I think in this case usbnet_bh() managed to succeed > it's dev->wait check just before usbnet_terminate_urbs() sets it back > to NULL. The latter then finishes and the waitqueue_t structure on its > stack gets overwritten by other functions halfway through the > wake_up() call in usbnet_bh(). The fix is to just not allocate the data structure on the stack. As dev->wait is abused as a flag it also takes a runtime PM change to fix this bug. Signed-off-by: Oliver Neukum Reported-by: Grant Grundler Tested-by: Grant Grundler Signed-off-by: David S. 
Miller --- drivers/net/usb/usbnet.c | 33 +++++++++++++++++++-------------- include/linux/usb/usbnet.h | 2 +- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index dd10d5817d2a..f9e96c427558 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -752,14 +752,12 @@ EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs); // precondition: never called in_interrupt static void usbnet_terminate_urbs(struct usbnet *dev) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(unlink_wakeup); DECLARE_WAITQUEUE(wait, current); int temp; /* ensure there are no more active urbs */ - add_wait_queue(&unlink_wakeup, &wait); + add_wait_queue(&dev->wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - dev->wait = &unlink_wakeup; temp = unlink_urbs(dev, &dev->txq) + unlink_urbs(dev, &dev->rxq); @@ -773,15 +771,14 @@ static void usbnet_terminate_urbs(struct usbnet *dev) "waited for %d urb completions\n", temp); } set_current_state(TASK_RUNNING); - dev->wait = NULL; - remove_wait_queue(&unlink_wakeup, &wait); + remove_wait_queue(&dev->wait, &wait); } int usbnet_stop (struct net_device *net) { struct usbnet *dev = netdev_priv(net); struct driver_info *info = dev->driver_info; - int retval; + int retval, pm; clear_bit(EVENT_DEV_OPEN, &dev->flags); netif_stop_queue (net); @@ -791,6 +788,8 @@ int usbnet_stop (struct net_device *net) net->stats.rx_packets, net->stats.tx_packets, net->stats.rx_errors, net->stats.tx_errors); + /* to not race resume */ + pm = usb_autopm_get_interface(dev->intf); /* allow minidriver to stop correctly (wireless devices to turn off * radio etc) */ if (info->stop) { @@ -817,6 +816,9 @@ int usbnet_stop (struct net_device *net) dev->flags = 0; del_timer_sync (&dev->delay); tasklet_kill (&dev->bh); + if (!pm) + usb_autopm_put_interface(dev->intf); + if (info->manage_power && !test_and_clear_bit(EVENT_NO_RUNTIME_PM, &dev->flags)) info->manage_power(dev, 0); @@ -1437,11 +1439,12 @@ static void usbnet_bh (unsigned long param) /* restart RX again after disabling due to high error rate */ clear_bit(EVENT_RX_KILL, &dev->flags); - // waiting for all pending urbs to complete? - if (dev->wait) { - if ((dev->txq.qlen + dev->rxq.qlen + dev->done.qlen) == 0) { - wake_up (dev->wait); - } + /* waiting for all pending urbs to complete? + * only then can we forgo submitting anew + */ + if (waitqueue_active(&dev->wait)) { + if (dev->txq.qlen + dev->rxq.qlen + dev->done.qlen == 0) + wake_up_all(&dev->wait); // or are we maybe short a few urbs? 
} else if (netif_running (dev->net) && @@ -1580,6 +1583,7 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) dev->driver_name = name; dev->msg_enable = netif_msg_init (msg_level, NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK); + init_waitqueue_head(&dev->wait); skb_queue_head_init (&dev->rxq); skb_queue_head_init (&dev->txq); skb_queue_head_init (&dev->done); @@ -1791,9 +1795,10 @@ int usbnet_resume (struct usb_interface *intf) spin_unlock_irq(&dev->txq.lock); if (test_bit(EVENT_DEV_OPEN, &dev->flags)) { - /* handle remote wakeup ASAP */ - if (!dev->wait && - netif_device_present(dev->net) && + /* handle remote wakeup ASAP + * we cannot race against stop + */ + if (netif_device_present(dev->net) && !timer_pending(&dev->delay) && !test_bit(EVENT_RX_HALT, &dev->flags)) rx_alloc_submit(dev, GFP_NOIO); diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index e303eef94dd5..0662e98fef72 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -30,7 +30,7 @@ struct usbnet { struct driver_info *driver_info; const char *driver_name; void *driver_priv; - wait_queue_head_t *wait; + wait_queue_head_t wait; struct mutex phy_mutex; unsigned char suspend_count; unsigned char pkt_cnt, pkt_err; -- cgit v1.2.3 From 36d5fe6a000790f56039afe26834265db0a3ad4c Mon Sep 17 00:00:00 2001 From: Zoltan Kiss Date: Wed, 26 Mar 2014 22:37:45 +0000 Subject: core, nfqueue, openvswitch: Orphan frags in skb_zerocopy and handle errors skb_zerocopy can copy elements of the frags array between skbs, but it doesn't orphan them. Also, it doesn't handle errors, so this patch takes care of that as well, and modify the callers accordingly. skb_tx_error() is also added to the callers so they will signal the failed delivery towards the creator of the skb. Signed-off-by: Zoltan Kiss Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- net/core/skbuff.c | 27 ++++++++++++++++++++------- net/netfilter/nfnetlink_queue_core.c | 9 +++++++-- net/openvswitch/datapath.c | 6 +++++- 4 files changed, 34 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5e1e6f2d98c2..15ede6a823a6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2451,8 +2451,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, unsigned int flags); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); -void skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, - int len, int hlen); +int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, + int len, int hlen); void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 869c7afe3b07..97e5a2c3d947 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2127,25 +2127,31 @@ EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); * * The `hlen` as calculated by skb_zerocopy_headlen() specifies the * headroom in the `to` buffer. 
+ * + * Return value: + * 0: everything is OK + * -ENOMEM: couldn't orphan frags of @from due to lack of memory + * -EFAULT: skb_copy_bits() found some problem with skb geometry */ -void -skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) +int +skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) { int i, j = 0; int plen = 0; /* length of skb->head fragment */ + int ret; struct page *page; unsigned int offset; BUG_ON(!from->head_frag && !hlen); /* dont bother with small payloads */ - if (len <= skb_tailroom(to)) { - skb_copy_bits(from, 0, skb_put(to, len), len); - return; - } + if (len <= skb_tailroom(to)) + return skb_copy_bits(from, 0, skb_put(to, len), len); if (hlen) { - skb_copy_bits(from, 0, skb_put(to, hlen), hlen); + ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); + if (unlikely(ret)) + return ret; len -= hlen; } else { plen = min_t(int, skb_headlen(from), len); @@ -2163,6 +2169,11 @@ skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) to->len += len + plen; to->data_len += len + plen; + if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { + skb_tx_error(from); + return -ENOMEM; + } + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { if (!len) break; @@ -2173,6 +2184,8 @@ skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) j++; } skb_shinfo(to)->nr_frags = j; + + return 0; } EXPORT_SYMBOL_GPL(skb_zerocopy); diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index f072fe803510..108120f216b1 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -354,13 +354,16 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, GFP_ATOMIC); - if (!skb) + if (!skb) { + skb_tx_error(entskb); return NULL; + } nlh = nlmsg_put(skb, 0, 0, NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, sizeof(struct nfgenmsg), 0); if (!nlh) { + skb_tx_error(entskb); kfree_skb(skb); return NULL; } @@ -488,13 +491,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nla->nla_type = NFQA_PAYLOAD; nla->nla_len = nla_attr_size(data_len); - skb_zerocopy(skb, entskb, data_len, hlen); + if (skb_zerocopy(skb, entskb, data_len, hlen)) + goto nla_put_failure; } nlh->nlmsg_len = skb->len; return skb; nla_put_failure: + skb_tx_error(entskb); kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); return NULL; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 8601b320b443..270b77dfac30 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -464,7 +464,9 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, } nla->nla_len = nla_attr_size(skb->len); - skb_zerocopy(user_skb, skb, skb->len, hlen); + err = skb_zerocopy(user_skb, skb, skb->len, hlen); + if (err) + goto out; /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { @@ -478,6 +480,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); out: + if (err) + skb_tx_error(skb); kfree_skb(nskb); return err; } -- cgit v1.2.3 From 53d6471cef17262d3ad1c7ce8982a234244f68ec Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Thu, 27 Mar 2014 17:26:18 -0400 Subject: net: Account for all vlan headers in skb_mac_gso_segment skb_network_protocol() already 
accounts for multiple vlan headers that may be present in the skb. However, skb_mac_gso_segment() doesn't know anything about it and assumes that skb->mac_len is set correctly to skip all mac headers. That may not always be the case. If we are simply forwarding the packet (via bridge or macvtap), all vlan headers may not be accounted for. A simple solution is to allow skb_network_protocol to return the vlan depth it has calculated. This way skb_mac_gso_segment will correctly skip all mac headers. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/dev.c | 13 +++++++++---- net/core/skbuff.c | 3 ++- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e8eeebd49a98..daafd9561cbc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3014,7 +3014,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { return __skb_gso_segment(skb, features, true); } -__be16 skb_network_protocol(struct sk_buff *skb); +__be16 skb_network_protocol(struct sk_buff *skb, int *depth); static inline bool can_checksum_protocol(netdev_features_t features, __be16 protocol) diff --git a/net/core/dev.c b/net/core/dev.c index b1b0c8d4d7df..45fa2f11f84d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2286,7 +2286,7 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -__be16 skb_network_protocol(struct sk_buff *skb) +__be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; int vlan_depth = ETH_HLEN; @@ -2313,6 +2313,8 @@ __be16 skb_network_protocol(struct sk_buff *skb) vlan_depth += VLAN_HLEN; } + *depth = vlan_depth; + return type; } @@ -2326,12 +2328,13 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; - __be16 type = skb_network_protocol(skb); + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); if (unlikely(!type)) return ERR_PTR(-EINVAL); - __skb_pull(skb, skb->mac_len); + __skb_pull(skb, vlan_depth); rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { @@ -2498,8 +2501,10 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, const struct net_device *dev, netdev_features_t features) { + int tmp; + if (skb->ip_summed != CHECKSUM_NONE && - !can_checksum_protocol(features, skb_network_protocol(skb))) { + !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) { features &= ~NETIF_F_ALL_CSUM; } else if (illegal_highdma(dev, skb)) { features &= ~NETIF_F_SG; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 97e5a2c3d947..90b96a11b974 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2879,8 +2879,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int err = -ENOMEM; int i = 0; int pos; + int dummy; - proto = skb_network_protocol(head_skb); + proto = skb_network_protocol(head_skb, &dummy); if (unlikely(!proto)) return ERR_PTR(-EINVAL); -- cgit v1.2.3 From 2adb956b084d6d49f519541a4b5f9947e96f8ef7 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Thu, 27 Mar 2014 22:14:49 -0400 Subject: vlan: Warn the user if lowerdev has bad vlan features. Some drivers incorrectly assign vlan acceleration features to vlan_features thus causing issues for Q-in-Q vlan configurations. Warn the user of such cases. Signed-off-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- include/linux/netdev_features.h | 7 +++++++ net/8021q/vlan_dev.c | 3 +++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 1005ebf17575..5a09a48f2658 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -163,4 +163,11 @@ enum { /* changeable features with no special hardware requirements */ #define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO) +#define NETIF_F_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \ + NETIF_F_HW_VLAN_CTAG_RX | \ + NETIF_F_HW_VLAN_CTAG_TX | \ + NETIF_F_HW_VLAN_STAG_FILTER | \ + NETIF_F_HW_VLAN_STAG_RX | \ + NETIF_F_HW_VLAN_STAG_TX) + #endif /* _LINUX_NETDEV_FEATURES_H */ diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index a9591ff2b678..27bfe2f8e2de 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -578,6 +578,9 @@ static int vlan_dev_init(struct net_device *dev) dev->features |= real_dev->vlan_features | NETIF_F_LLTX; dev->gso_max_size = real_dev->gso_max_size; + if (dev->features & NETIF_F_VLAN_FEATURES) + netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); + /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; -- cgit v1.2.3 From 59ff3eb6d6f75c6c1c3ea8b46ac2cc64eb216547 Mon Sep 17 00:00:00 2001 From: ZhangZhen Date: Thu, 27 Mar 2014 09:41:47 +0800 Subject: workqueue: remove deprecated WQ_NON_REENTRANT Tejun Heo has made WQ_NON_REENTRANT useless in the dbf2576e37 ("workqueue: make all workqueues non-reentrant"). So remove its usages and definition. This patch doesn't introduce any behavior changes. tj: minor description updates. Signed-off-by: ZhangZhen Sigend-off-by: Tejun Heo Acked-by: James Chapman Acked-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 2 +- include/linux/workqueue.h | 6 ------ net/l2tp/l2tp_core.c | 2 +- 3 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 55cd110a49c4..c204b7d1532c 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -2607,7 +2607,7 @@ int dw_mci_probe(struct dw_mci *host) tasklet_init(&host->tasklet, dw_mci_tasklet_func, (unsigned long)host); host->card_workqueue = alloc_workqueue("dw-mci-card", - WQ_MEM_RECLAIM | WQ_NON_REENTRANT, 1); + WQ_MEM_RECLAIM, 1); if (!host->card_workqueue) { ret = -ENOMEM; goto err_dmaunmap; diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0523eab05f63..532994651684 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -284,12 +284,6 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } * Documentation/workqueue.txt. */ enum { - /* - * All wqs are now non-reentrant making the following flag - * meaningless. Will be removed. 
- */ - WQ_NON_REENTRANT = 1 << 0, /* DEPRECATED */ - WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 735d0f60c83a..fcbd63ea4909 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -2016,7 +2016,7 @@ static int __init l2tp_init(void) if (rc) goto out; - l2tp_wq = alloc_workqueue("l2tp", WQ_NON_REENTRANT | WQ_UNBOUND, 0); + l2tp_wq = alloc_workqueue("l2tp", WQ_UNBOUND, 0); if (!l2tp_wq) { pr_err("alloc_workqueue failed\n"); rc = -ENOMEM; -- cgit v1.2.3 From 00a1a053ebe5febcfc2ec498bd894f035ad2aa06 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 30 Mar 2014 10:20:01 -0400 Subject: ext4: atomically set inode->i_flags in ext4_set_inode_flags() Use cmpxchg() to atomically set i_flags instead of clearing out the S_IMMUTABLE, S_APPEND, etc. flags and then setting them from the EXT4_IMMUTABLE_FL, EXT4_APPEND_FL flags, since this opens up a race where an immutable file has the immutable flag cleared for a brief window of time. Reported-by: John Sullivan Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 15 +++++++++------ include/linux/bitops.h | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6e39895a91b8..24bfd7ff3049 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "xattr.h" @@ -3921,18 +3922,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) void ext4_set_inode_flags(struct inode *inode) { unsigned int flags = EXT4_I(inode)->i_flags; + unsigned int new_fl = 0; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); if (flags & EXT4_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + set_mask_bits(&inode->i_flags, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ diff --git a/include/linux/bitops.h b/include/linux/bitops.h index abc9ca778456..be5fd38bd5a0 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -196,6 +196,21 @@ static inline unsigned long __ffs64(u64 word) #ifdef __KERNEL__ +#ifndef set_mask_bits +#define set_mask_bits(ptr, _mask, _bits) \ +({ \ + const typeof(*ptr) mask = (_mask), bits = (_bits); \ + typeof(*ptr) old, new; \ + \ + do { \ + old = ACCESS_ONCE(*ptr); \ + new = (old & ~mask) | bits; \ + } while (cmpxchg(ptr, old, new) != old); \ + \ + new; \ +}) +#endif + #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region -- cgit v1.2.3
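As a closing aside on the set_mask_bits() helper added above: the cmpxchg() retry loop it wraps is a generic read-modify-write pattern. The following is a rough userspace analogue built on the GCC/Clang __atomic builtins, offered purely as a sketch for readers unfamiliar with the idiom; atomic_set_mask_bits() is a made-up name and this is not kernel code:

    /*
     * Atomically replace the bits selected by 'mask' with 'bits',
     * retrying until no other thread modified *ptr between the load
     * and the compare-and-swap. Returns the newly stored value.
     */
    static unsigned long atomic_set_mask_bits(unsigned long *ptr,
                                              unsigned long mask,
                                              unsigned long bits)
    {
            unsigned long old, new;

            old = __atomic_load_n(ptr, __ATOMIC_RELAXED);
            do {
                    new = (old & ~mask) | bits;
                    /* On failure, 'old' is refreshed with the current value. */
            } while (!__atomic_compare_exchange_n(ptr, &old, new, 0,
                                                  __ATOMIC_SEQ_CST,
                                                  __ATOMIC_SEQ_CST));
            return new;
    }

    int main(void)
    {
            unsigned long flags = 0x0f;

            /* Replace the low nibble with 0x5; other bits are untouched. */
            atomic_set_mask_bits(&flags, 0x0f, 0x05);
            return flags == 0x05 ? 0 : 1;
    }

Because the flag word is never stored with the protected bits cleared, no concurrent reader can observe a window in which, say, an immutable flag has transiently disappeared; that is exactly the race the ext4 patch closes.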