From 967fc04671feea4dbf780c9e55a0bc8fcf68a14e Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:52 -0500 Subject: sched: add sched_class->needs_post_schedule() member We currently run class->post_schedule() outside of the rq->lock, which means that we need to test for the need to post_schedule outside of the lock to avoid a forced reacquistion. This is currently not a problem as we only look at rq->rt.overloaded. However, we want to enhance this going forward to look at more state to reduce the need to post_schedule to a bare minimum set. Therefore, we introduce a new member-func called needs_post_schedule() which tests for the post_schedule condtion without actually performing the work. Therefore it is safe to call this function before the rq->lock is released, because we are guaranteed not to drop the lock at an intermediate point (such as what post_schedule() may do). We will use this later in the series [ rostedt: removed paranoid BUG_ON ] Signed-off-by: Gregory Haskins --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index e5f928a079e8..836a86c32a65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1012,6 +1012,7 @@ struct sched_class { struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + int (*needs_post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); -- cgit v1.2.3 From 917b627d4d981dc614519d7b34ea31a976b14e12 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:53 -0500 Subject: sched: create "pushable_tasks" list to limit pushing to one attempt The RT scheduler employs a "push/pull" design to actively balance tasks within the system (on a per disjoint cpuset basis). When a task is awoken, it is immediately determined if there are any lower priority cpus which should be preempted. This is opposed to the way normal SCHED_OTHER tasks behave, which will wait for a periodic rebalancing operation to occur before spreading out load. When a particular RQ has more than 1 active RT task, it is said to be in an "overloaded" state. Once this occurs, the system enters the active balancing mode, where it will try to push the task away, or persuade a different cpu to pull it over. The system will stay in this state until the system falls back below the <= 1 queued RT task per RQ. However, the current implementation suffers from a limitation in the push logic. Once overloaded, all tasks (other than current) on the RQ are analyzed on every push operation, even if it was previously unpushable (due to affinity, etc). Whats more, the operation stops at the first task that is unpushable and will not look at items lower in the queue. This causes two problems: 1) We can have the same tasks analyzed over and over again during each push, which extends out the fast path in the scheduler for no gain. Consider a RQ that has dozens of tasks that are bound to a core. Each one of those tasks will be encountered and skipped for each push operation while they are queued. 2) There may be lower-priority tasks under the unpushable task that could have been successfully pushed, but will never be considered until either the unpushable task is cleared, or a pull operation succeeds. The net result is a potential latency source for mid priority tasks. This patch aims to rectify these two conditions by introducing a new priority sorted list: "pushable_tasks". A task is added to the list each time a task is activated or preempted. It is removed from the list any time it is deactivated, made current, or fails to push. This works because a task only needs to be attempted to push once. After an initial failure to push, the other cpus will eventually try to pull the task when the conditions are proper. This also solves the problem that we don't completely analyze all tasks due to encountering an unpushable tasks. Now every task will have a push attempted (when appropriate). This reduces latency both by shorting the critical section of the rq->lock for certain workloads, and by making sure the algorithm considers all eligible tasks in the system. [ rostedt: added a couple more BUG_ONs ] Signed-off-by: Gregory Haskins Acked-by: Steven Rostedt --- include/linux/init_task.h | 1 + include/linux/sched.h | 1 + kernel/sched.c | 4 ++ kernel/sched_rt.c | 119 +++++++++++++++++++++++++++++++++++++++------- 4 files changed, 107 insertions(+), 18 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e5..6851225f44a7 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -140,6 +140,7 @@ extern struct group_info init_groups; .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ .real_parent = &tsk, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 836a86c32a65..440cabb2d432 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1179,6 +1179,7 @@ struct task_struct { #endif struct list_head tasks; + struct plist_node pushable_tasks; struct mm_struct *mm, *active_mm; diff --git a/kernel/sched.c b/kernel/sched.c index 3acbad8991a2..24ab80c28765 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -471,6 +471,7 @@ struct rt_rq { #ifdef CONFIG_SMP unsigned long rt_nr_migratory; int overloaded; + struct plist_head pushable_tasks; #endif int rt_throttled; u64 rt_time; @@ -2481,6 +2482,8 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif + plist_node_init(&p->pushable_tasks, MAX_PRIO); + put_cpu(); } @@ -8237,6 +8240,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; + plist_head_init(&rq->rt.pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b0b6ea4ed674..fe9da6084c87 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -49,6 +49,24 @@ static void update_rt_migration(struct rq *rq) rq->rt.overloaded = 0; } } + +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); + plist_node_init(&p->pushable_tasks, p->prio); + plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +#else + +#define enqueue_pushable_task(rq, p) do { } while (0) +#define dequeue_pushable_task(rq, p) do { } while (0) + #endif /* CONFIG_SMP */ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) @@ -751,6 +769,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) enqueue_rt_entity(rt_se); + if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); + inc_cpu_load(rq, p->se.load.weight); } @@ -761,6 +782,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) update_curr_rt(rq); dequeue_rt_entity(rt_se); + dequeue_pushable_task(rq, p); + dec_cpu_load(rq, p->se.load.weight); } @@ -911,7 +934,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, return next; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; @@ -933,6 +956,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) p = rt_task_of(rt_se); p->se.exec_start = rq->clock; + + return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct task_struct *p = _pick_next_task_rt(rq); + + /* The running task is never eligible for pushing */ + if (p) + dequeue_pushable_task(rq, p); + return p; } @@ -940,6 +975,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); p->se.exec_start = 0; + + /* + * The previous task needs to be made eligible for pushing + * if it is still active + */ + if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); } #ifdef CONFIG_SMP @@ -1116,6 +1158,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->rt.nr_cpus_allowed <= 1); + + BUG_ON(!p->se.on_rq); + BUG_ON(!rt_task(p)); + + return p; +} + /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task @@ -1125,13 +1192,12 @@ static int push_rt_task(struct rq *rq) { struct task_struct *next_task; struct rq *lowest_rq; - int ret = 0; int paranoid = RT_MAX_TRIES; if (!rq->rt.overloaded) return 0; - next_task = pick_next_highest_task_rt(rq, -1); + next_task = pick_next_pushable_task(rq); if (!next_task) return 0; @@ -1163,12 +1229,19 @@ static int push_rt_task(struct rq *rq) * so it is possible that next_task has changed. * If it has, then try again. */ - task = pick_next_highest_task_rt(rq, -1); + task = pick_next_pushable_task(rq); if (unlikely(task != next_task) && task && paranoid--) { put_task_struct(next_task); next_task = task; goto retry; } + + /* + * Once we have failed to push this task, we will not + * try again, since the other cpus will pull from us + * when they are ready + */ + dequeue_pushable_task(rq, next_task); goto out; } @@ -1180,23 +1253,12 @@ static int push_rt_task(struct rq *rq) double_unlock_balance(rq, lowest_rq); - ret = 1; out: put_task_struct(next_task); - return ret; + return 1; } -/* - * TODO: Currently we just use the second highest prio task on - * the queue, and stop when it can't migrate (or there's - * no more RT tasks). There may be a case where a lower - * priority RT task has a different affinity than the - * higher RT task. In this case the lower RT task could - * possibly be able to migrate where as the higher priority - * RT task could not. We currently ignore this issue. - * Enhancements are welcome! - */ static void push_rt_tasks(struct rq *rq) { /* push_rt_task will return true if it moved an RT */ @@ -1295,7 +1357,7 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) */ static int needs_post_schedule_rt(struct rq *rq) { - return rq->rt.overloaded ? 1 : 0; + return has_pushable_tasks(rq); } static void post_schedule_rt(struct rq *rq) @@ -1317,7 +1379,7 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - rq->rt.overloaded && + has_pushable_tasks(rq) && p->rt.nr_cpus_allowed > 1) push_rt_tasks(rq); } @@ -1354,6 +1416,24 @@ static void set_cpus_allowed_rt(struct task_struct *p, if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); + if (!task_current(rq, p)) { + /* + * Make sure we dequeue this task from the pushable list + * before going further. It will either remain off of + * the list because we are no longer pushable, or it + * will be requeued. + */ + if (p->rt.nr_cpus_allowed > 1) + dequeue_pushable_task(rq, p); + + /* + * Requeue if our weight is changing and still > 1 + */ + if (weight > 1) + enqueue_pushable_task(rq, p); + + } + if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { rq->rt.rt_nr_migratory++; } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { @@ -1538,6 +1618,9 @@ static void set_curr_task_rt(struct rq *rq) struct task_struct *p = rq->curr; p->se.exec_start = rq->clock; + + /* The running task is never eligible for pushing */ + dequeue_pushable_task(rq, p); } static const struct sched_class rt_sched_class = { -- cgit v1.2.3 From 41719b03091911028116155deddc5eedf8c45e37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 15:36:26 +0100 Subject: mutex: preemption fixes The problem is that dropping the spinlock right before schedule is a voluntary preemption point and can cause a schedule, right after which we schedule again. Fix this inefficiency by keeping preemption disabled until we schedule, do this by explicity disabling preemption and providing a schedule() variant that assumes preemption is already disabled. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/mutex.c | 5 ++++- kernel/sched.c | 10 +++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..9f0b372cfa6f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -328,6 +328,7 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +asmlinkage void __schedule(void); asmlinkage void schedule(void); struct nsproxy; diff --git a/kernel/mutex.c b/kernel/mutex.c index 357c6d221efe..524ffc33dc05 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -131,6 +131,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct mutex_waiter waiter; unsigned long flags; + preempt_disable(); spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); @@ -170,13 +171,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return -EINTR; } __set_task_state(task, state); /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - schedule(); + __schedule(); spin_lock_mutex(&lock->wait_lock, flags); } @@ -193,6 +195,7 @@ done: spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return 0; } diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d0..b001c133c359 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4538,15 +4538,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -4603,7 +4601,13 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; +} +asmlinkage void __sched schedule(void) +{ +need_resched: + preempt_disable(); + __schedule(); preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; -- cgit v1.2.3 From 0d66bf6d3514b35eb6897629059443132992dbd7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 Jan 2009 14:01:47 +0100 Subject: mutex: implement adaptive spinning Change mutex contention behaviour such that it will sometimes busy wait on acquisition - moving its behaviour closer to that of spinlocks. This concept got ported to mainline from the -rt tree, where it was originally implemented for rtmutexes by Steven Rostedt, based on work by Gregory Haskins. Testing with Ingo's test-mutex application (http://lkml.org/lkml/2006/1/8/50) gave a 345% boost for VFS scalability on my testbox: # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 296604 # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 85870 The key criteria for the busy wait is that the lock owner has to be running on a (different) cpu. The idea is that as long as the owner is running, there is a fair chance it'll release the lock soon, and thus we'll be better off spinning instead of blocking/scheduling. Since regular mutexes (as opposed to rtmutexes) do not atomically track the owner, we add the owner in a non-atomic fashion and deal with the races in the slowpath. Furthermore, to ease the testing of the performance impact of this new code, there is means to disable this behaviour runtime (without having to reboot the system), when scheduler debugging is enabled (CONFIG_SCHED_DEBUG=y), by issuing the following command: # echo NO_OWNER_SPIN > /debug/sched_features This command re-enables spinning again (this is also the default): # echo OWNER_SPIN > /debug/sched_features Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 5 ++- include/linux/sched.h | 1 + kernel/mutex-debug.c | 9 +--- kernel/mutex-debug.h | 18 ++++---- kernel/mutex.c | 115 +++++++++++++++++++++++++++++++++++++++++++----- kernel/mutex.h | 22 ++++++++- kernel/sched.c | 61 +++++++++++++++++++++++++ kernel/sched_features.h | 1 + 8 files changed, 201 insertions(+), 31 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 7a0e5c4f8072..3069ec7e0ab8 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -50,8 +50,10 @@ struct mutex { atomic_t count; spinlock_t wait_lock; struct list_head wait_list; -#ifdef CONFIG_DEBUG_MUTEXES +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) struct thread_info *owner; +#endif +#ifdef CONFIG_DEBUG_MUTEXES const char *name; void *magic; #endif @@ -68,7 +70,6 @@ struct mutex_waiter { struct list_head list; struct task_struct *task; #ifdef CONFIG_DEBUG_MUTEXES - struct mutex *lock; void *magic; #endif }; diff --git a/include/linux/sched.h b/include/linux/sched.h index 9f0b372cfa6f..c34b137cd1e5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -330,6 +330,7 @@ extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void __schedule(void); asmlinkage void schedule(void); +extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; struct user_namespace; diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 1d94160eb532..50d022e5a560 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -26,11 +26,6 @@ /* * Must be called with lock->wait_lock held. */ -void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) -{ - lock->owner = new_owner; -} - void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); @@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, /* Mark the current thread as blocked on the lock: */ ti->task->blocked_on = waiter; - waiter->lock = lock; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, @@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(lock->magic != lock); DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + mutex_clear_owner(lock); } void debug_mutex_init(struct mutex *lock, const char *name, @@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name, debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->owner = NULL; lock->magic = lock; } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index babfbdfc534b..6b2d735846a5 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -13,14 +13,6 @@ /* * This must be called with lock->wait_lock held. */ -extern void -debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); - -static inline void debug_mutex_clear_owner(struct mutex *lock) -{ - lock->owner = NULL; -} - extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); extern void debug_mutex_wake_waiter(struct mutex *lock, @@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} + #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 524ffc33dc05..ff42e975590c 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -10,6 +10,11 @@ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and * David Howells for suggestions and improvements. * + * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline + * from the -rt tree, where it was originally implemented for rtmutexes + * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + * and Sven Dietrich. + * * Also see Documentation/mutex-design.txt. */ #include @@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) atomic_set(&lock->count, 1); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); + mutex_clear_owner(lock); debug_mutex_init(lock, name, key); } @@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock) * 'unlocked' into 'locked' state. */ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + mutex_set_owner(lock); } EXPORT_SYMBOL(mutex_lock); @@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock) * The unlocking fastpath is the 0->1 transition from 'locked' * into 'unlocked' state: */ +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(lock); +#endif __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); } @@ -132,10 +147,71 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, unsigned long flags; preempt_disable(); + mutex_acquire(&lock->dep_map, subclass, 0, ip); +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) + /* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that there are no + * pending waiters and the lock owner is currently running on a + * (different) CPU. + * + * The rationale is that if the lock owner is running, it is likely to + * release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn't track the owner atomically in the lock field, we need to + * track it non-atomically. + * + * We can't do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + */ + + for (;;) { + struct thread_info *owner; + + /* + * If there are pending waiters, join them. + */ + if (!list_empty(&lock->wait_list)) + break; + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + lock_acquired(&lock->dep_map, ip); + mutex_set_owner(lock); + preempt_enable(); + return 0; + } + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } +#endif spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); - mutex_acquire(&lock->dep_map, subclass, 0, ip); debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); /* add waiting tasks to the end of the waitqueue (FIFO): */ @@ -185,8 +261,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, done: lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! */ - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); - debug_mutex_set_owner(lock, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_set_owner(lock); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -222,7 +298,8 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, + subclass, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -260,8 +337,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) wake_up_process(waiter->task); } - debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock, flags); } @@ -298,18 +373,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count); */ int __sched mutex_lock_interruptible(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_interruptible_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_killable_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_killable); @@ -352,9 +439,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) { - debug_mutex_set_owner(lock, current_thread_info()); + mutex_set_owner(lock); mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); } + /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -380,8 +468,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) */ int __sched mutex_trylock(struct mutex *lock) { - return __mutex_fastpath_trylock(&lock->count, - __mutex_trylock_slowpath); + int ret; + + ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); + if (ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_trylock); diff --git a/kernel/mutex.h b/kernel/mutex.h index a075dafbb290..67578ca48f94 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -16,8 +16,26 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#define debug_mutex_set_owner(lock, new_owner) do { } while (0) -#define debug_mutex_clear_owner(lock) do { } while (0) +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif + #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) diff --git a/kernel/sched.c b/kernel/sched.c index b001c133c359..589e7308c615 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4614,6 +4614,67 @@ need_resched: } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +{ + unsigned int cpu; + struct rq *rq; + + if (!sched_feat(OWNER_SPIN)) + return 0; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * Need to access the cpu field knowing that + * DEBUG_PAGEALLOC could have unmapped it if + * the mutex owner just released it and exited. + */ + if (probe_kernel_address(&owner->cpu, cpu)) + goto out; +#else + cpu = owner->cpu; +#endif + + /* + * Even if the access succeeded (likely case), + * the cpu field may no longer be valid. + */ + if (cpu >= nr_cpumask_bits) + goto out; + + /* + * We need to validate that we can do a + * get_cpu() and that we have the percpu area. + */ + if (!cpu_online(cpu)) + goto out; + + rq = cpu_rq(cpu); + + for (;;) { + /* + * Owner changed, break to re-assess state. + */ + if (lock->owner != owner) + break; + + /* + * Is that owner really running on that cpu? + */ + if (task_thread_info(rq->curr) != owner || need_resched()) + return 0; + + cpu_relax(); + } +out: + return 1; +} +#endif + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption diff --git a/kernel/sched_features.h b/kernel/sched_features.h index da5d93b5d2c6..07bc02e99ab1 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -13,3 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) SCHED_FEAT(WAKEUP_OVERLAP, 0) SCHED_FEAT(LAST_BUDDY, 1) +SCHED_FEAT(OWNER_SPIN, 1) -- cgit v1.2.3 From 831451ac4e44d3a20b581ce726ef1d1144373f7d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 12:39:18 +0100 Subject: sched: introduce avg_wakeup Introduce a new avg_wakeup statistic. avg_wakeup is a measure of how frequently a task wakes up other tasks, it represents the average time between wakeups, with a limit of avg_runtime for when it doesn't wake up anybody. Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched.c | 36 ++++++++++++++++++++++++++++++------ kernel/sched_debug.c | 1 + 3 files changed, 34 insertions(+), 6 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..daf4e07bc978 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1046,6 +1046,9 @@ struct sched_entity { u64 exec_max; u64 slice_max; + u64 start_runtime; + u64 avg_wakeup; + u64 nr_migrations; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d0..86f5a063f0b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1705,6 +1705,9 @@ static void update_avg(u64 *avg, u64 sample) static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { + if (wakeup) + p->se.start_runtime = p->se.sum_exec_runtime; + sched_info_queued(p); p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; @@ -1712,10 +1715,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - if (sleep && p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; + if (sleep) { + if (p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } else { + update_avg(&p->se.avg_wakeup, + sysctl_sched_wakeup_granularity); + } } sched_info_dequeued(p); @@ -2345,6 +2353,22 @@ out_activate: activate_task(rq, p, 1); success = 1; + /* + * Only attribute actual wakeups done by this task. + */ + if (!in_interrupt()) { + struct sched_entity *se = ¤t->se; + u64 sample = se->sum_exec_runtime; + + if (se->last_wakeup) + sample -= se->last_wakeup; + else + sample -= se->start_runtime; + update_avg(&se->avg_wakeup, sample); + + se->last_wakeup = se->sum_exec_runtime; + } + out_running: trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); @@ -2355,8 +2379,6 @@ out_running: p->sched_class->task_wake_up(rq, p); #endif out: - current->se.last_wakeup = current->se.sum_exec_runtime; - task_rq_unlock(rq, &flags); return success; @@ -2386,6 +2408,8 @@ static void __sched_fork(struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; + p->se.start_runtime = 0; + p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 16eeba4e4169..2b1260f0e800 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -397,6 +397,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.vruntime); PN(se.sum_exec_runtime); PN(se.avg_overlap); + PN(se.avg_wakeup); nr_switches = p->nvcsw + p->nivcsw; -- cgit v1.2.3 From 34cb61359b503d7aff6447acb037a5efd6ce93b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 16 Jan 2009 13:36:06 +0100 Subject: sched: fix !CONFIG_SCHEDSTATS build failure Stephen Rothwell reported this linux-next build failure with !CONFIG_SCHEDSTATS: | In file included from kernel/sched.c:1703: | kernel/sched_fair.c: In function 'adaptive_gran': | kernel/sched_fair.c:1324: error: 'struct sched_entity' has no member named 'avg_wakeup' The start_runtime and avg_wakeup metrics are now not just for statistics, but also for scheduling - so they always need to be available. (Also move out the nr_migrations fields - for future perfcounters usage.) Reported-by: Stephen Rothwell Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index daf4e07bc978..5d56b54350a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1031,6 +1031,10 @@ struct sched_entity { u64 last_wakeup; u64 avg_overlap; + u64 start_runtime; + u64 avg_wakeup; + u64 nr_migrations; + #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; @@ -1046,10 +1050,6 @@ struct sched_entity { u64 exec_max; u64 slice_max; - u64 start_runtime; - u64 avg_wakeup; - - u64 nr_migrations; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; -- cgit v1.2.3 From 7e49fcce1bdadd723ae6a0b3b324c4daced61563 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 19:01:40 -0500 Subject: trace, lockdep: manual preempt count adding for local_bh_disable Impact: fix to preempt trace triggering lockdep check_flag failure In local_bh_disable, the use of add_preempt_count causes the preempt tracer to start recording the time preemption is off. But because it already modified the preempt_count to show softirqs disabled, and before it called the lockdep code to handle this, it causes a state that lockdep can not handle. The preempt tracer will reset the ring buffer on start of a trace, and the ring buffer reset code does a spin_lock_irqsave. This calls into lockdep and lockdep will fail when it detects the invalid state of having softirqs disabled but the internal current->softirqs_enabled is still set. The fix is to manually add the SOFTIRQ_OFFSET to preempt count and call the preempt tracer code outside the lockdep critical area. Thanks to Peter Zijlstra for suggesting this solution. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 8 ++++---- kernel/softirq.c | 13 ++++++++++++- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..33085b88f87b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern unsigned long get_parent_ip(unsigned long addr); + struct seq_file; struct cfs_rq; struct task_group; diff --git a/kernel/sched.c b/kernel/sched.c index 52bbf1c842a8..c154825ae753 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4399,10 +4399,7 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - -static inline unsigned long get_parent_ip(unsigned long addr) +unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { addr = CALLER_ADDR2; @@ -4412,6 +4409,9 @@ static inline unsigned long get_parent_ip(unsigned long addr) return addr; } +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + void __kprobes add_preempt_count(int val) { #ifdef CONFIG_DEBUG_PREEMPT diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..6edfc2c11d99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -79,13 +80,23 @@ static void __local_bh_disable(unsigned long ip) WARN_ON_ONCE(in_irq()); raw_local_irq_save(flags); - add_preempt_count(SOFTIRQ_OFFSET); + /* + * The preempt tracer hooks into add_preempt_count and will break + * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET + * is set and before current->softirq_enabled is cleared. + * We must manually increment preempt_count here and manually + * call the trace_preempt_off later. + */ + preempt_count() += SOFTIRQ_OFFSET; /* * Were softirqs turned off above: */ if (softirq_count() == SOFTIRQ_OFFSET) trace_softirqs_off(ip); raw_local_irq_restore(flags); + + if (preempt_count() == SOFTIRQ_OFFSET) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } #else /* !CONFIG_TRACE_IRQFLAGS */ static inline void __local_bh_disable(unsigned long ip) -- cgit v1.2.3 From f9ce1f1cda8b73a36f47e424975a9dfa78b7840c Mon Sep 17 00:00:00 2001 From: Kentaro Takeda Date: Thu, 5 Feb 2009 17:18:11 +0900 Subject: Add in_execve flag into task_struct. This patch allows LSM modules to determine whether current process is in an execve operation or not so that they can behave differently while an execve operation is in progress. This patch is needed by TOMOYO. Please see another patch titled "LSM adapter functions." for backgrounds. Signed-off-by: Tetsuo Handa Signed-off-by: David Howells Signed-off-by: James Morris --- fs/compat.c | 3 +++ fs/exec.c | 3 +++ include/linux/sched.h | 2 ++ 3 files changed, 8 insertions(+) (limited to 'include/linux/sched.h') diff --git a/fs/compat.c b/fs/compat.c index 65a070e705ab..25589f8322f2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1402,6 +1402,7 @@ int compat_do_execve(char * filename, retval = mutex_lock_interruptible(¤t->cred_exec_mutex); if (retval < 0) goto out_free; + current->in_execve = 1; retval = -ENOMEM; bprm->cred = prepare_exec_creds(); @@ -1454,6 +1455,7 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ + current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); free_bprm(bprm); @@ -1470,6 +1472,7 @@ out_file: } out_unlock: + current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); out_free: diff --git a/fs/exec.c b/fs/exec.c index febfd8ed6ad1..9881dc3bb488 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1278,6 +1278,7 @@ int do_execve(char * filename, retval = mutex_lock_interruptible(¤t->cred_exec_mutex); if (retval < 0) goto out_free; + current->in_execve = 1; retval = -ENOMEM; bprm->cred = prepare_exec_creds(); @@ -1331,6 +1332,7 @@ int do_execve(char * filename, goto out; /* execve succeeded */ + current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); free_bprm(bprm); @@ -1349,6 +1351,7 @@ out_file: } out_unlock: + current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); out_free: diff --git a/include/linux/sched.h b/include/linux/sched.h index 2127e959e0f4..397c20cfb6a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1158,6 +1158,8 @@ struct task_struct { /* ??? */ unsigned int personality; unsigned did_exec:1; + unsigned in_execve:1; /* Tell the LSMs that the process is doing an + * execve */ pid_t pid; pid_t tgid; -- cgit v1.2.3 From cf40bd16fdad42c053040bcd3988f5fdedbb6c57 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 21 Jan 2009 08:12:39 +0100 Subject: lockdep: annotate reclaim context (__GFP_NOFS) Here is another version, with the incremental patch rolled up, and added reclaim context annotation to kswapd, and allocation tracing to slab allocators (which may only ever reach the page allocator in rare cases, so it is good to put annotations here too). Haven't tested this version as such, but it should be getting closer to merge worthy ;) -- After noticing some code in mm/filemap.c accidentally perform a __GFP_FS allocation when it should not have been, I thought it might be a good idea to try to catch this kind of thing with lockdep. I coded up a little idea that seems to work. Unfortunately the system has to actually be in __GFP_FS page reclaim, then take the lock, before it will mark it. But at least that might still be some orders of magnitude more common (and more debuggable) than an actual deadlock condition, so we have some improvement I hope (the concept is no less complete than discovery of a lock's interrupt contexts). I guess we could even do the same thing with __GFP_IO (normal reclaim), and even GFP_NOIO locks too... but filesystems will have the most locks and fiddly code paths, so let's start there and see how it goes. It *seems* to work. I did a quick test. ================================= [ INFO: inconsistent lock state ] 2.6.28-rc6-00007-ged31348-dirty #26 --------------------------------- inconsistent {in-reclaim-W} -> {ov-reclaim-W} usage. modprobe/8526 [HC0[0]:SC0[0]:HE1:SE1] takes: (testlock){--..}, at: [] brd_init+0x55/0x216 [brd] {in-reclaim-W} state was registered at: [] __lock_acquire+0x75b/0x1a60 [] lock_acquire+0x91/0xc0 [] mutex_lock_nested+0xb1/0x310 [] brd_init+0x2b/0x216 [brd] [] _stext+0x3b/0x170 [] sys_init_module+0xaf/0x1e0 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff irq event stamp: 3929 hardirqs last enabled at (3929): [] mutex_lock_nested+0x285/0x310 hardirqs last disabled at (3928): [] mutex_lock_nested+0x59/0x310 softirqs last enabled at (3732): [] sk_filter+0x83/0xe0 softirqs last disabled at (3730): [] sk_filter+0x16/0xe0 other info that might help us debug this: 1 lock held by modprobe/8526: #0: (testlock){--..}, at: [] brd_init+0x55/0x216 [brd] stack backtrace: Pid: 8526, comm: modprobe Not tainted 2.6.28-rc6-00007-ged31348-dirty #26 Call Trace: [] print_usage_bug+0x193/0x1d0 [] mark_lock+0xaf0/0xca0 [] mark_held_locks+0x55/0xc0 [] ? brd_init+0x0/0x216 [brd] [] trace_reclaim_fs+0x2a/0x60 [] __alloc_pages_internal+0x475/0x580 [] ? mutex_lock_nested+0x26e/0x310 [] ? brd_init+0x0/0x216 [brd] [] brd_init+0x6a/0x216 [brd] [] ? brd_init+0x0/0x216 [brd] [] _stext+0x3b/0x170 [] ? mutex_unlock+0x9/0x10 [] ? __mutex_unlock_slowpath+0x10d/0x180 [] ? trace_hardirqs_on_caller+0x12c/0x190 [] sys_init_module+0xaf/0x1e0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Nick Piggin Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 17 +++- include/linux/sched.h | 1 + kernel/lockdep.c | 229 ++++++++++++++++++++++++++++++++++++++++++--- kernel/lockdep_internals.h | 3 +- kernel/lockdep_proc.c | 6 +- mm/page_alloc.c | 5 + mm/slab.c | 4 + mm/slob.c | 2 + mm/slub.c | 1 + mm/vmscan.c | 3 + 10 files changed, 254 insertions(+), 17 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 23bf02fb124f..cc97bdbc7969 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -27,12 +27,16 @@ enum lock_usage_bit LOCK_USED = 0, LOCK_USED_IN_HARDIRQ, LOCK_USED_IN_SOFTIRQ, + LOCK_USED_IN_RECLAIM_FS, LOCK_ENABLED_SOFTIRQS, LOCK_ENABLED_HARDIRQS, + LOCK_HELD_OVER_RECLAIM_FS, LOCK_USED_IN_HARDIRQ_READ, LOCK_USED_IN_SOFTIRQ_READ, + LOCK_USED_IN_RECLAIM_FS_READ, LOCK_ENABLED_SOFTIRQS_READ, LOCK_ENABLED_HARDIRQS_READ, + LOCK_HELD_OVER_RECLAIM_FS_READ, LOCK_USAGE_STATES }; @@ -42,16 +46,20 @@ enum lock_usage_bit #define LOCKF_USED (1 << LOCK_USED) #define LOCKF_USED_IN_HARDIRQ (1 << LOCK_USED_IN_HARDIRQ) #define LOCKF_USED_IN_SOFTIRQ (1 << LOCK_USED_IN_SOFTIRQ) +#define LOCKF_USED_IN_RECLAIM_FS (1 << LOCK_USED_IN_RECLAIM_FS) #define LOCKF_ENABLED_HARDIRQS (1 << LOCK_ENABLED_HARDIRQS) #define LOCKF_ENABLED_SOFTIRQS (1 << LOCK_ENABLED_SOFTIRQS) +#define LOCKF_HELD_OVER_RECLAIM_FS (1 << LOCK_HELD_OVER_RECLAIM_FS) #define LOCKF_ENABLED_IRQS (LOCKF_ENABLED_HARDIRQS | LOCKF_ENABLED_SOFTIRQS) #define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) #define LOCKF_USED_IN_HARDIRQ_READ (1 << LOCK_USED_IN_HARDIRQ_READ) #define LOCKF_USED_IN_SOFTIRQ_READ (1 << LOCK_USED_IN_SOFTIRQ_READ) +#define LOCKF_USED_IN_RECLAIM_FS_READ (1 << LOCK_USED_IN_RECLAIM_FS_READ) #define LOCKF_ENABLED_HARDIRQS_READ (1 << LOCK_ENABLED_HARDIRQS_READ) #define LOCKF_ENABLED_SOFTIRQS_READ (1 << LOCK_ENABLED_SOFTIRQS_READ) +#define LOCKF_HELD_OVER_RECLAIM_FS_READ (1 << LOCK_HELD_OVER_RECLAIM_FS_READ) #define LOCKF_ENABLED_IRQS_READ \ (LOCKF_ENABLED_HARDIRQS_READ | LOCKF_ENABLED_SOFTIRQS_READ) @@ -324,7 +332,11 @@ static inline void lock_set_subclass(struct lockdep_map *lock, lock_set_class(lock, lock->name, lock->key, subclass, ip); } -# define INIT_LOCKDEP .lockdep_recursion = 0, +extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); +extern void lockdep_clear_current_reclaim_state(void); +extern void lockdep_trace_alloc(gfp_t mask); + +# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) @@ -342,6 +354,9 @@ static inline void lockdep_on(void) # define lock_release(l, n, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) +# define lockdep_set_current_reclaim_state(g) do { } while (0) +# define lockdep_clear_current_reclaim_state() do { } while (0) +# define lockdep_trace_alloc(g) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 4efb552aca47..b00a77f4999e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1313,6 +1313,7 @@ struct task_struct { int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; + gfp_t lockdep_reclaim_gfp; #endif /* journalling filesystem info */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 06b0c3568f0b..977f940fd562 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -310,12 +310,14 @@ EXPORT_SYMBOL(lockdep_on); #if VERBOSE # define HARDIRQ_VERBOSE 1 # define SOFTIRQ_VERBOSE 1 +# define RECLAIM_VERBOSE 1 #else # define HARDIRQ_VERBOSE 0 # define SOFTIRQ_VERBOSE 0 +# define RECLAIM_VERBOSE 0 #endif -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE /* * Quick filtering for interesting events: */ @@ -454,6 +456,10 @@ static const char *usage_str[] = [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R", [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R", [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R", + [LOCK_USED_IN_RECLAIM_FS] = "in-reclaim-W", + [LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R", + [LOCK_HELD_OVER_RECLAIM_FS] = "ov-reclaim-W", + [LOCK_HELD_OVER_RECLAIM_FS_READ] = "ov-reclaim-R", }; const char * __get_key_name(struct lockdep_subclass_key *key, char *str) @@ -462,9 +468,10 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str) } void -get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4) +get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, + char *c4, char *c5, char *c6) { - *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; + *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.', *c5 = '.', *c6 = '.'; if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) *c1 = '+'; @@ -493,14 +500,29 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) *c4 = '?'; } + + if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS) + *c5 = '+'; + else + if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS) + *c5 = '-'; + + if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ) + *c6 = '-'; + if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS_READ) { + *c6 = '+'; + if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ) + *c6 = '?'; + } + } static void print_lock_name(struct lock_class *class) { - char str[KSYM_NAME_LEN], c1, c2, c3, c4; + char str[KSYM_NAME_LEN], c1, c2, c3, c4, c5, c6; const char *name; - get_usage_chars(class, &c1, &c2, &c3, &c4); + get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6); name = class->name; if (!name) { @@ -513,7 +535,7 @@ static void print_lock_name(struct lock_class *class) if (class->subclass) printk("/%d", class->subclass); } - printk("){%c%c%c%c}", c1, c2, c3, c4); + printk("){%c%c%c%c%c%c}", c1, c2, c3, c4, c5, c6); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -1306,6 +1328,26 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, LOCK_ENABLED_SOFTIRQS, "soft")) return 0; + /* + * Prove that the new dependency does not connect a reclaim-fs-safe + * lock with a reclaim-fs-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS, + LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs")) + return 0; + + /* + * Prove that the new dependency does not connect a reclaim-fs-safe-read + * lock with a reclaim-fs-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS_READ, + LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs-read")) + return 0; + return 1; } @@ -1949,6 +1991,14 @@ static int softirq_verbose(struct lock_class *class) return 0; } +static int reclaim_verbose(struct lock_class *class) +{ +#if RECLAIM_VERBOSE + return class_filter(class); +#endif + return 0; +} + #define STRICT_READ_CHECKS 1 static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, @@ -2007,6 +2057,31 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(hlock_class(this))) ret = 2; break; + case LOCK_USED_IN_RECLAIM_FS: + if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_HELD_OVER_RECLAIM_FS_READ)) + return 0; + /* + * just marked it reclaim-fs-safe, check that this lock + * took no reclaim-fs-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs")) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it reclaim-fs-safe, check that this lock + * took no reclaim-fs-unsafe-read lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_HELD_OVER_RECLAIM_FS_READ, "reclaim-fs-read")) + return 0; +#endif + if (reclaim_verbose(hlock_class(this))) + ret = 2; + break; case LOCK_USED_IN_HARDIRQ_READ: if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) return 0; @@ -2033,6 +2108,19 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(hlock_class(this))) ret = 2; break; + case LOCK_USED_IN_RECLAIM_FS_READ: + if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS)) + return 0; + /* + * just marked it reclaim-fs-read-safe, check that this lock + * took no reclaim-fs-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs")) + return 0; + if (reclaim_verbose(hlock_class(this))) + ret = 2; + break; case LOCK_ENABLED_HARDIRQS: if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) return 0; @@ -2085,6 +2173,32 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(hlock_class(this))) ret = 2; break; + case LOCK_HELD_OVER_RECLAIM_FS: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_USED_IN_RECLAIM_FS_READ)) + return 0; + /* + * just marked it reclaim-fs-unsafe, check that no reclaim-fs-safe + * lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_RECLAIM_FS, "reclaim-fs")) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it softirq-unsafe, check that no + * softirq-safe-read lock in the system ever took + * it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_RECLAIM_FS_READ, "reclaim-fs-read")) + return 0; +#endif + if (reclaim_verbose(hlock_class(this))) + ret = 2; + break; case LOCK_ENABLED_HARDIRQS_READ: if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) return 0; @@ -2115,6 +2229,21 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(hlock_class(this))) ret = 2; break; + case LOCK_HELD_OVER_RECLAIM_FS_READ: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS)) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it reclaim-fs-read-unsafe, check that no + * reclaim-fs-safe lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_RECLAIM_FS, "reclaim-fs")) + return 0; +#endif + if (reclaim_verbose(hlock_class(this))) + ret = 2; + break; default: WARN_ON(1); break; @@ -2123,11 +2252,17 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, return ret; } +enum mark_type { + HARDIRQ, + SOFTIRQ, + RECLAIM_FS, +}; + /* * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, int hardirq) +mark_held_locks(struct task_struct *curr, enum mark_type mark) { enum lock_usage_bit usage_bit; struct held_lock *hlock; @@ -2136,17 +2271,32 @@ mark_held_locks(struct task_struct *curr, int hardirq) for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; - if (hardirq) { + switch (mark) { + case HARDIRQ: if (hlock->read) usage_bit = LOCK_ENABLED_HARDIRQS_READ; else usage_bit = LOCK_ENABLED_HARDIRQS; - } else { + break; + + case SOFTIRQ: if (hlock->read) usage_bit = LOCK_ENABLED_SOFTIRQS_READ; else usage_bit = LOCK_ENABLED_SOFTIRQS; + break; + + case RECLAIM_FS: + if (hlock->read) + usage_bit = LOCK_HELD_OVER_RECLAIM_FS_READ; + else + usage_bit = LOCK_HELD_OVER_RECLAIM_FS; + break; + + default: + BUG(); } + if (!mark_lock(curr, hlock, usage_bit)) return 0; } @@ -2200,7 +2350,7 @@ void trace_hardirqs_on_caller(unsigned long ip) * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, 1)) + if (!mark_held_locks(curr, HARDIRQ)) return; /* * If we have softirqs enabled, then set the usage @@ -2208,7 +2358,7 @@ void trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, 0)) + if (!mark_held_locks(curr, SOFTIRQ)) return; curr->hardirq_enable_ip = ip; @@ -2288,7 +2438,7 @@ void trace_softirqs_on(unsigned long ip) * enabled too: */ if (curr->hardirqs_enabled) - mark_held_locks(curr, 0); + mark_held_locks(curr, SOFTIRQ); } /* @@ -2317,6 +2467,31 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(&redundant_softirqs_off); } +void lockdep_trace_alloc(gfp_t gfp_mask) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + /* no reclaim without waiting on it */ + if (!(gfp_mask & __GFP_WAIT)) + return; + + /* this guy won't enter reclaim */ + if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + return; + + /* We're only interested __GFP_FS allocations for now */ + if (!(gfp_mask & __GFP_FS)) + return; + + if (DEBUG_LOCKS_WARN_ON(irqs_disabled())) + return; + + mark_held_locks(curr, RECLAIM_FS); +} + static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) { /* @@ -2362,6 +2537,22 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } + /* + * We reuse the irq context infrastructure more broadly as a general + * context checking code. This tests GFP_FS recursion (a lock taken + * during reclaim for a GFP_FS allocation is held over a GFP_FS + * allocation). + */ + if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { + if (hlock->read) { + if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) + return 0; + } else { + if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) + return 0; + } + } + return 1; } @@ -2453,6 +2644,10 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, case LOCK_ENABLED_SOFTIRQS: case LOCK_ENABLED_HARDIRQS_READ: case LOCK_ENABLED_SOFTIRQS_READ: + case LOCK_USED_IN_RECLAIM_FS: + case LOCK_USED_IN_RECLAIM_FS_READ: + case LOCK_HELD_OVER_RECLAIM_FS: + case LOCK_HELD_OVER_RECLAIM_FS_READ: ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; @@ -2966,6 +3161,16 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); +void lockdep_set_current_reclaim_state(gfp_t gfp_mask) +{ + current->lockdep_reclaim_gfp = gfp_mask; +} + +void lockdep_clear_current_reclaim_state(void) +{ + current->lockdep_reclaim_gfp = 0; +} + #ifdef CONFIG_LOCK_STAT static int print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 56b196932c08..e887b783244f 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -32,7 +32,8 @@ extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; extern void -get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); +get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, + char *c4, char *c5, char *c6); extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 13716b813896..b84a1dfa9077 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v) { struct lock_class *class = v; struct lock_list *entry; - char c1, c2, c3, c4; + char c1, c2, c3, c4, c5, c6; if (v == SEQ_START_TOKEN) { seq_printf(m, "all lock classes:\n"); @@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v) seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); #endif - get_usage_chars(class, &c1, &c2, &c3, &c4); - seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); + get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6); + seq_printf(m, " %c%c%c%c%c%c", c1, c2, c3, c4, c5, c6); seq_printf(m, ": "); print_name(m, class); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5675b3073854..22b15a4cde8a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1479,6 +1479,8 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; unsigned long pages_reclaimed = 0; + lockdep_trace_alloc(gfp_mask); + might_sleep_if(wait); if (should_fail_alloc_page(gfp_mask, order)) @@ -1578,12 +1580,15 @@ nofail_alloc: */ cpuset_update_task_memory_state(); p->flags |= PF_MEMALLOC; + + lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); p->flags &= ~PF_MEMALLOC; cond_resched(); diff --git a/mm/slab.c b/mm/slab.c index ddc41f337d58..6b61de8543ec 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3318,6 +3318,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; + lockdep_trace_alloc(flags); + if (slab_should_failslab(cachep, flags)) return NULL; @@ -3394,6 +3396,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; + lockdep_trace_alloc(flags); + if (slab_should_failslab(cachep, flags)) return NULL; diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed8..1264799df5d1 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -464,6 +464,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + lockdep_trace_alloc(flags); + if (size < PAGE_SIZE - align) { if (!size) return ZERO_SIZE_PTR; diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..214eb207c513 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1596,6 +1596,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, unsigned long flags; unsigned int objsize; + lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); if (should_failslab(s->objsize, gfpflags)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9a27c44aa327..303eb658b50b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1963,6 +1963,9 @@ static int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; + + lockdep_set_current_reclaim_state(GFP_KERNEL); + node_to_cpumask_ptr(cpumask, pgdat->node_id); if (!cpumask_empty(cpumask)) -- cgit v1.2.3 From b342501cd31e5546d0c9ca8ceff5ded1832f9e5b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 20:20:29 +0100 Subject: sched: allow architectures to specify sched_clock_stable Allow CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures to still specify that their sched_clock() implementation is reliable. This will be used by x86 to switch on a faster sched_clock_cpu() implementation on certain CPU types. Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 ++++++++++ kernel/sched_clock.c | 45 ++++++++++++++++++++------------------------- 2 files changed, 30 insertions(+), 25 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8981e52c714f..a063d19b7a7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1670,6 +1670,16 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) return set_cpus_allowed_ptr(p, &new_mask); } +/* + * Architectures can set this to 1 if they have specified + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, + * but then during bootup it turns out that sched_clock() + * is reliable after all: + */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +extern int sched_clock_stable; +#endif + extern unsigned long long sched_clock(void); extern void sched_clock_init(void); diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index a0b0852414cc..a755d023805a 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -24,11 +24,11 @@ * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat * consistent between cpus (never more than 2 jiffies difference). */ -#include -#include #include -#include #include +#include +#include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -43,6 +43,10 @@ unsigned long long __attribute__((weak)) sched_clock(void) static __read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__read_mostly int sched_clock_stable; +#else +static const int sched_clock_stable = 1; +#endif struct sched_clock_data { /* @@ -87,7 +91,7 @@ void sched_clock_init(void) } /* - * min,max except they take wrapping into account + * min, max except they take wrapping into account */ static inline u64 wrap_min(u64 x, u64 y) @@ -116,10 +120,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) if (unlikely(delta < 0)) delta = 0; + if (unlikely(!sched_clock_running)) + return 0ull; + /* * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * scd->tick_gtod + TICK_NSEC); + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); */ clock = scd->tick_gtod + delta; @@ -148,12 +155,13 @@ static void lock_double_clock(struct sched_clock_data *data1, u64 sched_clock_cpu(int cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); u64 now, clock, this_clock, remote_clock; + struct sched_clock_data *scd; - if (unlikely(!sched_clock_running)) - return 0ull; + if (sched_clock_stable) + return sched_clock(); + scd = cpu_sdc(cpu); WARN_ON_ONCE(!irqs_disabled()); now = sched_clock(); @@ -193,6 +201,8 @@ u64 sched_clock_cpu(int cpu) return clock; } +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); @@ -235,22 +245,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - -u64 sched_clock_cpu(int cpu) -{ - if (unlikely(!sched_clock_running)) - return 0; - - return sched_clock(); -} - -#endif +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ unsigned long long cpu_clock(int cpu) { -- cgit v1.2.3 From 8aef2d2856158a36c295a8d1288281e4839bff13 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 01:10:15 -0400 Subject: function-graph: ignore times across schedule Impact: more accurate timings The current method of function graph tracing does not take into account the time spent when a task is not running. This shows functions that call schedule have increased costs: 3) + 18.664 us | } ------------------------------------------ 3) -0 => kblockd-123 ------------------------------------------ 3) | finish_task_switch() { 3) 1.441 us | _spin_unlock_irq(); 3) 3.966 us | } 3) ! 2959.433 us | } 3) ! 2961.465 us | } This patch uses the tracepoint in the scheduling context switch to account for time that has elapsed while a task is scheduled out. Now we see: ------------------------------------------ 3) -0 => edac-po-1067 ------------------------------------------ 3) | finish_task_switch() { 3) 0.685 us | _spin_unlock_irq(); 3) 2.331 us | } 3) + 41.439 us | } 3) + 42.663 us | } Signed-off-by: Steven Rostedt --- include/linux/sched.h | 2 ++ kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 89cd308cc7a5..471e36d30123 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1409,6 +1409,8 @@ struct task_struct { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack *ret_stack; + /* time stamp for last schedule */ + unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c81a759fbf76..0b90364d1a2c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,6 +29,8 @@ #include #include +#include + #include #include "trace.h" @@ -2590,6 +2592,31 @@ free: return ret; } +static void +ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, + struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. + */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + /* Allocate a return stack for each task */ static int start_graph_tracing(void) { @@ -2611,6 +2638,13 @@ static int start_graph_tracing(void) ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + kfree(ret_stack_list); return ret; } @@ -2674,6 +2708,7 @@ void unregister_ftrace_graph(void) mutex_lock(&ftrace_lock); atomic_dec(&ftrace_graph_active); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); @@ -2694,6 +2729,7 @@ void ftrace_graph_init_task(struct task_struct *t) t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; } else t->ret_stack = NULL; } -- cgit v1.2.3 From 5ad4e53bd5406ee214ddc5a41f03f779b8b2d526 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:50:06 -0400 Subject: Get rid of indirect include of fs_struct.h Don't pull it in sched.h; very few files actually need it and those can include directly. sched.h itself only needs forward declaration of struct fs_struct; Signed-off-by: Al Viro --- arch/cris/kernel/process.c | 1 - fs/dcache.c | 1 + fs/exec.c | 1 + fs/fs_struct.c | 1 + fs/namei.c | 1 + fs/namespace.c | 1 + fs/open.c | 1 + fs/proc/base.c | 1 + fs/proc/task_nommu.c | 1 + include/linux/mnt_namespace.h | 2 ++ include/linux/nsproxy.h | 1 + include/linux/sched.h | 3 ++- init/do_mounts.c | 1 + kernel/auditsc.c | 1 + kernel/exec_domain.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/sys.c | 1 + security/tomoyo/realpath.c | 1 + 19 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 60816e876455..4df0b320d524 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/dcache.c b/fs/dcache.c index 90bbd7e1b116..0dc4de21f088 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/fs/exec.c b/fs/exec.c index 614991bf0c87..052a961e41aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 6ac219338670..eee059052db5 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -3,6 +3,7 @@ #include #include #include +#include /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. diff --git a/fs/namei.c b/fs/namei.c index 964c0249444b..b8433ebfae05 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) diff --git a/fs/namespace.c b/fs/namespace.c index 1e56303c718e..c6f54e4c4290 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "pnode.h" diff --git a/fs/open.c b/fs/open.c index 75b61677daaf..377eb25b6abf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -29,6 +29,7 @@ #include #include #include +#include int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { diff --git a/fs/proc/base.c b/fs/proc/base.c index e0afd326b688..f71559784bfb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -80,6 +80,7 @@ #include #include #include +#include #include "internal.h" /* NOTE: diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 6ca01052c5bc..253afc04484c 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 830bbcd449d6..3a059298cc19 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -22,6 +22,8 @@ struct proc_mounts { int event; }; +struct fs_struct; + extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); extern void __put_mnt_ns(struct mnt_namespace *ns); diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index afad7dec1b36..7b370c7cfeff 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -8,6 +8,7 @@ struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; +struct fs_struct; /* * A structure to contain pointers to all per-process diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2de..b4e065ea0de1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -68,7 +68,7 @@ struct sched_param { #include #include #include -#include +#include #include #include #include @@ -97,6 +97,7 @@ struct futex_pi_state; struct robust_list_head; struct bio; struct bts_tracer; +struct fs_struct; /* * List of flags we want to share for kernel threads, diff --git a/init/do_mounts.c b/init/do_mounts.c index 8d4ff5afc1d8..dd7ee5f203f3 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8cbddff6c283..2bfc64786765 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "audit.h" diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index cb8e9626c215..c35452cadded 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -18,6 +18,7 @@ #include #include #include +#include static void default_handler(int, struct pt_regs *); diff --git a/kernel/exit.c b/kernel/exit.c index ad8375758a79..b5d656845c90 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 51f138a131de..e82a14577a98 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 37f458e6882a..ce182aaed204 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index d47f16b844b2..3bbe01a7a4b5 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "common.h" #include "realpath.h" -- cgit v1.2.3 From 9de1581e75ba9d7979766d4ab6d56f0f2d87f7c6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 31 Mar 2009 15:19:29 -0700 Subject: get_mm_hiwater_xxx: trivial, s/define/inline/ Andrew pointed out get_mm_hiwater_xxx() evaluate "mm" argument thrice/twice, make them inline. Signed-off-by: Oleg Nesterov Cc: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2de..481fad3a9b42 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -391,8 +391,15 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) -#define get_mm_hiwater_rss(mm) max((mm)->hiwater_rss, get_mm_rss(mm)) -#define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm) +static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) +{ + return max(mm->hiwater_rss, get_mm_rss(mm)); +} + +static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) +{ + return max(mm->hiwater_vm, mm->total_vm); +} extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); -- cgit v1.2.3 From 6f2c55b843836d26528c56a0968689accaedbc67 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 2 Apr 2009 16:56:59 -0700 Subject: Simplify copy_thread() First argument unused since 2.3.11. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/process.c | 2 +- arch/arm/kernel/process.c | 2 +- arch/avr32/kernel/process.c | 2 +- arch/blackfin/kernel/process.c | 2 +- arch/cris/arch-v10/kernel/process.c | 2 +- arch/cris/arch-v32/kernel/process.c | 2 +- arch/frv/kernel/process.c | 2 +- arch/h8300/kernel/process.c | 2 +- arch/ia64/kernel/process.c | 2 +- arch/m32r/kernel/process.c | 2 +- arch/m68k/kernel/process.c | 2 +- arch/m68knommu/kernel/process.c | 2 +- arch/mips/kernel/process.c | 2 +- arch/mn10300/kernel/process.c | 2 +- arch/parisc/kernel/process.c | 2 +- arch/powerpc/kernel/process.c | 2 +- arch/s390/kernel/process.c | 2 +- arch/sh/kernel/process_32.c | 2 +- arch/sh/kernel/process_64.c | 2 +- arch/sparc/kernel/process_32.c | 2 +- arch/sparc/kernel/process_64.c | 2 +- arch/um/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/xtensa/kernel/process.c | 2 +- include/linux/sched.h | 3 ++- kernel/fork.c | 2 +- 27 files changed, 28 insertions(+), 27 deletions(-) (limited to 'include/linux/sched.h') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 8d0097f10208..3a2fb7a02db4 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -272,7 +272,7 @@ alpha_vfork(struct pt_regs *regs) */ int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 2de14e2afdc5..c3265a2e7cd4 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -301,7 +301,7 @@ void release_thread(struct task_struct *dead_task) asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int -copy_thread(int nr, unsigned long clone_flags, unsigned long stack_start, +copy_thread(unsigned long clone_flags, unsigned long stack_start, unsigned long stk_sz, struct task_struct *p, struct pt_regs *regs) { struct thread_info *thread = task_thread_info(p); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 43ae555ecb33..1bbe1da54869 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -332,7 +332,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 33e2e8993f7f..f49427293ca1 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -193,7 +193,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs) } int -copy_thread(int nr, unsigned long clone_flags, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index bd9b3ff63f6c..c4c69cf721e5 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -115,7 +115,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) */ asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index ced5b725d9bd..120e7f796fea 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -131,7 +131,7 @@ kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) extern asmlinkage void ret_from_fork(void); int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 9583a338e9d6..0de50df74970 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -204,7 +204,7 @@ void prepare_to_copy(struct task_struct *tsk) /* * set up the kernel stack and exception frames for a new process */ -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index a8ef654a5a0b..e2f33d0f9969 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -191,7 +191,7 @@ asmlinkage int h8300_clone(struct pt_regs *regs) } -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index c57162705147..5d7c0e5b9e76 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -413,7 +413,7 @@ ia64_load_extra (struct task_struct *task) * so there is nothing to worry about. */ int -copy_thread (int nr, unsigned long clone_flags, +copy_thread(unsigned long clone_flags, unsigned long user_stack_base, unsigned long user_stack_size, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 7103d91e1a2f..3e876f0baebc 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -225,7 +225,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) return 0; /* Task didn't use the fpu at all. */ } -int copy_thread(int nr, unsigned long clone_flags, unsigned long spu, +int copy_thread(unsigned long clone_flags, unsigned long spu, unsigned long unused, struct task_struct *tsk, struct pt_regs *regs) { struct pt_regs *childregs = task_pt_regs(tsk); diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 632ce016014d..ec37fb56c127 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -233,7 +233,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) parent_tidptr, child_tidptr); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index 3f2d7745f31e..1e96c6eb6312 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -199,7 +199,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) return do_fork(clone_flags, newsp, regs, 0, NULL, NULL); } -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index ca2e4026ad20..1eaaa450e20c 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -99,7 +99,7 @@ void flush_thread(void) { } -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { struct thread_info *ti = task_thread_info(p); diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index b28c9a60445b..234cf344cdce 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -193,7 +193,7 @@ void prepare_to_copy(struct task_struct *tsk) * set up the kernel stack for a new thread and copy arch-specific thread * control information */ -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long c_usp, unsigned long ustk_size, struct task_struct *p, struct pt_regs *kregs) { diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index b80e02a4d81d..8aa591ed9127 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -263,7 +263,7 @@ sys_vfork(struct pt_regs *regs) } int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, /* in ia64 this is "user_stack_size" */ struct task_struct * p, struct pt_regs * pregs) { diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index eac064948780..7b44a33f03c2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -598,7 +598,7 @@ void prepare_to_copy(struct task_struct *tsk) /* * Copy a thread.. */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index b48e961a38f6..a3acd8e60aff 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -160,7 +160,7 @@ void release_thread(struct task_struct *dead_task) { } -int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp, +int copy_thread(unsigned long clone_flags, unsigned long new_stackp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index ddafbbbab2ab..694bc15f84fd 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -170,7 +170,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index c90c7e5e5fee..96be839040f8 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -425,7 +425,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index f4bee35a1b46..2830b415e214 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -455,7 +455,7 @@ asmlinkage int sparc_do_fork(unsigned long clone_flags, */ extern void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index a73954b87f0a..4041f94e7724 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -561,7 +561,7 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags, * Parent --> %o0 == childs pid, %o1 == 0 * Child --> %o0 == parents pid, %o1 == 1 */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index a1c6d07cac3e..4a28a1568d85 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -179,7 +179,7 @@ void fork_handler(void) userspace(¤t->thread.regs.regs); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long stack_top, struct task_struct * p, struct pt_regs *regs) { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 14014d766cad..76f8f84043a2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -245,7 +245,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index abb7e6a7f0c6..b751a41392b1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -278,7 +278,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 9185597eb6a0..031f36685710 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -172,7 +172,7 @@ void prepare_to_copy(struct task_struct *tsk) * childregs. */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 481fad3a9b42..9186f8c5d5f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1975,7 +1975,8 @@ extern void mm_release(struct task_struct *, struct mm_struct *); /* Allocate a new mm structure and copy contents from tsk->mm */ extern struct mm_struct *dup_mm(struct task_struct *tsk); -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern int copy_thread(unsigned long, unsigned long, unsigned long, + struct task_struct *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); diff --git a/kernel/fork.c b/kernel/fork.c index 51d1aa21483b..d7eb727eb535 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1125,7 +1125,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; - retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); + retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; -- cgit v1.2.3 From 39c626ae47c469abdfd30c6e42eff884931380d6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:18 -0700 Subject: forget_original_parent: split out the un-ptrace part By discussion with Roland. - Rename ptrace_exit() to exit_ptrace(), and change it to do all the necessary work with ->ptraced list by its own. - Move this code from exit.c to ptrace.c - Update the comment in ptrace_detach() to explain the rechecking of the child->ptrace. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: "Metzger, Markus T" Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 2 +- include/linux/sched.h | 5 +++ kernel/exit.c | 95 ++++---------------------------------------------- kernel/ptrace.c | 78 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 88 insertions(+), 92 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 1a2b0cb55535..67c15653fc23 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,7 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); -extern int __ptrace_detach(struct task_struct *tracer, struct task_struct *p); +extern void exit_ptrace(struct task_struct *tracer); extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 diff --git a/include/linux/sched.h b/include/linux/sched.h index 9186f8c5d5f2..b47c94e7560b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2061,6 +2061,11 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) +static inline int task_detached(struct task_struct *p) +{ + return p->exit_signal == -1; +} + /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also diff --git a/kernel/exit.c b/kernel/exit.c index 3e09b7cb3b20..506693dfdd4e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -61,11 +61,6 @@ DEFINE_TRACE(sched_process_wait); static void exit_mm(struct task_struct * tsk); -static inline int task_detached(struct task_struct *p) -{ - return p->exit_signal == -1; -} - static void __unhash_process(struct task_struct *p) { nr_threads--; @@ -731,85 +726,6 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -/* - * Called with irqs disabled, returns true if childs should reap themselves. - */ -static int ignoring_children(struct sighand_struct *sigh) -{ - int ret; - spin_lock(&sigh->siglock); - ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || - (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); - spin_unlock(&sigh->siglock); - return ret; -} - -/* Returns nonzero if the tracee should be released. */ -int __ptrace_detach(struct task_struct *tracer, struct task_struct *p) -{ - __ptrace_unlink(p); - - if (p->exit_state != EXIT_ZOMBIE) - return 0; - /* - * If it's a zombie, our attachedness prevented normal - * parent notification or self-reaping. Do notification - * now if it would have happened earlier. If it should - * reap itself we return true. - * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. - */ - if (!task_detached(p) && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, tracer)) - do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) - p->exit_signal = -1; - } - - if (!task_detached(p)) - return 0; - - /* Mark it as in the process of being reaped. */ - p->exit_state = EXIT_DEAD; - return 1; -} - -/* - * Detach all tasks we were using ptrace on. - * Any that need to be release_task'd are put on the @dead list. - * - * Called with write_lock(&tasklist_lock) held. - */ -static void ptrace_exit(struct task_struct *parent, struct list_head *dead) -{ - struct task_struct *p, *n; - - list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { - if (__ptrace_detach(parent, p)) - list_add(&p->ptrace_entry, dead); - } -} - -/* - * Finish up exit-time ptrace cleanup. - * - * Called without locks. - */ -static void ptrace_exit_finish(struct task_struct *parent, - struct list_head *dead) -{ - struct task_struct *p, *n; - - BUG_ON(!list_empty(&parent->ptraced)); - - list_for_each_entry_safe(p, n, dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); - } -} - /* Returns nonzero if the child should be released. */ static int reparent_thread(struct task_struct *p, struct task_struct *father) { @@ -894,12 +810,10 @@ static void forget_original_parent(struct task_struct *father) struct task_struct *p, *n, *reaper; LIST_HEAD(ptrace_dead); + exit_ptrace(father); + write_lock_irq(&tasklist_lock); reaper = find_new_reaper(father); - /* - * First clean up ptrace if we were using it. - */ - ptrace_exit(father, &ptrace_dead); list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; @@ -914,7 +828,10 @@ static void forget_original_parent(struct task_struct *father) write_unlock_irq(&tasklist_lock); BUG_ON(!list_empty(&father->children)); - ptrace_exit_finish(father, &ptrace_dead); + list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ee553b6ad125..f5a9fa5aafa1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -235,9 +235,57 @@ out: return retval; } +/* + * Called with irqs disabled, returns true if childs should reap themselves. + */ +static int ignoring_children(struct sighand_struct *sigh) +{ + int ret; + spin_lock(&sigh->siglock); + ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || + (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); + spin_unlock(&sigh->siglock); + return ret; +} + +/* + * Called with tasklist_lock held for writing. + * Unlink a traced task, and clean it up if it was a traced zombie. + * Return true if it needs to be reaped with release_task(). + * (We can't call release_task() here because we already hold tasklist_lock.) + * + * If it's a zombie, our attachedness prevented normal parent notification + * or self-reaping. Do notification now if it would have happened earlier. + * If it should reap itself, return true. + * + * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. + */ +static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) +{ + __ptrace_unlink(p); + + if (p->exit_state == EXIT_ZOMBIE) { + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, tracer)) + do_notify_parent(p, p->exit_signal); + else if (ignoring_children(tracer->sighand)) + p->exit_signal = -1; + } + if (task_detached(p)) { + /* Mark it as in the process of being reaped. */ + p->exit_state = EXIT_DEAD; + return true; + } + } + + return false; +} + int ptrace_detach(struct task_struct *child, unsigned int data) { - int dead = 0; + bool dead = false; if (!valid_signal(data)) return -EIO; @@ -247,7 +295,10 @@ int ptrace_detach(struct task_struct *child, unsigned int data) clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); write_lock_irq(&tasklist_lock); - /* protect against de_thread()->release_task() */ + /* + * This child can be already killed. Make sure de_thread() or + * our sub-thread doing do_wait() didn't do release_task() yet. + */ if (child->ptrace) { child->exit_code = data; @@ -264,6 +315,29 @@ int ptrace_detach(struct task_struct *child, unsigned int data) return 0; } +/* + * Detach all tasks we were using ptrace on. + */ +void exit_ptrace(struct task_struct *tracer) +{ + struct task_struct *p, *n; + LIST_HEAD(ptrace_dead); + + write_lock_irq(&tasklist_lock); + list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { + if (__ptrace_detach(tracer, p)) + list_add(&p->ptrace_entry, &ptrace_dead); + } + write_unlock_irq(&tasklist_lock); + + BUG_ON(!list_empty(&tracer->ptraced)); + + list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } +} + int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) { int copied = 0; -- cgit v1.2.3 From 6dda81f4384b94930826eded254d8c16f89a9248 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:35 -0700 Subject: pids: document task_pgrp/task_session is not safe without tasklist/rcu Even if task == current, it is not safe to dereference the result of task_pgrp/task_session. We can race with another thread which changes the special pid via setpgid/setsid. Document this. The next 2 patches give an example of the unsafe usage, we have more bad users. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b47c94e7560b..722dd313bf8a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1489,6 +1489,11 @@ static inline struct pid *task_tgid(struct task_struct *task) return task->group_leader->pids[PIDTYPE_PID].pid; } +/* + * Without tasklist or rcu lock it is not safe to dereference + * the result of task_pgrp/task_session even if task == current, + * we can race with another thread doing sys_setsid/sys_setpgid. + */ static inline struct pid *task_pgrp(struct task_struct *task) { return task->group_leader->pids[PIDTYPE_PGID].pid; -- cgit v1.2.3 From 52ee2dfdd4f51cf422ea6a96a0846dc94244aa37 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:38 -0700 Subject: pids: refactor vnr/nr_ns helpers to make them safe Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy. task_pid_nr_ns(task) needs rcu/tasklist depending on task == current. As for "special" pids, vnr/nr_ns helpers always need rcu. However, if task != current, they are unsafe even under rcu lock, we can't trust task->group_leader without the special checks. And almost every helper has a callsite which needs a fix. Also, it is a bit annoying that the implementations of, say, task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical". This patch introduces the new helper, __task_pid_nr_ns(), which is always safe to use, and turns all other helpers into the trivial wrappers. After this I'll send another patch which converts task_tgid_xxx() as well, they're are a bit special. Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 27 ++++++++++++++++++++------- kernel/pid.c | 31 ++++++++++++++++--------------- 2 files changed, 36 insertions(+), 22 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 722dd313bf8a..49df878a0cad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1519,17 +1519,23 @@ struct pid_namespace; * * see also pid_nr() etc in include/linux/pid.h */ +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns); static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; } -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pid_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); +} static inline pid_t task_pid_vnr(struct task_struct *tsk) { - return pid_vnr(task_pid(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); } @@ -1551,11 +1557,15 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return tsk->signal->__pgrp; } -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); +} static inline pid_t task_pgrp_vnr(struct task_struct *tsk) { - return pid_vnr(task_pgrp(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); } @@ -1564,14 +1574,17 @@ static inline pid_t task_session_nr(struct task_struct *tsk) return tsk->signal->__session; } -pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_session_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); +} static inline pid_t task_session_vnr(struct task_struct *tsk) { - return pid_vnr(task_session(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } - /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. diff --git a/kernel/pid.c b/kernel/pid.c index 6628abcc520e..b2e5f78fd281 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -452,11 +452,24 @@ pid_t pid_vnr(struct pid *pid) } EXPORT_SYMBOL_GPL(pid_vnr); -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns) { - return pid_nr_ns(task_pid(tsk), ns); + pid_t nr = 0; + + rcu_read_lock(); + if (!ns) + ns = current->nsproxy->pid_ns; + if (likely(pid_alive(task))) { + if (type != PIDTYPE_PID) + task = task->group_leader; + nr = pid_nr_ns(task->pids[type].pid, ns); + } + rcu_read_unlock(); + + return nr; } -EXPORT_SYMBOL(task_pid_nr_ns); +EXPORT_SYMBOL(__task_pid_nr_ns); pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -464,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) } EXPORT_SYMBOL(task_tgid_nr_ns); -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_pgrp(tsk), ns); -} -EXPORT_SYMBOL(task_pgrp_nr_ns); - -pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_session(tsk), ns); -} -EXPORT_SYMBOL(task_session_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); -- cgit v1.2.3 From 1b0f7ffd0ea27cd3a0b9ca04e3df9522048c32a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:39 -0700 Subject: pids: kill signal_struct-> __pgrp/__session and friends We are wasting 2 words in signal_struct without any reason to implement task_pgrp_nr() and task_session_nr(). task_session_nr() has no callers since 2e2ba22ea4fd4bb85f0fa37c521066db6775cbef, we can remove it. task_pgrp_nr() is still (I believe wrongly) used in fs/autofsX and fs/coda. This patch reimplements task_pgrp_nr() via task_pgrp_nr_ns(), and kills __pgrp/__session and the related helpers. The change in drivers/char/tty_io.c is cosmetic, but hopefully makes sense anyway. Signed-off-by: Oleg Nesterov Acked-by: Alan Cox [tty parts] Cc: Cedric Le Goater Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 4 ++-- include/linux/sched.h | 43 ++++++------------------------------------- kernel/exit.c | 10 +++------- kernel/fork.c | 2 -- kernel/sys.c | 4 +--- 5 files changed, 12 insertions(+), 51 deletions(-) (limited to 'include/linux/sched.h') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index a44b701c5bba..66b99a2049e3 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2681,7 +2681,7 @@ void __do_SAK(struct tty_struct *tty) /* Kill the entire session */ do_each_pid_task(session, PIDTYPE_SID, p) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): task_session_nr(p)==tty->session\n", + " (%s): task_session(p)==tty->session\n", task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); } while_each_pid_task(session, PIDTYPE_SID, p); @@ -2691,7 +2691,7 @@ void __do_SAK(struct tty_struct *tty) do_each_thread(g, p) { if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): task_session_nr(p)==tty->session\n", + " (%s): task_session(p)==tty->session\n", task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); continue; diff --git a/include/linux/sched.h b/include/linux/sched.h index 49df878a0cad..206ac003e8c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -547,25 +547,8 @@ struct signal_struct { struct list_head cpu_timers[3]; - /* job control IDs */ - - /* - * pgrp and session fields are deprecated. - * use the task_session_Xnr and task_pgrp_Xnr routines below - */ - - union { - pid_t pgrp __deprecated; - pid_t __pgrp; - }; - struct pid *tty_old_pgrp; - union { - pid_t session __deprecated; - pid_t __session; - }; - /* boolean value for session group leader */ int leader; @@ -1469,16 +1452,6 @@ static inline int rt_task(struct task_struct *p) return rt_prio(p->prio); } -static inline void set_task_session(struct task_struct *tsk, pid_t session) -{ - tsk->signal->__session = session; -} - -static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) -{ - tsk->signal->__pgrp = pgrp; -} - static inline struct pid *task_pid(struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; @@ -1552,11 +1525,6 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk) } -static inline pid_t task_pgrp_nr(struct task_struct *tsk) -{ - return tsk->signal->__pgrp; -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1569,11 +1537,6 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk) } -static inline pid_t task_session_nr(struct task_struct *tsk) -{ - return tsk->signal->__session; -} - static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1585,6 +1548,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +/* obsolete, do not use */ +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return task_pgrp_nr_ns(tsk, &init_pid_ns); +} + /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. diff --git a/kernel/exit.c b/kernel/exit.c index 384f09caf2ef..3bec141c82f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -357,16 +357,12 @@ static void reparent_to_kthreadd(void) void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - pid_t nr = pid_nr(pid); - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - set_task_session(curr, nr); - } - if (task_pgrp(curr) != pid) { + + if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); - set_task_pgrp(curr, nr); - } } static void set_special_pids(struct pid *pid) diff --git a/kernel/fork.c b/kernel/fork.c index adbea16ec649..f74458231449 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1265,8 +1265,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->leader_pid = pid; tty_kref_put(p->signal->tty); p->signal->tty = tty_kref_get(current->signal->tty); - set_task_pgrp(p, task_pgrp_nr(current)); - set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); diff --git a/kernel/sys.c b/kernel/sys.c index 37f458e6882a..742cefa527e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1013,10 +1013,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) if (err) goto out; - if (task_pgrp(p) != pgrp) { + if (task_pgrp(p) != pgrp) change_pid(p, PIDTYPE_PGID, pgrp); - set_task_pgrp(p, pid_nr(pgrp)); - } err = 0; out: -- cgit v1.2.3