diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-13 18:23:15 +0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-13 18:23:15 +0400 |
commit | faafcba3b5e15999cf75d5c5a513ac8e47e2545f (patch) | |
tree | 47d58d1c00e650e820506c91eb9a41268756bdda /kernel | |
parent | 13ead805c5a14b0e7ecd34f61404a5bfba655895 (diff) | |
parent | f10e00f4bf360c36edbe6bf18a6c75b171cbe012 (diff) | |
download | linux-faafcba3b5e15999cf75d5c5a513ac8e47e2545f.tar.xz |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main changes in this cycle were:
- Optimized support for Intel "Cluster-on-Die" (CoD) topologies (Dave
Hansen)
- Various sched/idle refinements for better idle handling (Nicolas
Pitre, Daniel Lezcano, Chuansheng Liu, Vincent Guittot)
- sched/numa updates and optimizations (Rik van Riel)
- sysbench speedup (Vincent Guittot)
- capacity calculation cleanups/refactoring (Vincent Guittot)
- Various cleanups to thread group iteration (Oleg Nesterov)
- Double-rq-lock removal optimization and various refactorings
(Kirill Tkhai)
- various sched/deadline fixes
... and lots of other changes"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (72 commits)
sched/dl: Use dl_bw_of() under rcu_read_lock_sched()
sched/fair: Delete resched_cpu() from idle_balance()
sched, time: Fix build error with 64 bit cputime_t on 32 bit systems
sched: Improve sysbench performance by fixing spurious active migration
sched/x86: Fix up typo in topology detection
x86, sched: Add new topology for multi-NUMA-node CPUs
sched/rt: Use resched_curr() in task_tick_rt()
sched: Use rq->rd in sched_setaffinity() under RCU read lock
sched: cleanup: Rename 'out_unlock' to 'out_free_new_mask'
sched: Use dl_bw_of() under RCU read lock
sched/fair: Remove duplicate code from can_migrate_task()
sched, mips, ia64: Remove __ARCH_WANT_UNLOCKED_CTXSW
sched: print_rq(): Don't use tasklist_lock
sched: normalize_rt_tasks(): Don't use _irqsave for tasklist_lock, use task_rq_lock()
sched: Fix the task-group check in tg_has_rt_tasks()
sched/fair: Leverage the idle state info when choosing the "idlest" cpu
sched: Let the scheduler see CPU idle states
sched/deadline: Fix inter- exclusive cpusets migrations
sched/deadline: Clear dl_entity params when setscheduling to different class
sched/numa: Kill the wrong/dead TASK_DEAD check in task_numa_fault()
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/exit.c | 47 | ||||
-rw-r--r-- | kernel/fork.c | 13 | ||||
-rw-r--r-- | kernel/sched/auto_group.c | 5 | ||||
-rw-r--r-- | kernel/sched/core.c | 295 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.c | 4 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 64 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 33 | ||||
-rw-r--r-- | kernel/sched/debug.c | 13 | ||||
-rw-r--r-- | kernel/sched/fair.c | 479 | ||||
-rw-r--r-- | kernel/sched/idle.c | 6 | ||||
-rw-r--r-- | kernel/sched/rt.c | 21 | ||||
-rw-r--r-- | kernel/sched/sched.h | 80 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 2 | ||||
-rw-r--r-- | kernel/smp.c | 22 | ||||
-rw-r--r-- | kernel/sys.c | 2 | ||||
-rw-r--r-- | kernel/time/hrtimer.c | 1 | ||||
-rw-r--r-- | kernel/time/posix-cpu-timers.c | 14 | ||||
-rw-r--r-- | kernel/trace/ring_buffer_benchmark.c | 3 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 4 |
19 files changed, 667 insertions, 441 deletions
diff --git a/kernel/exit.c b/kernel/exit.c index d13f2eec4bb8..5d30019ff953 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -115,32 +115,33 @@ static void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. - */ - task_cputime(tsk, &utime, &stime); - sig->utime += utime; - sig->stime += stime; - sig->gtime += task_gtime(tsk); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; } + /* + * Accumulate here the counters for all threads but the group leader + * as they die, so they can be added into the process-wide totals + * when those are taken. The group leader stays around as a zombie as + * long as there are other threads. When it gets reaped, the exit.c + * code will add its counts into these totals. We won't ever get here + * for the group leader, since it will have been the last reference on + * the signal_struct. 
+ */ + task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread @@ -1046,6 +1047,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; + write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1068,6 +1070,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); + write_sequnlock(&psig->stats_lock); spin_unlock_irq(&p->real_parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 8c162d102740..9b7d746d6d62 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst, return 0; } +void set_task_stack_end_magic(struct task_struct *tsk) +{ + unsigned long *stackend; + + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ +} + static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; - unsigned long *stackend; int node = tsk_fork_get_node(orig); int err; @@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) setup_thread_stack(tsk, 
orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); - stackend = end_of_stack(tsk); - *stackend = STACK_END_MAGIC; /* for overflow detection */ + set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); @@ -1067,6 +1073,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + seqlock_init(&sig->stats_lock); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e73efba98301..8a2e230fb86a 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) goto out; - t = p; - do { + for_each_thread(p, t) sched_move_task(t); - } while_each_thread(p, t); - out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f235c41a3532..44999505e1bf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -317,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) for (;;) { rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) return rq; raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); } } @@ -336,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) raw_spin_lock_irqsave(&p->pi_lock, *flags); rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) return rq; raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); } } @@ -433,7 
+439,15 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + ktime_t time; + s64 delta; + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense and can cause timer DoS. + */ + delta = max_t(s64, delay, 10000LL); + time = ktime_add_ns(timer->base->get_time(), delta); hrtimer_set_expires(timer, time); @@ -1027,7 +1041,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) + if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@ -1072,7 +1086,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) static void __migrate_swap_task(struct task_struct *p, int cpu) { - if (p->on_rq) { + if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; src_rq = task_rq(p); @@ -1198,7 +1212,7 @@ static int migration_cpu_stop(void *data); unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; - int running, on_rq; + int running, queued; unsigned long ncsw; struct rq *rq; @@ -1236,7 +1250,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ @@ -1268,7 +1282,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * running right now), it's preempted, and we should * yield - it could be a while. 
*/ - if (unlikely(on_rq)) { + if (unlikely(queued)) { ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); set_current_state(TASK_UNINTERRUPTIBLE); @@ -1462,7 +1476,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; /* if a worker is waking up, notify workqueue */ if (p->flags & PF_WQ_WORKER) @@ -1521,7 +1535,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) int ret = 0; rq = __task_rq_lock(p); - if (p->on_rq) { + if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); ttwu_do_wakeup(rq, p, wake_flags); @@ -1604,6 +1618,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) } } +void wake_up_if_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!is_idle_task(rq->curr)) + return; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + raw_spin_lock_irqsave(&rq->lock, flags); + if (is_idle_task(rq->curr)) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + raw_spin_unlock_irqrestore(&rq->lock, flags); + } +} + bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); @@ -1726,7 +1759,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) goto out; - if (!p->on_rq) + if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); @@ -1760,6 +1793,20 @@ int wake_up_state(struct task_struct *p, unsigned int state) } /* + * This function clears the sched_dl_entity static params. 
+ */ +void __dl_clear_params(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; +} + +/* * Perform scheduler related setup for a newly forked process p. * p is forked by current. * @@ -1783,10 +1830,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->dl.rb_node); hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - p->dl.dl_runtime = p->dl.runtime = 0; - p->dl.dl_deadline = p->dl.deadline = 0; - p->dl.dl_period = 0; - p->dl.flags = 0; + __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -1961,6 +2005,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -1969,6 +2015,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; @@ -2079,7 +2127,7 @@ void wake_up_new_task(struct task_struct *p) init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -2271,10 +2319,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ post_schedule(rq); -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - /* In this case, finish_task_switch does not reenable preemption */ - preempt_enable(); -#endif if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } @@ -2317,9 +2361,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ -#ifndef __ARCH_WANT_UNLOCKED_CTXSW 
spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -#endif context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ @@ -2447,7 +2489,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) * project cycles that may never be accounted to this * thread, breaking clock_gettime(). */ - if (task_current(rq, p) && p->on_rq) { + if (task_current(rq, p) && task_on_rq_queued(p)) { update_rq_clock(rq); ns = rq_clock_task(rq) - p->se.exec_start; if ((s64)ns < 0) @@ -2493,7 +2535,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * If we see ->on_cpu without ->on_rq, the task is leaving, and has * been accounted, so we're correct here as well. */ - if (!p->on_cpu || !p->on_rq) + if (!p->on_cpu || !task_on_rq_queued(p)) return p->se.sum_exec_runtime; #endif @@ -2656,6 +2698,9 @@ static noinline void __schedule_bug(struct task_struct *prev) */ static inline void schedule_debug(struct task_struct *prev) { +#ifdef CONFIG_SCHED_STACK_END_CHECK + BUG_ON(unlikely(task_stack_end_corrupted(prev))); +#endif /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path. 
Otherwise whine @@ -2797,7 +2842,7 @@ need_resched: switch_count = &prev->nvcsw; } - if (prev->on_rq || rq->skip_clock_update < 0) + if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) update_rq_clock(rq); next = pick_next_task(rq, prev); @@ -2962,7 +3007,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running, enqueue_flag = 0; + int oldprio, queued, running, enqueue_flag = 0; struct rq *rq; const struct sched_class *prev_class; @@ -2991,12 +3036,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); /* * Boosting condition are: @@ -3033,7 +3078,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, enqueue_flag); check_class_changed(rq, p, prev_class, oldprio); @@ -3044,7 +3089,7 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, on_rq; + int old_prio, delta, queued; unsigned long flags; struct rq *rq; @@ -3065,8 +3110,8 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); p->static_prio = NICE_TO_PRIO(nice); @@ -3075,7 +3120,7 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); /* * If the task increased its priority or is running and @@ -3347,7 +3392,7 @@ static int __sched_setscheduler(struct task_struct *p, { int newprio = dl_policy(attr->sched_policy) ? 
MAX_DL_PRIO - 1 : MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, on_rq, running; + int retval, oldprio, oldpolicy = -1, queued, running; int policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; @@ -3544,19 +3589,19 @@ change: return 0; } - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); prev_class = p->sched_class; __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (queued) { /* * We enqueue to tail when the priority of a task is * increased (user space view). @@ -3980,14 +4025,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); - goto out_unlock; + goto out_free_new_mask; } rcu_read_unlock(); } retval = security_task_setscheduler(p); if (retval) - goto out_unlock; + goto out_free_new_mask; cpuset_cpus_allowed(p, cpus_allowed); @@ -4000,13 +4045,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) * root_domain. 
*/ #ifdef CONFIG_SMP - if (task_has_dl_policy(p)) { - const struct cpumask *span = task_rq(p)->rd->span; - - if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { retval = -EBUSY; - goto out_unlock; + rcu_read_unlock(); + goto out_free_new_mask; } + rcu_read_unlock(); } #endif again: @@ -4024,7 +4070,7 @@ again: goto again; } } -out_unlock: +out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: free_cpumask_var(cpus_allowed); @@ -4508,7 +4554,7 @@ void show_state_filter(unsigned long state_filter) " task PC stack pid father\n"); #endif rcu_read_lock(); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: @@ -4516,7 +4562,7 @@ void show_state_filter(unsigned long state_filter) touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); - } while_each_thread(g, p); + } touch_all_softlockup_watchdogs(); @@ -4571,7 +4617,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; - idle->on_rq = 1; + idle->on_rq = TASK_ON_RQ_QUEUED; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif @@ -4592,6 +4638,33 @@ void init_idle(struct task_struct *idle, int cpu) } #ifdef CONFIG_SMP +/* + * move_queued_task - move a queued task to new rq. + * + * Returns (locked) new rq. Old rq's lock is released. 
+ */ +static struct rq *move_queued_task(struct task_struct *p, int new_cpu) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_held(&rq->lock); + + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(new_cpu); + + raw_spin_lock(&rq->lock); + BUG_ON(task_cpu(p) != new_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + + return rq; +} + void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { if (p->sched_class && p->sched_class->set_cpus_allowed) @@ -4648,14 +4721,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (p->on_rq) { + if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; - } + } else if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); out: task_rq_unlock(rq, p, &flags); @@ -4676,20 +4750,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); */ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { - struct rq *rq_dest, *rq_src; + struct rq *rq; int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; - rq_src = cpu_rq(src_cpu); - rq_dest = cpu_rq(dest_cpu); + rq = cpu_rq(src_cpu); raw_spin_lock(&p->pi_lock); - double_rq_lock(rq_src, rq_dest); + raw_spin_lock(&rq->lock); /* Already moved. */ if (task_cpu(p) != src_cpu) goto done; + /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; @@ -4698,16 +4772,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. 
*/ - if (p->on_rq) { - dequeue_task(rq_src, p, 0); - set_task_cpu(p, dest_cpu); - enqueue_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p, 0); - } + if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); done: ret = 1; fail: - double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); return ret; } @@ -4739,22 +4809,22 @@ void sched_setnuma(struct task_struct *p, int nid) { struct rq *rq; unsigned long flags; - bool on_rq, running; + bool queued, running; rq = task_rq_lock(p, &flags); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); p->numa_preferred_nid = nid; if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, 0); task_rq_unlock(rq, p, &flags); } @@ -4774,6 +4844,12 @@ static int migration_cpu_stop(void *data) * be on another cpu but it doesn't matter. */ local_irq_disable(); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_allowed + * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 
+ */ + sched_ttwu_pending(); __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); local_irq_enable(); return 0; @@ -5184,6 +5260,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb, { unsigned long flags; long cpu = (long)hcpu; + struct dl_bw *dl_b; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: @@ -5191,15 +5268,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb, /* explicitly allow suspend */ if (!(action & CPU_TASKS_FROZEN)) { - struct dl_bw *dl_b = dl_bw_of(cpu); bool overflow; int cpus; + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); cpus = dl_bw_cpus(cpu); overflow = __dl_overflow(dl_b, cpus, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + if (overflow) return notifier_from_errno(-EBUSY); } @@ -5742,7 +5823,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; - struct sched_domain *child; + struct sched_domain *sibling; int i; cpumask_clear(covered); @@ -5753,10 +5834,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; - child = *per_cpu_ptr(sdd->sd, i); + sibling = *per_cpu_ptr(sdd->sd, i); /* See the comment near build_group_mask(). 
*/ - if (!cpumask_test_cpu(i, sched_domain_span(child))) + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), @@ -5766,10 +5847,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - if (child->child) { - child = child->child; - cpumask_copy(sg_span, sched_domain_span(child)); - } else + if (sibling->child) + cpumask_copy(sg_span, sched_domain_span(sibling->child)); + else cpumask_set_cpu(i, sg_span); cpumask_or(covered, covered, sg_span); @@ -7120,13 +7200,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p) .sched_policy = SCHED_NORMAL, }; int old_prio = p->prio; - int on_rq; + int queued; - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); __setscheduler(rq, p, &attr); - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); resched_curr(rq); } @@ -7140,12 +7220,12 @@ void normalize_rt_tasks(void) unsigned long flags; struct rq *rq; - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { /* * Only normalize user tasks: */ - if (!p->mm) + if (p->flags & PF_KTHREAD) continue; p->se.exec_start = 0; @@ -7160,21 +7240,16 @@ void normalize_rt_tasks(void) * Renice negative nice level userspace * tasks back to 0: */ - if (task_nice(p) < 0 && p->mm) + if (task_nice(p) < 0) set_user_nice(p, 0); continue; } - raw_spin_lock(&p->pi_lock); - rq = __task_rq_lock(p); - + rq = task_rq_lock(p, &flags); normalize_task(rq, p); - - __task_rq_unlock(rq); - raw_spin_unlock(&p->pi_lock); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); + task_rq_unlock(rq, p, &flags); + } + read_unlock(&tasklist_lock); } #endif /* CONFIG_MAGIC_SYSRQ */ @@ -7314,19 +7389,19 @@ void sched_offline_group(struct task_group *tg) void sched_move_task(struct task_struct *tsk) { struct task_group *tg; - int 
on_rq, running; + int queued, running; unsigned long flags; struct rq *rq; rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->on_rq; + queued = task_on_rq_queued(tsk); - if (on_rq) + if (queued) dequeue_task(rq, tsk, 0); if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); + put_prev_task(rq, tsk); tg = container_of(task_css_check(tsk, cpu_cgrp_id, lockdep_is_held(&tsk->sighand->siglock)), @@ -7336,14 +7411,14 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, on_rq); + tsk->sched_class->task_move_group(tsk, queued); else #endif set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, tsk, 0); task_rq_unlock(rq, tsk, &flags); @@ -7361,10 +7436,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && task_rq(p)->rt.tg == tg) + for_each_process_thread(g, p) { + if (rt_task(p) && task_group(p) == tg) return 1; - } while_each_thread(g, p); + } return 0; } @@ -7573,6 +7648,7 @@ static int sched_dl_global_constraints(void) u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + struct dl_bw *dl_b; int cpu, ret = 0; unsigned long flags; @@ -7586,13 +7662,16 @@ static int sched_dl_global_constraints(void) * solutions is welcome! 
*/ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + if (ret) break; } @@ -7603,6 +7682,7 @@ static int sched_dl_global_constraints(void) static void sched_dl_do_global(void) { u64 new_bw = -1; + struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -7616,11 +7696,14 @@ static void sched_dl_do_global(void) * FIXME: As above... */ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); } } @@ -8001,7 +8084,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; quota = normalize_cfs_quota(tg, d); - parent_quota = parent_b->hierarchal_quota; + parent_quota = parent_b->hierarchical_quota; /* * ensure max(child_quota) <= parent_quota, inherit when no @@ -8012,7 +8095,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) else if (parent_quota != RUNTIME_INF && quota > parent_quota) return -EINVAL; } - cfs_b->hierarchal_quota = quota; + cfs_b->hierarchical_quota = quota; return 0; } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index bd95963dae80..539ca3ce071b 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; - if (later_mask && cpumask_and(later_mask, cp->free_cpus, - &p->cpus_allowed) && cpumask_and(later_mask, - later_mask, cpu_active_mask)) { + if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { best_cpu = cpumask_any(later_mask); goto out; } 
else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 72fdf06ef865..8394b1ee600c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) struct signal_struct *sig = tsk->signal; cputime_t utime, stime; struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; + unsigned int seq, nextseq; + unsigned long flags; rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; + /* Attempt a lockless read on the first round. */ + nextseq = 0; do { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: + seq = nextseq; + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); rcu_read_unlock(); } @@ -550,6 +555,23 @@ drop_precision: } /* + * Atomically advance counter to the new value. Interrupts, vcpu + * scheduling, and scaling inaccuracies can cause cputime_advance + * to be occasionally called with a new value smaller than counter. + * Let's enforce atomicity. + * + * Normally a caller will only go through this loop once, or not + * at all in case a previous caller updated counter the same jiffy. 
+ */ +static void cputime_advance(cputime_t *counter, cputime_t new) +{ + cputime_t old; + + while (new > (old = ACCESS_ONCE(*counter))) + cmpxchg_cputime(counter, old, new); +} + +/* * Adjust tick based cputime random precision against scheduler * runtime accounting. */ @@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, utime = rtime - stime; } - /* - * If the tick based count grows faster than the scheduler one, - * the result of the scaling may go backward. - * Let's enforce monotonicity. - */ - prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, utime); + cputime_advance(&prev->stime, stime); + cputime_advance(&prev->utime, utime); out: *ut = prev->utime; @@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) cputime_adjust(&cputime, &p->prev_cputime, ut, st); } -/* - * Must be called with siglock held. - */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 255ce138b652..abfaf3d9a29f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -530,7 +530,7 @@ again: update_rq_clock(rq); dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; - if (p->on_rq) { + if (task_on_rq_queued(p)) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (task_has_dl_policy(rq->curr)) check_preempt_curr_dl(rq, p, 0); @@ -997,10 +997,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SCHED_HRTICK static void start_hrtick_dl(struct rq *rq, struct task_struct *p) { - s64 delta = p->dl.dl_runtime - p->dl.runtime; - - if (delta > 10000) - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, p->dl.runtime); } #endif @@ -1030,7 +1027,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) * means a stop task can slip in, in which case we need to * re-start task selection. 
*/ - if (rq->stop && rq->stop->on_rq) + if (rq->stop && task_on_rq_queued(rq->stop)) return RETRY_TASK; } @@ -1124,10 +1121,8 @@ static void set_curr_task_dl(struct rq *rq) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && - (p->nr_cpus_allowed > 1)) + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return 1; - return 0; } @@ -1169,6 +1164,13 @@ static int find_later_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; + /* + * We have to consider system topology and task affinity + * first, then we can look for a suitable cpu. + */ + cpumask_copy(later_mask, task_rq(task)->rd->span); + cpumask_and(later_mask, later_mask, cpu_active_mask); + cpumask_and(later_mask, later_mask, &task->cpus_allowed); best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask); if (best_cpu == -1) @@ -1257,7 +1259,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || - task_running(rq, task) || !task->on_rq)) { + task_running(rq, task) || + !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -1296,7 +1299,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); return p; @@ -1443,7 +1446,7 @@ static int pull_dl_task(struct rq *this_rq) dl_time_before(p->dl.deadline, this_rq->dl.earliest_dl.curr))) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * Then we pull iff p has actually an earlier @@ -1569,6 +1572,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) hrtimer_try_to_cancel(&p->dl.dl_timer); + __dl_clear_params(p); + 
#ifdef CONFIG_SMP /* * Since this might be the only -deadline task on the rq, @@ -1596,7 +1601,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (unlikely(p->dl.dl_throttled)) return; - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ @@ -1614,7 +1619,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (p->on_rq || rq->curr == p) { + if (task_on_rq_queued(p) || rq->curr == p) { #ifdef CONFIG_SMP /* * This might be too much, but unfortunately diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 627b3c34b821..ce33780d8f20 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; - unsigned long flags; SEQ_printf(m, "\nrunnable tasks:\n" @@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "------------------------------------------------------" "----------------------------------------------------\n"); - read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, p) { + rcu_read_lock(); + for_each_process_thread(g, p) { if (task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); + } + rcu_read_unlock(); } void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) @@ -333,9 +330,7 @@ do { \ print_cfs_stats(m, cpu); print_rt_stats(m, cpu); - rcu_read_lock(); print_rq(m, rq, cpu); - rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); SEQ_printf(m, "\n"); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82088b29704e..b78280c59b46 100644 --- 
a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -23,6 +23,7 @@ #include <linux/latencytop.h> #include <linux/sched.h> #include <linux/cpumask.h> +#include <linux/cpuidle.h> #include <linux/slab.h> #include <linux/profile.h> #include <linux/interrupt.h> @@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP +static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); @@ -1038,7 +1040,8 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int cpu, cpus = 0; + int smt, cpu, cpus = 0; + unsigned long capacity; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1062,8 +1065,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->task_capacity = - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); + /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); + capacity = cpus / smt; /* cores */ + + ns->task_capacity = min_t(unsigned, capacity, + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } @@ -1206,7 +1213,7 @@ static void task_numa_compare(struct task_numa_env *env, if (!cur) { /* Is there capacity at our destination? */ - if (env->src_stats.has_free_capacity && + if (env->src_stats.nr_running <= env->src_stats.task_capacity && !env->dst_stats.has_free_capacity) goto unlock; @@ -1252,6 +1259,13 @@ balance: if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; + /* + * One idle CPU per node is evaluated for a task numa move. + * Call select_idle_sibling to maybe find a better one. 
+ */ + if (!cur) + env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + assign: task_numa_assign(env, cur, imp); unlock: @@ -1775,7 +1789,7 @@ void task_numa_free(struct task_struct *p) list_del(&p->numa_entry); grp->nr_tasks--; spin_unlock_irqrestore(&grp->lock, flags); - rcu_assign_pointer(p->numa_group, NULL); + RCU_INIT_POINTER(p->numa_group, NULL); put_numa_group(grp); } @@ -1804,10 +1818,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; - /* Do not worry about placement if exiting */ - if (p->state == TASK_DEAD) - return; - /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults_memory)) { int size = sizeof(*p->numa_faults_memory) * @@ -2211,8 +2221,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) /* * As y^PERIOD = 1/2, we can combine - * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) - * With a look-up table which covers k^n (n<PERIOD) + * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) + * With a look-up table which covers y^n (n<PERIOD) * * To achieve constant time decay_load. */ @@ -2377,6 +2387,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; tg_contrib -= cfs_rq->tg_load_contrib; + if (!tg_contrib) + return; + if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { atomic_long_add(tg_contrib, &tg->load_avg); cfs_rq->tg_load_contrib += tg_contrib; @@ -3892,14 +3905,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) resched_curr(rq); return; } - - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. 
- */ - if (rq->curr != p) - delta = max_t(s64, 10000LL, delta); - hrtick_start(rq, delta); } } @@ -4087,7 +4092,7 @@ static unsigned long capacity_of(int cpu) static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) @@ -4276,8 +4281,8 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; + s64 this_eff_load, prev_eff_load; int idx, this_cpu, prev_cpu; - unsigned long tl_per_task; struct task_group *tg; unsigned long weight; int balanced; @@ -4320,47 +4325,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - if (this_load > 0) { - s64 this_eff_load, prev_eff_load; + this_eff_load = 100; + this_eff_load *= capacity_of(prev_cpu); - this_eff_load = 100; - this_eff_load *= capacity_of(prev_cpu); + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= capacity_of(this_cpu); + + if (this_load > 0) { this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + } - balanced = this_eff_load <= prev_eff_load; - } else - balanced = true; - - /* - * If the currently running task will sleep within - * a reasonable amount of time then attract this newly - * woken task: - */ - if (sync && balanced) - return 1; + balanced = this_eff_load <= prev_eff_load; schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || - (this_load <= load && - this_load + target_load(prev_cpu, idx) 
<= tl_per_task)) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); + if (!balanced) + return 0; - return 1; - } - return 0; + schedstat_inc(sd, ttwu_move_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); + + return 1; } /* @@ -4428,20 +4416,46 @@ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; - int idlest = -1; + unsigned int min_exit_latency = UINT_MAX; + u64 latest_idle_timestamp = 0; + int least_loaded_cpu = this_cpu; + int shallowest_idle_cpu = -1; int i; /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; + if (idle_cpu(i)) { + struct rq *rq = cpu_rq(i); + struct cpuidle_state *idle = idle_get_state(rq); + if (idle && idle->exit_latency < min_exit_latency) { + /* + * We give priority to a CPU whose idle state + * has the smallest exit latency irrespective + * of any idle timestamp. + */ + min_exit_latency = idle->exit_latency; + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } else if ((!idle || idle->exit_latency == min_exit_latency) && + rq->idle_stamp > latest_idle_timestamp) { + /* + * If equal or no active idle state, then + * the most recently idled CPU might have + * a warmer cache. + */ + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } + } else { + load = weighted_cpuload(i); + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + least_loaded_cpu = i; + } } } - return idlest; + return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : least_loaded_cpu; } /* @@ -4513,11 +4527,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (p->nr_cpus_allowed == 1) return prev_cpu; - if (sd_flag & SD_BALANCE_WAKE) { - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) - want_affine = 1; - new_cpu = prev_cpu; - } + if (sd_flag & SD_BALANCE_WAKE) + want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -4704,7 +4715,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as move_task(), in which we + * This is possible from callers such as attach_tasks(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -5112,27 +5123,18 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + struct list_head tasks; }; /* - * move_task - move a task from one runqueue to another runqueue. - * Both runqueues must be locked. 
- */ -static void move_task(struct task_struct *p, struct lb_env *env) -{ - deactivate_task(env->src_rq, p, 0); - set_task_cpu(p, env->dst_cpu); - activate_task(env->dst_rq, p, 0); - check_preempt_curr(env->dst_rq, p, 0); -} - -/* * Is this task likely cache-hot: */ static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; + lockdep_assert_held(&env->src_rq->lock); + if (p->sched_class != &fair_sched_class) return 0; @@ -5252,6 +5254,9 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot = 0; + + lockdep_assert_held(&env->src_rq->lock); + /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or @@ -5310,24 +5315,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); - if (migrate_improves_locality(p, env)) { -#ifdef CONFIG_SCHEDSTATS + if (migrate_improves_locality(p, env) || !tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } -#endif - return 1; - } - - if (!tsk_cache_hot || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - - if (tsk_cache_hot) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); - } - return 1; } @@ -5336,47 +5329,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) } /* - * move_one_task tries to move exactly one task from busiest to this_rq, as + * detach_task() -- detach the task for the migration specified in env + */ +static void detach_task(struct task_struct *p, struct lb_env *env) +{ + lockdep_assert_held(&env->src_rq->lock); + + deactivate_task(env->src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, env->dst_cpu); +} + +/* + * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as * part of active balancing 
operations within "domain". - * Returns 1 if successful and 0 otherwise. * - * Called with both runqueues locked. + * Returns a task if successful and NULL otherwise. */ -static int move_one_task(struct lb_env *env) +static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p, *n; + lockdep_assert_held(&env->src_rq->lock); + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { if (!can_migrate_task(p, env)) continue; - move_task(p, env); + detach_task(p, env); + /* - * Right now, this is only the second place move_task() - * is called, so we can safely collect move_task() - * stats here rather than inside move_task(). + * Right now, this is only the second place where + * lb_gained[env->idle] is updated (other is detach_tasks) + * so we can safely collect stats here rather than + * inside detach_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); - return 1; + return p; } - return 0; + return NULL; } static const unsigned int sched_nr_migrate_break = 32; /* - * move_tasks tries to move up to imbalance weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. + * detach_tasks() -- tries to detach up to imbalance weighted load from + * busiest_rq, as part of a balancing operation within domain "sd". * - * Called with both runqueues locked. + * Returns number of detached tasks if successful and 0 otherwise. 
*/ -static int move_tasks(struct lb_env *env) +static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; struct task_struct *p; unsigned long load; - int pulled = 0; + int detached = 0; + + lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; @@ -5407,14 +5416,16 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - move_task(p, env); - pulled++; + detach_task(p, env); + list_add(&p->se.group_node, &env->tasks); + + detached++; env->imbalance -= load; #ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize + * kernels will stop after the first task is detached to minimize * the critical section. */ if (env->idle == CPU_NEWLY_IDLE) @@ -5434,13 +5445,58 @@ next: } /* - * Right now, this is one of only two places move_task() is called, - * so we can safely collect move_task() stats here rather than - * inside move_task(). + * Right now, this is one of only two places we collect this stat + * so we can safely collect detach_one_task() stats here rather + * than inside detach_one_task(). */ - schedstat_add(env->sd, lb_gained[env->idle], pulled); + schedstat_add(env->sd, lb_gained[env->idle], detached); + + return detached; +} + +/* + * attach_task() -- attach the task detached by detach_task() to its new rq. + */ +static void attach_task(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_held(&rq->lock); + + BUG_ON(task_rq(p) != rq); + p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); + check_preempt_curr(rq, p, 0); +} + +/* + * attach_one_task() -- attaches the task returned from detach_one_task() to + * its new rq. + */ +static void attach_one_task(struct rq *rq, struct task_struct *p) +{ + raw_spin_lock(&rq->lock); + attach_task(rq, p); + raw_spin_unlock(&rq->lock); +} + +/* + * attach_tasks() -- attaches all tasks detached by detach_tasks() to their + * new rq. 
+ */ +static void attach_tasks(struct lb_env *env) +{ + struct list_head *tasks = &env->tasks; + struct task_struct *p; + + raw_spin_lock(&env->dst_rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); - return pulled; + attach_task(env->dst_rq, p); + } + + raw_spin_unlock(&env->dst_rq->lock); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -5559,6 +5615,13 @@ static unsigned long task_h_load(struct task_struct *p) #endif /********** Helpers for find_busiest_group ************************/ + +enum group_type { + group_other = 0, + group_imbalanced, + group_overloaded, +}; + /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -5572,7 +5635,7 @@ struct sg_lb_stats { unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; - int group_imb; /* Is there an imbalance in the group ? */ + enum group_type group_type; int group_has_free_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -5610,6 +5673,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, + .sum_nr_running = 0, + .group_type = group_other, }, }; } @@ -5652,19 +5717,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) return default_scale_capacity(sd, cpu); } -static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) +static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long weight = sd->span_weight; - unsigned long smt_gain = sd->smt_gain; + if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; - smt_gain /= weight; - - return smt_gain; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - return 
default_scale_smt_capacity(sd, cpu); + return default_scale_cpu_capacity(sd, cpu); } static unsigned long scale_rt_capacity(int cpu) @@ -5703,18 +5766,15 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long weight = sd->span_weight; unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_smt_capacity(sd, cpu); - else - capacity *= default_scale_smt_capacity(sd, cpu); + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_cpu_capacity(sd, cpu); + else + capacity *= default_scale_cpu_capacity(sd, cpu); - capacity >>= SCHED_CAPACITY_SHIFT; - } + capacity >>= SCHED_CAPACITY_SHIFT; sdg->sgc->capacity_orig = capacity; @@ -5891,6 +5951,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro return capacity_factor; } +static enum group_type +group_classify(struct sched_group *group, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running > sgs->group_capacity_factor) + return group_overloaded; + + if (sg_imbalanced(group)) + return group_imbalanced; + + return group_other; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. 
@@ -5920,7 +5992,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->sum_nr_running += rq->nr_running; + sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) *overload = true; @@ -5942,9 +6014,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; - - sgs->group_imb = sg_imbalanced(group); sgs->group_capacity_factor = sg_capacity_factor(env, group); + sgs->group_type = group_classify(group, sgs); if (sgs->group_capacity_factor > sgs->sum_nr_running) sgs->group_has_free_capacity = 1; @@ -5968,13 +6039,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, struct sched_group *sg, struct sg_lb_stats *sgs) { - if (sgs->avg_load <= sds->busiest_stat.avg_load) - return false; + struct sg_lb_stats *busiest = &sds->busiest_stat; - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_type > busiest->group_type) return true; - if (sgs->group_imb) + if (sgs->group_type < busiest->group_type) + return false; + + if (sgs->avg_load <= busiest->avg_load) + return false; + + /* This is the busiest node in its class. */ + if (!(env->sd->flags & SD_ASYM_PACKING)) return true; /* @@ -5982,8 +6059,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * numbered CPUs in the group, therefore mark all groups * higher than ourself as busy. 
*/ - if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && - env->dst_cpu < group_first_cpu(sg)) { + if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { if (!sds->busiest) return true; @@ -6228,7 +6304,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local = &sds->local_stat; busiest = &sds->busiest_stat; - if (busiest->group_imb) { + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages * to ensure cpu-load equilibrium, look at wider averages. XXX @@ -6248,12 +6324,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } - if (!busiest->group_imb) { - /* - * Don't want to pull so many tasks that a group would go idle. - * Except of course for the group_imb case, since then we might - * have to drop below capacity to reach cpu-load equilibrium. - */ + /* + * If there aren't any idle cpus, avoid creating some. + */ + if (busiest->group_type == group_overloaded && + local->group_type == group_overloaded) { load_above_capacity = (busiest->sum_nr_running - busiest->group_capacity_factor); @@ -6337,7 +6412,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ - if (busiest->group_imb) + if (busiest->group_type == group_imbalanced) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ @@ -6346,7 +6421,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* - * If the local group is more busy than the selected busiest group + * If the local group is busier than the selected busiest group * don't try and pull any tasks. 
*/ if (local->avg_load >= busiest->avg_load) @@ -6361,13 +6436,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (env->idle == CPU_IDLE) { /* - * This cpu is idle. If the busiest group load doesn't - * have more tasks than the number of available cpu's and - * there is no imbalance between this and busiest group - * wrt to idle cpu's, it is balanced. + * This cpu is idle. If the busiest group is not overloaded + * and there is no imbalance between this and busiest group + * wrt idle cpus, it is balanced. The imbalance becomes + * significant if the diff is greater than 1 otherwise we + * might end up to just move the imbalance on another group */ - if ((local->idle_cpus < busiest->idle_cpus) && - busiest->sum_nr_running <= busiest->group_weight) + if ((busiest->group_type != group_overloaded) && + (local->idle_cpus <= (busiest->idle_cpus + 1))) goto out_balanced; } else { /* @@ -6550,6 +6626,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .loop_break = sched_nr_migrate_break, .cpus = cpus, .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), }; /* @@ -6599,23 +6676,30 @@ redo: env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - local_irq_save(flags); - double_rq_lock(env.dst_rq, busiest); + raw_spin_lock_irqsave(&busiest->lock, flags); /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ - cur_ld_moved = move_tasks(&env); - ld_moved += cur_ld_moved; - double_rq_unlock(env.dst_rq, busiest); - local_irq_restore(flags); + cur_ld_moved = detach_tasks(&env); /* - * some other cpu did the load balance for us. + * We've detached some tasks from busiest_rq. Every + * task is masked "TASK_ON_RQ_MIGRATING", so we can safely + * unlock busiest->lock, and we are able to be sure + * that nobody can manipulate the tasks in parallel. + * See task_rq_lock() family for the details. 
*/ - if (cur_ld_moved && env.dst_cpu != smp_processor_id()) - resched_cpu(env.dst_cpu); + + raw_spin_unlock(&busiest->lock); + + if (cur_ld_moved) { + attach_tasks(&env); + ld_moved += cur_ld_moved; + } + + local_irq_restore(flags); if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; @@ -6665,10 +6749,8 @@ more_balance: if (sd_parent) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) *group_imbalance = 1; - } else if (*group_imbalance) - *group_imbalance = 0; } /* All tasks on this runqueue were pinned by CPU affinity */ @@ -6679,7 +6761,7 @@ more_balance: env.loop_break = sched_nr_migrate_break; goto redo; } - goto out_balanced; + goto out_all_pinned; } } @@ -6744,7 +6826,7 @@ more_balance: * If we've begun active balancing, start to back off. This * case may not be covered by the all_pinned logic if there * is only 1 task on the busy runqueue (because we don't call - * move_tasks). + * detach_tasks). */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -6753,6 +6835,23 @@ more_balance: goto out; out_balanced: + /* + * We reach balance although we may have faced some affinity + * constraints. Clear the imbalance flag if it was set. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgc->imbalance; + + if (*group_imbalance) + *group_imbalance = 0; + } + +out_all_pinned: + /* + * We reach balance because all tasks are pinned at this level so + * we can't migrate them. Let the imbalance flag set so parent level + * can try to migrate them. 
+ */ schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; @@ -6914,6 +7013,7 @@ static int active_load_balance_cpu_stop(void *data) int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; + struct task_struct *p = NULL; raw_spin_lock_irq(&busiest_rq->lock); @@ -6933,9 +7033,6 @@ static int active_load_balance_cpu_stop(void *data) */ BUG_ON(busiest_rq == target_rq); - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { @@ -6956,16 +7053,22 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); - if (move_one_task(&env)) + p = detach_one_task(&env); + if (p) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); } rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); + raw_spin_unlock(&busiest_rq->lock); + + if (p) + attach_one_task(target_rq, p); + + local_irq_enable(); + return 0; } @@ -7465,7 +7568,7 @@ static void task_fork_fair(struct task_struct *p) static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->se.on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7490,11 +7593,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it's on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !on_rq, then only when + * If it's queued, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !queued, then only when * the task is sleeping will it still have non-normalized vruntime. 
*/ - if (!p->on_rq && p->state != TASK_RUNNING) { + if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7521,15 +7624,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - struct sched_entity *se = &p->se; #ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *se = &p->se; /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!se->on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7575,7 +7678,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int queued) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; @@ -7594,7 +7697,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * fair sleeper stuff for the first placement, but who cares. */ /* - * When !on_rq, vruntime of the task has usually NOT been normalized. + * When !queued, vruntime of the task has usually NOT been normalized. * But there are some cases where it has already been normalized: * * - Moving a forked child which is waiting for being woken up by @@ -7605,14 +7708,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - on_rq = 1; + if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) + queued = 1; - if (!on_rq) + if (!queued) se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); se->depth = se->parent ? 
se->parent->depth + 1 : 0; - if (!on_rq) { + if (!queued) { cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 11e7bc434f43..c47fce75e666 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -147,6 +147,9 @@ use_default: clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) goto use_default; + /* Take note of the planned idle state. */ + idle_set_state(this_rq(), &drv->states[next_state]); + /* * Enter the idle state previously returned by the governor decision. * This function will block until an interrupt occurs and will take @@ -154,6 +157,9 @@ use_default: */ entered_state = cpuidle_enter(drv, dev, next_state); + /* The cpu is no longer idle or about to enter idle. */ + idle_set_state(this_rq(), NULL); + if (broadcast) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5f6edca4fafd..87ea5bf1b87f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) * means a dl or stop task can slip in, in which case we need * to re-start task selection. 
*/ - if (unlikely((rq->stop && rq->stop->on_rq) || + if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || rq->dl.dl_nr_running)) return RETRY_TASK; } @@ -1468,8 +1468,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ - if (p) - dequeue_pushable_task(rq, p); + dequeue_pushable_task(rq, p); set_post_schedule(rq); @@ -1624,7 +1623,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, tsk_cpus_allowed(task)) || task_running(rq, task) || - !task->on_rq)) { + !task_on_rq_queued(task))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1658,7 +1657,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); return p; @@ -1809,7 +1808,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * There's a chance that p is higher in priority @@ -1870,7 +1869,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, BUG_ON(!rt_task(p)); - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; weight = cpumask_weight(new_mask); @@ -1936,7 +1935,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!p->on_rq || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; if (pull_rt_task(rq)) @@ -1970,7 +1969,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * If that current running task is also an RT task * then see if we can move to another run queue. 
*/ - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && /* Don't resched if we changed runqueues */ @@ -1989,7 +1988,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; if (rq->curr == p) { @@ -2073,7 +2072,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) for_each_sched_rt_entity(rt_se) { if (rt_se->run_list.prev != rt_se->run_list.next) { requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + resched_curr(rq); return; } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 579712f4e9d5..6130251de280 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -14,6 +14,11 @@ #include "cpuacct.h" struct rq; +struct cpuidle_state; + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 extern __read_mostly int scheduler_running; @@ -126,6 +131,9 @@ struct rt_bandwidth { u64 rt_runtime; struct hrtimer rt_period_timer; }; + +void __dl_clear_params(struct task_struct *p); + /* * To keep the bandwidth of -deadline tasks and groups under control * we need some place where: @@ -184,7 +192,7 @@ struct cfs_bandwidth { raw_spinlock_t lock; ktime_t period; u64 quota, runtime; - s64 hierarchal_quota; + s64 hierarchical_quota; u64 runtime_expires; int idle, timer_active; @@ -636,6 +644,11 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif }; static inline int cpu_of(struct rq *rq) @@ -647,7 +660,7 @@ static inline int cpu_of(struct rq *rq) #endif } -DECLARE_PER_CPU(struct rq, runqueues); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() 
(&__get_cpu_var(runqueues)) @@ -942,6 +955,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #endif } +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} + +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_MIGRATING; +} #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) @@ -953,7 +975,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p) # define finish_arch_post_lock_switch() do { } while (0) #endif -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -991,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) raw_spin_unlock_irq(&rq->lock); } -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif - raw_spin_unlock(&rq->lock); -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. 
- */ - smp_wmb(); - prev->on_cpu = 0; -#endif - local_irq_enable(); -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - /* * wake flags */ @@ -1180,6 +1172,30 @@ static inline void idle_exit_fair(struct rq *rq) { } #endif +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index bfe0edadbfbb..67426e529f59 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (!stop || !stop->on_rq) + if (!stop || !task_on_rq_queued(stop)) return NULL; put_prev_task(rq, prev); diff --git a/kernel/smp.c b/kernel/smp.c index aff8aa14f547..9e0d0b289118 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -13,6 +13,7 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <linux/sched.h> #include "smpboot.h" @@ -699,3 +700,24 @@ void kick_all_cpus_sync(void) smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); + +/** + * wake_up_all_idle_cpus - break all cpus out of idle + * wake_up_all_idle_cpus try to break all cpus which is in idle state even + * including idle polling cpus, for non-idle cpus, we will do nothing + * for them. 
+ */ +void wake_up_all_idle_cpus(void) +{ + int cpu; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + + wake_up_if_idle(cpu); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); diff --git a/kernel/sys.c b/kernel/sys.c index dfce4debd138..1eaa2f0b0246 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -869,11 +869,9 @@ void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; - spin_lock_irq(&current->sighand->siglock); thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - spin_unlock_irq(&current->sighand->siglock); tms->tms_utime = cputime_to_clock_t(tgutime); tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c2fe7de2842..ab370ffffd53 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, */ if (!expires) { schedule(); - __set_current_state(TASK_RUNNING); return -EINTR; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..492b986195d5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, if (same_thread_group(tsk, current)) err = cpu_clock_sample(which_clock, tsk, &rtn); } else { - unsigned long flags; - struct sighand_struct *sighand; - - /* - * while_each_thread() is not yet entirely RCU safe, - * keep locking the group while sampling process - * clock for now. 
- */ - sighand = lock_task_sighand(tsk, &flags); - if (!sighand) - return err; - if (tsk == current || thread_group_leader(tsk)) err = cpu_clock_sample_group(which_clock, tsk, &rtn); - - unlock_task_sighand(tsk, &flags); } if (!err) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0434ff1b808e..3f9e328c30b5 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -205,7 +205,6 @@ static void ring_buffer_consumer(void) break; schedule(); - __set_current_state(TASK_RUNNING); } reader_finish = 0; complete(&read_done); @@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg) break; schedule(); - __set_current_state(TASK_RUNNING); } __set_current_state(TASK_RUNNING); @@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg) trace_printk("Sleeping for 10 secs\n"); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ * SLEEP_TIME); - __set_current_state(TASK_RUNNING); } if (kill_test) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8a4e5cb66a4c..16eddb308c33 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,7 +13,6 @@ #include <linux/sysctl.h> #include <linux/init.h> #include <linux/fs.h> -#include <linux/magic.h> #include <asm/setup.h> @@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - if ((current != &init_task && - *(end_of_stack(current)) != STACK_END_MAGIC)) { + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); } |