diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/auto_group.c | 2 | ||||
-rw-r--r-- | kernel/sched/clock.c | 2 | ||||
-rw-r--r-- | kernel/sched/core.c | 225 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 75 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 59 | ||||
-rw-r--r-- | kernel/sched/fair.c | 316 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 1 | ||||
-rw-r--r-- | kernel/sched/rt.c | 2 | ||||
-rw-r--r-- | kernel/sched/sched.h | 73 | ||||
-rw-r--r-- | kernel/sched/wait.c | 28 |
10 files changed, 510 insertions, 273 deletions
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 750ed601ddf7..a5d966cb8891 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -212,7 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); + err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]); if (!err) ag->nice = nice; up_write(&ag->lock); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c0a205101c23..caf4041f5b0a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,7 +1,7 @@ /* * sched_clock for unstable cpu clocks * - * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * * Updates and enhancements: * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com> diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9319e..77d97a6fc715 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -731,7 +731,7 @@ bool sched_can_stop_tick(void) if (current->policy == SCHED_RR) { struct sched_rt_entity *rt_se = ¤t->rt; - return rt_se->run_list.prev == rt_se->run_list.next; + return list_is_singular(&rt_se->run_list); } /* @@ -823,8 +823,8 @@ static void set_load_weight(struct task_struct *p) return; } - load->weight = scale_load(prio_to_weight[prio]); - load->inv_weight = prio_to_wmult[prio]; + load->weight = scale_load(sched_prio_to_weight[prio]); + load->inv_weight = sched_prio_to_wmult[prio]; } static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1071,8 +1071,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new { lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + dequeue_task(rq, p, 0); set_task_cpu(p, new_cpu); raw_spin_unlock(&rq->lock); @@ -1080,8 +1080,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new raw_spin_lock(&rq->lock); BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; enqueue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); return rq; @@ -1274,6 +1274,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !p->on_rq); + /* + * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, + * because schedstat_wait_{start,end} rebase migrating task's wait_start + * time relying on p->on_rq. + */ + WARN_ON_ONCE(p->state == TASK_RUNNING && + p->sched_class == &fair_sched_class && + (p->on_rq && !task_on_rq_migrating(p))); + #ifdef CONFIG_LOCKDEP /* * The caller should hold either p->pi_lock or rq->lock, when changing @@ -1310,9 +1319,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); } else { /* @@ -1905,6 +1916,97 @@ static void ttwu_queue(struct task_struct *p, int cpu) raw_spin_unlock(&rq->lock); } +/* + * Notes on Program-Order guarantees on SMP systems. + * + * MIGRATION + * + * The basic program-order guarantee on SMP systems is that when a task [t] + * migrates, all its activity on its old cpu [c0] happens-before any subsequent + * execution on its new cpu [c1]. + * + * For migration (of runnable tasks) this is provided by the following means: + * + * A) UNLOCK of the rq(c0)->lock scheduling out task t + * B) migration for t is required to synchronize *both* rq(c0)->lock and + * rq(c1)->lock (if not at the same time, then in that order). + * C) LOCK of the rq(c1)->lock scheduling in task + * + * Transitivity guarantees that B happens after A and C after B. + * Note: we only require RCpc transitivity. + * Note: the cpu doing B need not be c0 or c1 + * + * Example: + * + * CPU0 CPU1 CPU2 + * + * LOCK rq(0)->lock + * sched-out X + * sched-in Y + * UNLOCK rq(0)->lock + * + * LOCK rq(0)->lock // orders against CPU0 + * dequeue X + * UNLOCK rq(0)->lock + * + * LOCK rq(1)->lock + * enqueue X + * UNLOCK rq(1)->lock + * + * LOCK rq(1)->lock // orders against CPU2 + * sched-out Z + * sched-in X + * UNLOCK rq(1)->lock + * + * + * BLOCKING -- aka. SLEEP + WAKEUP + * + * For blocking we (obviously) need to provide the same guarantee as for + * migration. However the means are completely different as there is no lock + * chain to provide order. Instead we do: + * + * 1) smp_store_release(X->on_cpu, 0) + * 2) smp_cond_acquire(!X->on_cpu) + * + * Example: + * + * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) + * + * LOCK rq(0)->lock LOCK X->pi_lock + * dequeue X + * sched-out X + * smp_store_release(X->on_cpu, 0); + * + * smp_cond_acquire(!X->on_cpu); + * X->state = WAKING + * set_task_cpu(X,2) + * + * LOCK rq(2)->lock + * enqueue X + * X->state = RUNNING + * UNLOCK rq(2)->lock + * + * LOCK rq(2)->lock // orders against CPU1 + * sched-out Z + * sched-in X + * UNLOCK rq(2)->lock + * + * UNLOCK X->pi_lock + * UNLOCK rq(0)->lock + * + * + * However; for wakeups there is a second guarantee we must provide, namely we + * must observe the state that lead to our wakeup. That is, not only must our + * task observe its own prior state, it must also observe the stores prior to + * its wakeup. + * + * This means that any means of doing remote wakeups must order the CPU doing + * the wakeup against the CPU the task is going to end up running on. This, + * however, is already required for the regular Program-Order guarantee above, + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). + * + */ + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -1947,15 +2049,34 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #ifdef CONFIG_SMP /* - * If the owning (remote) cpu is still in the middle of schedule() with - * this task as prev, wait until its done referencing the task. + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * [S] ->on_cpu = 1; [L] ->on_rq + * UNLOCK rq->lock + * RMB + * LOCK rq->lock + * [S] ->on_rq = 0; [L] ->on_cpu + * + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock + * from the consecutive calls to schedule(); the first switching to our + * task, the second putting it to sleep. */ - while (p->on_cpu) - cpu_relax(); + smp_rmb(); + /* - * Pairs with the smp_wmb() in finish_lock_switch(). + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until its done referencing the task. + * + * Pairs with the smp_store_release() in finish_lock_switch(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. */ - smp_rmb(); + smp_cond_acquire(!p->on_cpu); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -2039,7 +2160,6 @@ out: */ int wake_up_process(struct task_struct *p) { - WARN_ON(task_is_stopped_or_traced(p)); return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); @@ -2085,6 +2205,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +#endif + #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -3085,7 +3209,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3104,13 +3227,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -5847,13 +5973,13 @@ static int init_rootdomain(struct root_domain *rd) { memset(rd, 0, sizeof(*rd)); - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; init_dl_bw(&rd->dl_bw); @@ -7331,6 +7457,9 @@ int in_sched_functions(unsigned long addr) */ struct task_group root_task_group; LIST_HEAD(task_groups); + +/* Cacheline aligned slab cache for task_group */ +static struct kmem_cache *task_group_cache __read_mostly; #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); @@ -7388,11 +7517,12 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED + task_group_cache = KMEM_CACHE(task_group, 0); + list_add(&root_task_group.list, &task_groups); INIT_LIST_HEAD(&root_task_group.children); INIT_LIST_HEAD(&root_task_group.siblings); autogroup_init(&init_task); - #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { @@ -7673,7 +7803,7 @@ static void free_sched_group(struct task_group *tg) free_fair_sched_group(tg); free_rt_sched_group(tg); autogroup_free(tg); - kfree(tg); + kmem_cache_free(task_group_cache, tg); } /* allocate runqueue etc for a new task group */ @@ -7681,7 +7811,7 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - tg = kzalloc(sizeof(*tg), GFP_KERNEL); + tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); if (!tg) return ERR_PTR(-ENOMEM); @@ -8217,12 +8347,12 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private) sched_move_task(task); } -static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -8235,12 +8365,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } @@ -8586,3 +8716,44 @@ void dump_cpu_task(int cpu) pr_info("Task dump for CPU %d:\n", cpu); sched_show_task(cpu_curr(cpu)); } + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +const int sched_prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +const u32 sched_prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 26a54461bf59..d5ff5c6bf829 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -466,7 +466,7 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) return; if (sched_clock_irqtime) { @@ -680,7 +680,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long long delta = vtime_delta(tsk); - WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); + WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap += delta; /* CHECKME: always safe to convert nsecs to cputime? */ @@ -696,37 +696,37 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_gen_account_irq_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_user_enter(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_guest_enter(struct task_struct *tsk) @@ -738,19 +738,19 @@ void vtime_guest_enter(struct task_struct *tsk) * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. */ - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); current->flags |= PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); current->flags &= ~PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); @@ -763,24 +763,26 @@ void vtime_account_idle(struct task_struct *tsk) void arch_vtime_task_switch(struct task_struct *prev) { - write_seqlock(&prev->vtime_seqlock); - prev->vtime_snap_whence = VTIME_SLEEPING; - write_sequnlock(&prev->vtime_seqlock); + write_seqcount_begin(&prev->vtime_seqcount); + prev->vtime_snap_whence = VTIME_INACTIVE; + write_seqcount_end(&prev->vtime_seqcount); - write_seqlock(¤t->vtime_seqlock); + write_seqcount_begin(¤t->vtime_seqcount); current->vtime_snap_whence = VTIME_SYS; current->vtime_snap = sched_clock_cpu(smp_processor_id()); - write_sequnlock(¤t->vtime_seqlock); + write_seqcount_end(¤t->vtime_seqcount); } void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; - write_seqlock_irqsave(&t->vtime_seqlock, flags); + local_irq_save(flags); + write_seqcount_begin(&t->vtime_seqcount); t->vtime_snap_whence = VTIME_SYS; t->vtime_snap = sched_clock_cpu(cpu); - write_sequnlock_irqrestore(&t->vtime_seqlock, flags); + write_seqcount_end(&t->vtime_seqcount); + local_irq_restore(flags); } cputime_t task_gtime(struct task_struct *t) @@ -788,14 +790,17 @@ cputime_t task_gtime(struct task_struct *t) unsigned int seq; cputime_t gtime; + if (!vtime_accounting_enabled()) + return t->gtime; + do { - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seqcount); gtime = t->gtime; - if (t->flags & PF_VCPU) + if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) gtime += vtime_delta(t); - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while (read_seqcount_retry(&t->vtime_seqcount, seq)); return gtime; } @@ -818,7 +823,7 @@ fetch_task_cputime(struct task_struct *t, *udelta = 0; *sdelta = 0; - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seqcount); if (u_dst) *u_dst = *u_src; @@ -826,7 +831,7 @@ fetch_task_cputime(struct task_struct *t, *s_dst = *s_src; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_SLEEPING || + if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; @@ -842,7 +847,7 @@ fetch_task_cputime(struct task_struct *t, if (t->vtime_snap_whence == VTIME_SYS) *sdelta = delta; } - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } @@ -850,6 +855,14 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { cputime_t udelta, sdelta; + if (!vtime_accounting_enabled()) { + if (utime) + *utime = t->utime; + if (stime) + *stime = t->stime; + return; + } + fetch_task_cputime(t, utime, stime, &t->utime, &t->stime, &udelta, &sdelta); if (utime) @@ -863,6 +876,14 @@ void task_cputime_scaled(struct task_struct *t, { cputime_t udelta, sdelta; + if (!vtime_accounting_enabled()) { + if (utimescaled) + *utimescaled = t->utimescaled; + if (stimescaled) + *stimescaled = t->stimescaled; + return; + } + fetch_task_cputime(t, utimescaled, stimescaled, &t->utimescaled, &t->stimescaled, &udelta, &sdelta); if (utimescaled) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b0a15e285f9..cd64c979d0e1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -176,8 +176,10 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) } } - if (leftmost) + if (leftmost) { dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + dl_rq->earliest_dl.next = p->dl.deadline; + } rb_link_node(&p->pushable_dl_tasks, parent, link); rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); @@ -195,6 +197,10 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) next_node = rb_next(&p->pushable_dl_tasks); dl_rq->pushable_dl_tasks_leftmost = next_node; + if (next_node) { + dl_rq->earliest_dl.next = rb_entry(next_node, + struct task_struct, pushable_dl_tasks)->dl.deadline; + } } rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); @@ -782,42 +788,14 @@ static void update_curr_dl(struct rq *rq) #ifdef CONFIG_SMP -static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); - -static inline u64 next_deadline(struct rq *rq) -{ - struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); - - if (next && dl_prio(next->prio)) - return next->dl.deadline; - else - return 0; -} - static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) { struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { - /* - * If the dl_rq had no -deadline tasks, or if the new task - * has shorter deadline than the current one on dl_rq, we - * know that the previous earliest becomes our next earliest, - * as the new task becomes the earliest itself. - */ - dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); - } else if (dl_rq->earliest_dl.next == 0 || - dl_time_before(deadline, dl_rq->earliest_dl.next)) { - /* - * On the other hand, if the new -deadline task has a - * a later deadline than the earliest one on dl_rq, but - * it is earlier than the next (if any), we must - * recompute the next-earliest. - */ - dl_rq->earliest_dl.next = next_deadline(rq); } } @@ -839,7 +817,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; - dl_rq->earliest_dl.next = next_deadline(rq); cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); } } @@ -1274,28 +1251,6 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) return 0; } -/* Returns the second earliest -deadline task, NULL otherwise */ -static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) -{ - struct rb_node *next_node = rq->dl.rb_leftmost; - struct sched_dl_entity *dl_se; - struct task_struct *p = NULL; - -next_node: - next_node = rb_next(next_node); - if (next_node) { - dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); - p = dl_task_of(dl_se); - - if (pick_dl_task(rq, p, cpu)) - return p; - - goto next_node; - } - - return NULL; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f04fda8f669c..1926606ece80 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -17,7 +17,7 @@ * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include <linux/latencytop.h> @@ -738,12 +738,56 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } +#ifdef CONFIG_SCHEDSTATS +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 wait_start = rq_clock(rq_of(cfs_rq)); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > se->statistics.wait_start)) + wait_start -= se->statistics.wait_start; + + se->statistics.wait_start = wait_start; +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. + */ + se->statistics.wait_start = delta; + return; + } + trace_sched_stat_wait(p, delta); + } + + se->statistics.wait_max = max(se->statistics.wait_max, delta); + se->statistics.wait_count++; + se->statistics.wait_sum += delta; + se->statistics.wait_start = 0; +} +#else static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); } +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} +#endif + /* * Task is being enqueued - update stats: */ @@ -757,23 +801,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(cfs_rq, se); } -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2155,6 +2182,7 @@ void task_numa_work(struct callback_head *work) unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; + u64 runtime = p->se.sum_exec_runtime; struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2277,6 +2305,17 @@ out: else reset_ptenuma_scan(p); up_read(&mm->mmap_sem); + + /* + * Make sure tasks use at least 32x as much time to run other code + * than they used here, to limit NUMA PTE scanning overhead to 3% max. + * Usually update_task_scan_period slows down scanning enough; on an + * overloaded system we need to limit overhead on a per task basis. + */ + if (unlikely(p->se.sum_exec_runtime != runtime)) { + u64 diff = p->se.sum_exec_runtime - runtime; + p->node_stamp += 32 * diff; + } } /* @@ -2670,12 +2709,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. + */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, then its up to + * date and ready to go to new CPU/cfs_rq. But we have difficulty in + * getting what current time is, so simply throw away the out-of-date + * time. This will result in the wakee task is less decayed, but giving + * the wakee more load sounds not bad. + */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -2689,7 +2780,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) int decayed, removed = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); removed = 1; @@ -2809,48 +2900,48 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } -/* - * Task first catches up with cfs_rq, and then subtract - * itself from the cfs_rq (task must be off the queue now). - */ -void remove_entity_load_avg(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; - #ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ u64 last_update_time_copy; + u64 last_update_time; do { last_update_time_copy = cfs_rq->load_last_update_time_copy; smp_rmb(); last_update_time = cfs_rq->avg.last_update_time; } while (last_update_time != last_update_time_copy); -#else - last_update_time = cfs_rq->avg.last_update_time; -#endif - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); - atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); - atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); + return last_update_time; } - -/* - * Update the rq's load with the elapsed running time before entering - * idle. if the last scheduled task is not a CFS task, idle_enter will - * be the only way to update the runnable statistic. - */ -void idle_enter_fair(struct rq *this_rq) +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) { + return cfs_rq->avg.last_update_time; } +#endif /* - * Update the rq's load with the elapsed idle time before a task is - * scheduled. if the newly scheduled task is not a CFS task, idle_exit will - * be the only way to update the runnable statistic. + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). */ -void idle_exit_fair(struct rq *this_rq) +void remove_entity_load_avg(struct sched_entity *se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); + atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) @@ -4240,42 +4331,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ /* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * The exact cpuload calculated at every tick would be: + * + * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * If a cpu misses updates for n ticks (as it was idle) and update gets + * called on the n+1-th tick when cpu may be busy, then we have: + * + * load_n = (1 - 1/2^i)^n * load_0 + * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load * * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * load' = (1 - 1/2^i)^n * load + * + * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. + * This allows us to precompute the above in said factors, thereby allowing the + * reduction of an arbitrary n in O(log_2 n) steps. (See also + * fixed_power_int()) * * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. */ #define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; + +static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 64, 32, 8, 0, 0, 0, 0, 0 }, + { 96, 72, 40, 12, 1, 0, 0, 0 }, + { 112, 98, 75, 43, 15, 1, 0, 0 }, + { 120, 112, 98, 76, 45, 16, 2, 0 } +}; /* * Update cpu_load for any missed ticks, due to tickless idle. The backlog @@ -4306,14 +4392,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) return load; } -/* +/** + * __update_cpu_load - update the rq->cpu_load[] statistics + * @this_rq: The rq to update statistics for + * @this_load: The current load + * @pending_updates: The number of missed updates + * @active: !0 for NOHZ_FULL + * * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. + * scheduler tick (TICK_NSEC). + * + * This function computes a decaying average: + * + * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load + * + * Because of NOHZ it might not get called on every tick which gives need for + * the @pending_updates argument. + * + * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 + * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load + * = A * (A * load[i]_n-2 + B) + B + * = A * (A * (A * load[i]_n-3 + B) + B) + B + * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B + * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B + * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B + * = (1 - 1/2^i)^n * (load[i]_0 - load) + load + * + * In the above we've assumed load_n := load, which is true for NOHZ_FULL as + * any change in load would have resulted in the tick being turned back on. + * + * For regular NOHZ, this reduces to: + * + * load[i]_n = (1 - 1/2^i)^n * load[i]_0 + * + * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra + * term. See the @active paramter. */ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) + unsigned long pending_updates, int active) { + unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; int i, scale; this_rq->nr_load_updates++; @@ -4325,8 +4443,9 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ - old_load = this_rq->cpu_load[i]; + old_load = this_rq->cpu_load[i] - tickless_load; old_load = decay_load_missed(old_load, pending_updates - 1, i); + old_load += tickless_load; new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4381,16 +4500,17 @@ static void update_idle_cpu_load(struct rq *this_rq) pending_updates = curr_jiffies - this_rq->last_load_update_tick; this_rq->last_load_update_tick = curr_jiffies; - __update_cpu_load(this_rq, load, pending_updates); + __update_cpu_load(this_rq, load, pending_updates, 0); } /* * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. */ -void update_cpu_load_nohz(void) +void update_cpu_load_nohz(int active) { struct rq *this_rq = this_rq(); unsigned long curr_jiffies = READ_ONCE(jiffies); + unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; unsigned long pending_updates; if (curr_jiffies == this_rq->last_load_update_tick) @@ -4401,10 +4521,11 @@ void update_cpu_load_nohz(void) if (pending_updates) { this_rq->last_load_update_tick = curr_jiffies; /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. + * In the regular NOHZ case, we were idle, this means load 0. + * In the NOHZ_FULL case, we were non-idle, we should consider + * its weighted load. */ - __update_cpu_load(this_rq, 0, pending_updates); + __update_cpu_load(this_rq, load, pending_updates, active); } raw_spin_unlock(&this_rq->lock); } @@ -4420,7 +4541,7 @@ void update_cpu_load_active(struct rq *this_rq) * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); + __update_cpu_load(this_rq, load, 1, 1); } /* @@ -5007,8 +5128,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the - * previous cpu. However, the caller only guarantees p->pi_lock is held; no - * other assumptions, including the state of rq->lock, should be made. + * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ static void migrate_task_rq_fair(struct task_struct *p) { @@ -5721,8 +5841,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); set_task_cpu(p, env->dst_cpu); } @@ -5855,8 +5975,8 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6302,7 +6422,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *overload) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -6319,7 +6439,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -6327,7 +6448,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; } @@ -7248,8 +7372,6 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; - idle_enter_fair(this_rq); - /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -7330,10 +7452,8 @@ out: if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; - if (pulled_task) { - idle_exit_fair(this_rq); + if (pulled_task) this_rq->idle_stamp = 0; - } return pulled_task; } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c4ae0f1fdf9b..47ce94931f1b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { - idle_exit_fair(rq); rq_last_tick_reset(rq); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e3cc16312046..8ec86abe0ea1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -64,7 +64,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) static void push_irq_work_func(struct irq_work *work); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index efd3bfc7e347..10f16374df7f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -248,7 +248,12 @@ struct task_group { unsigned long shares; #ifdef CONFIG_SMP - atomic_long_t load_avg; + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. + */ + atomic_long_t load_avg ____cacheline_aligned; #endif #endif @@ -335,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -#endif + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -933,6 +946,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; #endif @@ -1073,7 +1087,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * We must ensure this doesn't happen until the switch is completely * finished. * - * Pairs with the control dependency and rmb in try_to_wake_up(). + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); #endif @@ -1110,46 +1127,8 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WEIGHT_IDLEPRIO 3 #define WMULT_IDLEPRIO 1431655765 -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. - * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. - * If a task goes up by ~10% and another task goes down by ~10% then - * the relative distance between them is ~25%.) - */ -static const int prio_to_weight[40] = { - /* -20 */ 88761, 71755, 56483, 46273, 36291, - /* -15 */ 29154, 23254, 18705, 14949, 11916, - /* -10 */ 9548, 7620, 6100, 4904, 3906, - /* -5 */ 3121, 2501, 1991, 1586, 1277, - /* 0 */ 1024, 820, 655, 526, 423, - /* 5 */ 335, 272, 215, 172, 137, - /* 10 */ 110, 87, 70, 56, 45, - /* 15 */ 36, 29, 23, 18, 15, -}; - -/* - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. - * - * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions - * into multiplications: - */ -static const u32 prio_to_wmult[40] = { - /* -20 */ 48388, 59856, 76040, 92818, 118348, - /* -15 */ 147320, 184698, 229616, 287308, 360437, - /* -10 */ 449829, 563644, 704093, 875809, 1099582, - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, -}; +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_HEAD 0x02 @@ -1249,16 +1228,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_enter_fair(struct rq *this_rq); -extern void idle_exit_fair(struct rq *this_rq); - extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -#else - -static inline void idle_enter_fair(struct rq *rq) { } -static inline void idle_exit_fair(struct rq *rq) { } - #endif #ifdef CONFIG_CPU_IDLE diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 052e02672d12..f15d6b6a538a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -392,7 +392,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key); + ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -431,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(&q->key); + ret = action(&q->key, mode); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -581,44 +581,44 @@ void wake_up_atomic_t(atomic_t *p) } EXPORT_SYMBOL(wake_up_atomic_t); -__sched int bit_wait(struct wait_bit_key *word) +__sched int bit_wait(struct wait_bit_key *word, int mode) { - if (signal_pending_state(current->state, current)) - return 1; schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait); -__sched int bit_wait_io(struct wait_bit_key *word) +__sched int bit_wait_io(struct wait_bit_key *word, int mode) { - if (signal_pending_state(current->state, current)) - return 1; io_schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait_io); -__sched int bit_wait_timeout(struct wait_bit_key *word) +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); - if (signal_pending_state(current->state, current)) - return 1; if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word) +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); - if (signal_pending_state(current->state, current)) - return 1; if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_io_timeout); |