diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/completion.c | 5 | ||||
-rw-r--r-- | kernel/sched/core.c | 98 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 95 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 143 | ||||
-rw-r--r-- | kernel/sched/fair.c | 145 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 2 | ||||
-rw-r--r-- | kernel/sched/rt.c | 10 | ||||
-rw-r--r-- | kernel/sched/sched.h | 112 | ||||
-rw-r--r-- | kernel/sched/wait.c | 2 |
9 files changed, 406 insertions, 206 deletions
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 2ddaec40956f..0926aef10dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -34,11 +34,6 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); - /* - * Perform commit of crossrelease here. - */ - complete_release_commit(x); - if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..3da7a2444a91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -508,7 +508,8 @@ void resched_cpu(int cpu) unsigned long flags; raw_spin_lock_irqsave(&rq->lock, flags); - resched_curr(rq); + if (cpu_online(cpu) || cpu == smp_processor_id()) + resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -2045,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. * - * Pairs with the smp_store_release() in finish_lock_switch(). + * Pairs with the smp_store_release() in finish_task(). * * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. @@ -2056,7 +2057,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->state = TASK_WAKING; if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2069,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #else /* CONFIG_SMP */ if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2122,7 +2123,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) if (!task_on_rq_queued(p)) { if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&rq->nr_iowait); } ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); @@ -2571,6 +2572,50 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, #endif /* CONFIG_PREEMPT_NOTIFIERS */ +static inline void prepare_task(struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * Claim the task as running, we do this before switching to it + * such that any running task will have this set. + */ + next->on_cpu = 1; +#endif +} + +static inline void finish_task(struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). + */ + smp_store_release(&prev->on_cpu, 0); +#endif +} + +static inline void finish_lock_switch(struct rq *rq) +{ +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + raw_spin_unlock_irq(&rq->lock); +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -2591,7 +2636,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); + prepare_task(next); prepare_arch_switch(next); } @@ -2646,7 +2691,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) * the scheduled task must drop that reference. * * We must observe prev->state before clearing prev->on_cpu (in - * finish_lock_switch), otherwise a concurrent wakeup can get prev + * finish_task), otherwise a concurrent wakeup can get prev * running on another CPU and we could rave with its RUNNING -> DEAD * transition, resulting in a double drop. */ @@ -2663,7 +2708,8 @@ static struct rq *finish_task_switch(struct task_struct *prev) * to use. */ smp_mb__after_unlock_lock(); - finish_lock_switch(rq, prev); + finish_task(prev); + finish_lock_switch(rq); finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); @@ -4040,8 +4086,7 @@ recheck: return -EINVAL; } - if (attr->sched_flags & - ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) + if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) return -EINVAL; /* @@ -4108,6 +4153,9 @@ recheck: } if (user) { + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return -EINVAL; + retval = security_task_setscheduler(p); if (retval) return retval; @@ -4163,7 +4211,8 @@ change: } #endif #ifdef CONFIG_SMP - if (dl_bandwidth_enabled() && dl_policy(policy)) { + if (dl_bandwidth_enabled() && dl_policy(policy) && + !(attr->sched_flags & SCHED_FLAG_SUGOV)) { cpumask_t *span = rq->rd->span; /* @@ -4293,6 +4342,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr) } EXPORT_SYMBOL_GPL(sched_setattr); +int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, false, true); +} + /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * @p: the task in question. @@ -5097,17 +5151,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) return ret; } -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; @@ -5144,6 +5187,17 @@ out_unlock: return retval; } +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. + */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2f52ec0f1539..dd062a1c8cf0 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -60,7 +60,8 @@ struct sugov_cpu { u64 last_update; /* The fields below are only needed when sharing a policy. */ - unsigned long util; + unsigned long util_cfs; + unsigned long util_dl; unsigned long max; unsigned int flags; @@ -176,21 +177,28 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu) { - struct rq *rq = cpu_rq(cpu); - unsigned long cfs_max; + struct rq *rq = cpu_rq(sg_cpu->cpu); - cfs_max = arch_scale_cpu_capacity(NULL, cpu); + sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + sg_cpu->util_cfs = cpu_util_cfs(rq); + sg_cpu->util_dl = cpu_util_dl(rq); +} - *util = min(rq->cfs.avg.util_avg, cfs_max); - *max = cfs_max; +static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) +{ + /* + * Ideally we would like to set util_dl as min/guaranteed freq and + * util_cfs + util_dl as requested freq. However, cpufreq is not yet + * ready for such an interface. So, we only do the latter for now. + */ + return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, - unsigned int flags) +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) { - if (flags & SCHED_CPUFREQ_IOWAIT) { + if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -244,7 +252,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, #ifdef CONFIG_NO_HZ_COMMON static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls(); + unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); bool ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; @@ -264,7 +272,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_set_iowait_boost(sg_cpu, time); sg_cpu->last_update = time; if (!sugov_should_update_freq(sg_policy, time)) @@ -272,10 +280,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, busy = sugov_cpu_is_busy(sg_cpu); - if (flags & SCHED_CPUFREQ_RT_DL) { + if (flags & SCHED_CPUFREQ_RT) { next_f = policy->cpuinfo.max_freq; } else { - sugov_get_util(&util, &max, sg_cpu->cpu); + sugov_get_util(sg_cpu); + max = sg_cpu->max; + util = sugov_aggregate_util(sg_cpu); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -305,23 +315,27 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) s64 delta_ns; /* - * If the CPU utilization was last updated before the previous - * frequency update and the time elapsed between the last update - * of the CPU utilization and the last frequency update is long - * enough, don't take the CPU into account as it probably is - * idle now (and clear iowait_boost for it). + * If the CFS CPU utilization was last updated before the + * previous frequency update and the time elapsed between the + * last update of the CPU utilization and the last frequency + * update is long enough, reset iowait_boost and util_cfs, as + * they are now probably stale. However, still consider the + * CPU contribution if it has some DEADLINE utilization + * (util_dl). */ delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost_pending = false; - continue; + j_sg_cpu->util_cfs = 0; + if (j_sg_cpu->util_dl == 0) + continue; } - if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) + if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) return policy->cpuinfo.max_freq; - j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; + j_util = sugov_aggregate_util(j_sg_cpu); if (j_util * max > j_max * util) { util = j_util; max = j_max; @@ -338,22 +352,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned long util, max; unsigned int next_f; - sugov_get_util(&util, &max, sg_cpu->cpu); - raw_spin_lock(&sg_policy->update_lock); - sg_cpu->util = util; - sg_cpu->max = max; + sugov_get_util(sg_cpu); sg_cpu->flags = flags; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_set_iowait_boost(sg_cpu, time); sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - if (flags & SCHED_CPUFREQ_RT_DL) + if (flags & SCHED_CPUFREQ_RT) next_f = sg_policy->policy->cpuinfo.max_freq; else next_f = sugov_next_freq_shared(sg_cpu, time); @@ -383,9 +393,9 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); /* - * For RT and deadline tasks, the schedutil governor shoots the - * frequency to maximum. Special care must be taken to ensure that this - * kthread doesn't result in the same behavior. + * For RT tasks, the schedutil governor shoots the frequency to maximum. + * Special care must be taken to ensure that this kthread doesn't result + * in the same behavior. * * This is (mostly) guaranteed by the work_in_progress flag. The flag is * updated only at the end of the sugov_work() function and before that @@ -470,7 +480,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy) static int sugov_kthread_create(struct sugov_policy *sg_policy) { struct task_struct *thread; - struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, + /* + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ + .sched_runtime = 1000000, + .sched_deadline = 10000000, + .sched_period = 10000000, + }; struct cpufreq_policy *policy = sg_policy->policy; int ret; @@ -488,10 +511,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) return PTR_ERR(thread); } - ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + ret = sched_setattr_nocheck(thread, &attr); if (ret) { kthread_stop(thread); - pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); return ret; } @@ -655,7 +678,7 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->flags = 0; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2473736c7616..9bb0e0c412ec 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i) #endif static inline -void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -86,10 +86,12 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); } static inline -void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -98,10 +100,12 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) dl_rq->running_bw = 0; + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); } static inline -void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; @@ -111,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) } static inline -void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; @@ -123,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); } +static inline +void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __add_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __sub_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __add_running_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __sub_running_bw(dl_se->dl_bw, dl_rq); +} + void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; + BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); + if (task_on_rq_queued(p)) return; rq = task_rq(p); if (p->dl.dl_non_contending) { - sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the @@ -144,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); } - sub_rq_bw(p->dl.dl_bw, &rq->dl); - add_rq_bw(new_bw, &rq->dl); + __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __add_rq_bw(new_bw, &rq->dl); } /* @@ -217,6 +251,9 @@ static void task_non_contending(struct task_struct *p) if (dl_se->dl_runtime == 0) return; + if (dl_entity_is_special(dl_se)) + return; + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); @@ -236,12 +273,12 @@ static void task_non_contending(struct task_struct *p) */ if (zerolag_time < 0) { if (dl_task(p)) - sub_running_bw(dl_se->dl_bw, dl_rq); + sub_running_bw(dl_se, dl_rq); if (!dl_task(p) || p->state == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (p->state == TASK_DEAD) - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); @@ -268,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) return; if (flags & ENQUEUE_MIGRATED) - add_rq_bw(dl_se->dl_bw, dl_rq); + add_rq_bw(dl_se, dl_rq); if (dl_se->dl_non_contending) { dl_se->dl_non_contending = 0; @@ -289,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * when the "inactive timer" fired). * So, add it back. */ - add_running_bw(dl_se->dl_bw, dl_rq); + add_running_bw(dl_se, dl_rq); } } @@ -1114,7 +1151,8 @@ static void update_curr_dl(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_dl_entity *dl_se = &curr->dl; - u64 delta_exec; + u64 delta_exec, scaled_delta_exec; + int cpu = cpu_of(rq); if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1134,9 +1172,6 @@ static void update_curr_dl(struct rq *rq) return; } - /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, SCHED_CPUFREQ_DL); - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1148,13 +1183,39 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) - delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); - dl_se->runtime -= delta_exec; + if (dl_entity_is_special(dl_se)) + return; + + /* + * For tasks that participate in GRUB, we implement GRUB-PA: the + * spare reclaimed bandwidth is used to clock down frequency. + * + * For the others, we still need to scale reservation parameters + * according to current frequency and CPU maximum capacity. + */ + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { + scaled_delta_exec = grub_reclaim(delta_exec, + rq, + &curr->dl); + } else { + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + scaled_delta_exec = cap_scale(delta_exec, scale_freq); + scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); + } + + dl_se->runtime -= scaled_delta_exec; throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; + + /* If requested, inform the user about runtime overruns. */ + if (dl_runtime_exceeded(dl_se) && + (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) + dl_se->dl_overrun = 1; + __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); @@ -1204,8 +1265,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (p->state == TASK_DEAD && dl_se->dl_non_contending) { - sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); - sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + sub_running_bw(&p->dl, dl_rq_of_se(&p->dl)); + sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; } @@ -1222,7 +1283,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); - sub_running_bw(dl_se->dl_bw, &rq->dl); + sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: task_rq_unlock(rq, p, &rf); @@ -1416,8 +1477,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) dl_check_constrained_dl(&p->dl); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(p->dl.dl_bw, &rq->dl); - add_running_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(&p->dl, &rq->dl); + add_running_bw(&p->dl, &rq->dl); } /* @@ -1457,8 +1518,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) __dequeue_task_dl(rq, p, flags); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(p->dl.dl_bw, &rq->dl); - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); } /* @@ -1564,7 +1625,7 @@ static void migrate_task_rq_dl(struct task_struct *p) */ raw_spin_lock(&rq->lock); if (p->dl.dl_non_contending) { - sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the @@ -1576,7 +1637,7 @@ static void migrate_task_rq_dl(struct task_struct *p) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); } - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); raw_spin_unlock(&rq->lock); } @@ -2019,11 +2080,11 @@ retry: } deactivate_task(rq, next_task, 0); - sub_running_bw(next_task->dl.dl_bw, &rq->dl); - sub_rq_bw(next_task->dl.dl_bw, &rq->dl); + sub_running_bw(&next_task->dl, &rq->dl); + sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); - add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); - add_running_bw(next_task->dl.dl_bw, &later_rq->dl); + add_rq_bw(&next_task->dl, &later_rq->dl); + add_running_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -2111,11 +2172,11 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); - sub_running_bw(p->dl.dl_bw, &src_rq->dl); - sub_rq_bw(p->dl.dl_bw, &src_rq->dl); + sub_running_bw(&p->dl, &src_rq->dl); + sub_rq_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); - add_rq_bw(p->dl.dl_bw, &this_rq->dl); - add_running_bw(p->dl.dl_bw, &this_rq->dl); + add_rq_bw(&p->dl, &this_rq->dl); + add_running_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -2224,7 +2285,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) task_non_contending(p); if (!task_on_rq_queued(p)) - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); /* * We cannot use inactive_task_timer() to invoke sub_running_bw() @@ -2256,7 +2317,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) { - add_rq_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(&p->dl, &rq->dl); return; } @@ -2435,6 +2496,9 @@ int sched_dl_overflow(struct task_struct *p, int policy, u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; int cpus, err = -1; + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return 0; + /* !deadline task may carry old deadline bandwidth */ if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; @@ -2521,6 +2585,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + /* special dl tasks don't actually use any parameter */ + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return true; + /* deadline != 0 */ if (attr->sched_deadline == 0) return false; @@ -2566,6 +2634,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; + dl_se->dl_overrun = 0; } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4037e19bbca2..7b6535987500 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. + * a real problem. * * It will not get called when we go idle, because the idle * thread is a different class (!fair), nor will the utilization @@ -3091,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 @@ -3122,7 +3118,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; - scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_freq = arch_scale_freq_capacity(cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); delta += sa->period_contrib; @@ -3413,9 +3409,9 @@ void set_task_rq_fair(struct sched_entity *se, * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * - * - * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and - * simply copies the running sum over. + * Per the above update_tg_cfs_util() is trivial and simply copies the running + * sum over (but still wrong, because the group entity and group rq do not have + * their PELT windows aligned). * * However, update_tg_cfs_runnable() is more complex. So we have: * @@ -3424,11 +3420,11 @@ void set_task_rq_fair(struct sched_entity *se, * And since, like util, the runnable part should be directly transferable, * the following would _appear_ to be the straight forward approach: * - * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3) + * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3) * * And per (1) we have: * - * ge->avg.running_avg == grq->avg.running_avg + * ge->avg.runnable_avg == grq->avg.runnable_avg * * Which gives: * @@ -3447,27 +3443,28 @@ void set_task_rq_fair(struct sched_entity *se, * to (shortly) return to us. This only works by keeping the weights as * integral part of the sum. We therefore cannot decompose as per (3). * - * OK, so what then? + * Another reason this doesn't work is that runnable isn't a 0-sum entity. + * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the + * rq itself is runnable anywhere between 2/3 and 1 depending on how the + * runnable section of these tasks overlap (or not). If they were to perfectly + * align the rq as a whole would be runnable 2/3 of the time. If however we + * always have at least 1 runnable task, the rq as a whole is always runnable. * + * So we'll have to approximate.. :/ * - * Another way to look at things is: + * Given the constraint: * - * grq->avg.load_avg = \Sum se->avg.load_avg + * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX * - * Therefore, per (2): + * We can construct a rule that adds runnable to a rq by assuming minimal + * overlap. * - * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg + * On removal, we'll assume each task is equally runnable; which yields: * - * And the very thing we're propagating is a change in that sum (someone - * joined/left). So we can easily know the runnable change, which would be, per - * (2) the already tracked se->load_avg divided by the corresponding - * se->weight. + * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight * - * Basically (4) but in differential form: + * XXX: only do this for the part of runnable > running ? * - * d(runnable_avg) += se->avg.load_avg / se->load.weight - * (5) - * ge->avg.load_avg += ge->load.weight * d(runnable_avg) */ static inline void @@ -3479,6 +3476,14 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq if (!delta) return; + /* + * The relation between sum and avg is: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * however, the PELT windows are not aligned between grq and gse. + */ + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; @@ -3491,33 +3496,68 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long runnable_sum = gcfs_rq->prop_runnable_sum; - long runnable_load_avg, load_avg; - s64 runnable_load_sum, load_sum; + long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + unsigned long runnable_load_avg, load_avg; + u64 runnable_load_sum, load_sum = 0; + s64 delta_sum; if (!runnable_sum) return; gcfs_rq->prop_runnable_sum = 0; + if (runnable_sum >= 0) { + /* + * Add runnable; clip at LOAD_AVG_MAX. Reflects that until + * the CPU is saturated running == runnable. + */ + runnable_sum += se->avg.load_sum; + runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); + } else { + /* + * Estimate the new unweighted runnable_sum of the gcfs_rq by + * assuming all tasks are equally runnable. + */ + if (scale_load_down(gcfs_rq->load.weight)) { + load_sum = div_s64(gcfs_rq->avg.load_sum, + scale_load_down(gcfs_rq->load.weight)); + } + + /* But make sure to not inflate se's runnable */ + runnable_sum = min(se->avg.load_sum, load_sum); + } + + /* + * runnable_sum can't be lower than running_sum + * As running sum is scale with cpu capacity wehreas the runnable sum + * is not we rescale running_sum 1st + */ + running_sum = se->avg.util_sum / + arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + runnable_sum = max(runnable_sum, running_sum); + load_sum = (s64)se_weight(se) * runnable_sum; load_avg = div_s64(load_sum, LOAD_AVG_MAX); - add_positive(&se->avg.load_sum, runnable_sum); - add_positive(&se->avg.load_avg, load_avg); + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; + delta_avg = load_avg - se->avg.load_avg; - add_positive(&cfs_rq->avg.load_avg, load_avg); - add_positive(&cfs_rq->avg.load_sum, load_sum); + se->avg.load_sum = runnable_sum; + se->avg.load_avg = load_avg; + add_positive(&cfs_rq->avg.load_avg, delta_avg); + add_positive(&cfs_rq->avg.load_sum, delta_sum); runnable_load_sum = (s64)se_runnable(se) * runnable_sum; runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); + delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; + delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - add_positive(&se->avg.runnable_load_sum, runnable_sum); - add_positive(&se->avg.runnable_load_avg, runnable_load_avg); + se->avg.runnable_load_sum = runnable_sum; + se->avg.runnable_load_avg = runnable_load_avg; if (se->on_rq) { - add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); - add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); + add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); } } @@ -4321,12 +4361,12 @@ static inline bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) { - static_key_slow_inc(&__cfs_bandwidth_used); + static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used); } void cfs_bandwidth_usage_dec(void) { - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -5645,8 +5685,8 @@ static int wake_wide(struct task_struct *p) * soonest. For the purpose of speed we only consider the waking and previous * CPU. * - * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or - * will be) idle. + * wake_affine_idle() - only considers 'now', it check if the waking CPU is + * cache-affine and is (or will be) idle. * * wake_affine_weight() - considers the weight to reflect the average * scheduling latency of the CPUs. This seems to work @@ -5657,7 +5697,13 @@ static bool wake_affine_idle(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { - if (idle_cpu(this_cpu)) + /* + * If this_cpu is idle, it implies the wakeup is from interrupt + * context. Only allow the move if cache is shared. Otherwise an + * interrupt intensive workload could force all tasks onto one + * node depending on the IO topology or IRQ affinity settings. + */ + if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) return true; if (sync && cpu_rq(this_cpu)->nr_running == 1) @@ -5721,12 +5767,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return affine; } -static inline int task_util(struct task_struct *p); -static int cpu_util_wake(int cpu, struct task_struct *p); +static inline unsigned long task_util(struct task_struct *p); +static unsigned long cpu_util_wake(int cpu, struct task_struct *p); static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) { - return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); + return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0); } /* @@ -5906,7 +5952,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(cpu_rq(i)); - if (load < min_load || (load == min_load && i == this_cpu)) { + if (load < min_load) { min_load = load; least_loaded_cpu = i; } @@ -6203,7 +6249,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). */ -static int cpu_util(int cpu) +static unsigned long cpu_util(int cpu) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); @@ -6211,7 +6257,7 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } -static inline int task_util(struct task_struct *p) +static inline unsigned long task_util(struct task_struct *p) { return p->se.avg.util_avg; } @@ -6220,7 +6266,7 @@ static inline int task_util(struct task_struct *p) * cpu_util_wake: Compute cpu utilization with any contributions from * the waking task p removed. */ -static int cpu_util_wake(int cpu, struct task_struct *p) +static unsigned long cpu_util_wake(int cpu, struct task_struct *p) { unsigned long util, capacity; @@ -6405,8 +6451,7 @@ static void task_dead_fair(struct task_struct *p) } #endif /* CONFIG_SMP */ -static unsigned long -wakeup_gran(struct sched_entity *curr, struct sched_entity *se) +static unsigned long wakeup_gran(struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; @@ -6448,7 +6493,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) if (vdiff <= 0) return -1; - gran = wakeup_gran(curr, se); + gran = wakeup_gran(se); if (vdiff > gran) return 1; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index dd7908743dab..9bcbacba82a8 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -89,7 +89,9 @@ static int membarrier_private_expedited(void) rcu_read_unlock(); } if (!fallback) { + preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); free_cpumask_var(tmpmask); } cpus_read_unlock(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4056c19ca3f0..862a513adca3 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq) bool resched = false; struct task_struct *p; struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); - if (likely(!rt_overloaded(this_rq))) + if (likely(!rt_overload_count)) return; /* @@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq) */ smp_rmb(); + /* If we are the only overloaded CPU do nothing */ + if (rt_overload_count == 1 && + cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) + return; + #ifdef HAVE_RT_PUSH_IPI if (sched_feat(RT_PUSH_IPI)) { tell_cpu_to_push(this_rq); @@ -2206,7 +2212,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < rq->curr->prio) + if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b19552a212de..2e95505e23c6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -156,13 +156,39 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * !! For sched_setattr_nocheck() (kernel) only !! + * + * This is actually gross. :( + * + * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE + * tasks, but still be able to sleep. We need this on platforms that cannot + * atomically change clock frequency. Remove once fast switching will be + * available on such platforms. + * + * SUGOV stands for SchedUtil GOVernor. + */ +#define SCHED_FLAG_SUGOV 0x10000000 + +static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +{ +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); +#else + return false; +#endif +} + /* * Tells if entity @a should preempt entity @b. */ static inline bool dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) { - return dl_time_before(a->deadline, b->deadline); + return dl_entity_is_special(a) || + dl_time_before(a->deadline, b->deadline); } /* @@ -1328,47 +1354,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) # define finish_arch_post_lock_switch() do { } while (0) #endif -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - * - * In particular, the load of prev->state in finish_task_switch() must - * happen before this. - * - * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). - */ - smp_store_release(&prev->on_cpu, 0); -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - - raw_spin_unlock_irq(&rq->lock); -} - /* * wake flags */ @@ -1687,17 +1672,17 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP -extern void sched_avg_update(struct rq *rq); - #ifndef arch_scale_freq_capacity static __always_inline -unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +unsigned long arch_scale_freq_capacity(int cpu) { return SCHED_CAPACITY_SCALE; } #endif +#ifdef CONFIG_SMP +extern void sched_avg_update(struct rq *rq); + #ifndef arch_scale_cpu_capacity static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) @@ -1711,10 +1696,17 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); + rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq)); sched_avg_update(rq); } #else +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif @@ -2096,14 +2088,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); * The way cpufreq is currently arranged requires it to evaluate the CPU * performance state (frequency/voltage) on a regular basis to prevent it from * being stuck in a completely inadequate performance level for too long. - * That is not guaranteed to happen if the updates are only triggered from CFS, - * though, because they may not be coming in if RT or deadline tasks are active - * all the time (or there are RT and DL tasks only). + * That is not guaranteed to happen if the updates are only triggered from CFS + * and DL, though, because they may not be coming in if only RT tasks are + * active all the time (or there are RT tasks only). * - * As a workaround for that issue, this function is called by the RT and DL - * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * As a workaround for that issue, this function is called periodically by the + * RT sched class to trigger extra cpufreq updates to prevent it from stalling, * but that really is a band-aid. Going forward it should be replaced with - * solutions targeted more specifically at RT and DL tasks. + * solutions targeted more specifically at RT tasks. */ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { @@ -2125,3 +2117,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #else /* arch_scale_freq_capacity */ #define arch_scale_freq_invariant() (false) #endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_cfs(struct rq *rq) +{ + return rq->cfs.avg.util_avg; +} + +#endif diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 98feab7933c7..929ecb7d6b78 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - __add_wait_queue_entry_tail(wq_head, wq_entry); + __add_wait_queue(wq_head, wq_entry); spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue); |