diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 703 |
1 files changed, 566 insertions, 137 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20ffcc044134..c4462c454ab9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -237,9 +237,30 @@ static DEFINE_MUTEX(sched_core_mutex); static atomic_t sched_core_count; static struct cpumask sched_core_mask; +static void sched_core_lock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t, i = 0; + + local_irq_save(*flags); + for_each_cpu(t, smt_mask) + raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); +} + +static void sched_core_unlock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t; + + for_each_cpu(t, smt_mask) + raw_spin_unlock(&cpu_rq(t)->__lock); + local_irq_restore(*flags); +} + static void __sched_core_flip(bool enabled) { - int cpu, t, i; + unsigned long flags; + int cpu, t; cpus_read_lock(); @@ -250,19 +271,12 @@ static void __sched_core_flip(bool enabled) for_each_cpu(cpu, &sched_core_mask) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); - i = 0; - local_irq_disable(); - for_each_cpu(t, smt_mask) { - /* supports up to SMT8 */ - raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); - } + sched_core_lock(cpu, &flags); for_each_cpu(t, smt_mask) cpu_rq(t)->core_enabled = enabled; - for_each_cpu(t, smt_mask) - raw_spin_unlock(&cpu_rq(t)->__lock); - local_irq_enable(); + sched_core_unlock(cpu, &flags); cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); } @@ -993,6 +1007,7 @@ int get_nohz_timer_target(void) { int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; + const struct cpumask *hk_mask; if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { if (!idle_cpu(cpu)) @@ -1000,10 +1015,11 @@ int get_nohz_timer_target(void) default_cpu = cpu; } + hk_mask = housekeeping_cpumask(HK_FLAG_TIMER); + rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu_and(i, sched_domain_span(sd), - housekeeping_cpumask(HK_FLAG_TIMER)) { + for_each_cpu_and(i, sched_domain_span(sd), hk_mask) { if (cpu == i) continue; @@ -1619,6 +1635,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (!p->uclamp[clamp_id].active) + return; + + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + + /* + * Make sure to clear the idle flag if we've transiently reached 0 + * active tasks on rq. + */ + if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} + static inline void uclamp_update_active(struct task_struct *p) { @@ -1642,12 +1675,8 @@ uclamp_update_active(struct task_struct *p) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - for_each_clamp_id(clamp_id) { - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - } - } + for_each_clamp_id(clamp_id) + uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); } @@ -2161,7 +2190,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) /* Non kernel threads are not allowed during either online or offline. */ if (!(p->flags & PF_KTHREAD)) - return cpu_active(cpu); + return cpu_active(cpu) && task_cpu_possible(cpu, p); /* KTHREAD_IS_PER_CPU is always allowed. */ if (kthread_is_per_cpu(p)) @@ -2468,6 +2497,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) __do_set_cpus_allowed(p, new_mask, 0); } +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, + int node) +{ + if (!src->user_cpus_ptr) + return 0; + + dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); + if (!dst->user_cpus_ptr) + return -ENOMEM; + + cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + return 0; +} + +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = NULL; + + swap(p->user_cpus_ptr, user_mask); + + return user_mask; +} + +void release_user_cpus_ptr(struct task_struct *p) +{ + kfree(clear_user_cpus_ptr(p)); +} + /* * This function is wildly self concurrent; here be dragons. * @@ -2685,28 +2742,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } /* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. + * Called with both p->pi_lock and rq->lock held; drops both before returning. */ -static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags) +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags, + struct rq *rq, + struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) { + const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; + bool kthread = p->flags & PF_KTHREAD; + struct cpumask *user_mask = NULL; unsigned int dest_cpu; - struct rq_flags rf; - struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { + if (kthread || is_migration_disabled(p)) { /* * Kernel threads are allowed on online && !active CPUs, * however, during cpu-hot-unplug, even these might get pushed @@ -2720,6 +2775,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, cpu_valid_mask = cpu_online_mask; } + if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { + ret = -EINVAL; + goto out; + } + /* * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. @@ -2754,20 +2814,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, __do_set_cpus_allowed(p, new_mask, flags); - return affine_move_task(rq, p, &rf, dest_cpu, flags); + if (flags & SCA_USER) + user_mask = clear_user_cpus_ptr(p); + + ret = affine_move_task(rq, p, rf, dest_cpu, flags); + + kfree(user_mask); + + return ret; out: - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, rf); return ret; } +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, u32 flags) +{ + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf); +} + int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { return __set_cpus_allowed_ptr(p, new_mask, 0); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +/* + * Change a given task's CPU affinity to the intersection of its current + * affinity mask and @subset_mask, writing the resulting mask to @new_mask + * and pointing @p->user_cpus_ptr to a copy of the old mask. + * If the resulting mask is empty, leave the affinity unchanged and return + * -EINVAL. + */ +static int restrict_cpus_allowed_ptr(struct task_struct *p, + struct cpumask *new_mask, + const struct cpumask *subset_mask) +{ + struct cpumask *user_mask = NULL; + struct rq_flags rf; + struct rq *rq; + int err; + + if (!p->user_cpus_ptr) { + user_mask = kmalloc(cpumask_size(), GFP_KERNEL); + if (!user_mask) + return -ENOMEM; + } + + rq = task_rq_lock(p, &rf); + + /* + * Forcefully restricting the affinity of a deadline task is + * likely to cause problems, so fail and noisily override the + * mask entirely. + */ + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + err = -EPERM; + goto err_unlock; + } + + if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { + err = -EINVAL; + goto err_unlock; + } + + /* + * We're about to butcher the task affinity, so keep track of what + * the user asked for in case we're able to restore it later on. + */ + if (user_mask) { + cpumask_copy(user_mask, p->cpus_ptr); + p->user_cpus_ptr = user_mask; + } + + return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf); + +err_unlock: + task_rq_unlock(rq, p, &rf); + kfree(user_mask); + return err; +} + +/* + * Restrict the CPU affinity of task @p so that it is a subset of + * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the + * old affinity mask. If the resulting mask is empty, we warn and walk + * up the cpuset hierarchy until we find a suitable mask. + */ +void force_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + cpumask_var_t new_mask; + const struct cpumask *override_mask = task_cpu_possible_mask(p); + + alloc_cpumask_var(&new_mask, GFP_KERNEL); + + /* + * __migrate_task() can fail silently in the face of concurrent + * offlining of the chosen destination CPU, so take the hotplug + * lock to ensure that the migration succeeds. + */ + cpus_read_lock(); + if (!cpumask_available(new_mask)) + goto out_set_mask; + + if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) + goto out_free_mask; + + /* + * We failed to find a valid subset of the affinity mask for the + * task, so override it based on its cpuset hierarchy. + */ + cpuset_cpus_allowed(p, new_mask); + override_mask = new_mask; + +out_set_mask: + if (printk_ratelimit()) { + printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", + task_pid_nr(p), p->comm, + cpumask_pr_args(override_mask)); + } + + WARN_ON(set_cpus_allowed_ptr(p, override_mask)); +out_free_mask: + cpus_read_unlock(); + free_cpumask_var(new_mask); +} + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); + +/* + * Restore the affinity of a task @p which was previously restricted by a + * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) + * @p->user_cpus_ptr. + * + * It is the caller's responsibility to serialise this with any calls to + * force_compatible_cpus_allowed_ptr(@p). + */ +void relax_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = p->user_cpus_ptr; + unsigned long flags; + + /* + * Try to restore the old affinity mask. If this fails, then + * we free the mask explicitly to avoid it being inherited across + * a subsequent fork(). + */ + if (!user_mask || !__sched_setaffinity(p, user_mask)) + return; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + user_mask = clear_user_cpus_ptr(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + kfree(user_mask); +} + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -3112,9 +3330,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) + if (is_cpu_allowed(p, dest_cpu)) return dest_cpu; } } @@ -3131,8 +3347,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - if (IS_ENABLED(CONFIG_CPUSETS)) { - cpuset_cpus_allowed_fallback(p); + if (cpuset_cpus_allowed_fallback(p)) { state = possible; break; } @@ -3144,10 +3359,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * * More yuck to audit. */ - do_set_cpus_allowed(p, cpu_possible_mask); + do_set_cpus_allowed(p, task_cpu_possible_mask(p)); state = fail; break; - case fail: BUG(); break; @@ -3562,6 +3776,55 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) } /* + * Invoked from try_to_wake_up() to check whether the task can be woken up. + * + * The caller holds p::pi_lock if p != current or has preemption + * disabled when p == current. + * + * The rules of PREEMPT_RT saved_state: + * + * The related locking code always holds p::pi_lock when updating + * p::saved_state, which means the code is fully serialized in both cases. + * + * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other + * bits set. This allows to distinguish all wakeup scenarios. + */ +static __always_inline +bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) +{ + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { + WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && + state != TASK_RTLOCK_WAIT); + } + + if (READ_ONCE(p->__state) & state) { + *success = 1; + return true; + } + +#ifdef CONFIG_PREEMPT_RT + /* + * Saved state preserves the task state across blocking on + * an RT lock. If the state matches, set p::saved_state to + * TASK_RUNNING, but do not wake the task because it waits + * for a lock wakeup. Also indicate success because from + * the regular waker's point of view this has succeeded. + * + * After acquiring the lock the task will restore p::__state + * from p::saved_state which ensures that the regular + * wakeup is not lost. The restore will also set + * p::saved_state to TASK_RUNNING so any further tests will + * not result in false positives vs. @success + */ + if (p->saved_state & state) { + p->saved_state = TASK_RUNNING; + *success = 1; + } +#endif + return false; +} + +/* * Notes on Program-Order guarantees on SMP systems. * * MIGRATION @@ -3700,10 +3963,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - if (!(READ_ONCE(p->__state) & state)) + if (!ttwu_state_match(p, state, &success)) goto out; - success = 1; trace_sched_waking(p); WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); @@ -3718,14 +3980,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); - if (!(READ_ONCE(p->__state) & state)) + if (!ttwu_state_match(p, state, &success)) goto unlock; trace_sched_waking(p); - /* We're going to change ->state: */ - success = 1; - /* * Ensure we load p->on_rq _after_ p->state, otherwise it would * be possible to, falsely, observe p->on_rq == 0 and get stuck @@ -5660,11 +5919,9 @@ static bool try_steal_cookie(int this, int that) if (p->core_occupation > dst->idle->core_occupation) goto next; - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src, p, 0); set_task_cpu(p, this); activate_task(dst, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; resched_curr(dst); @@ -5736,35 +5993,109 @@ void queue_core_balance(struct rq *rq) queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); } -static inline void sched_core_cpu_starting(unsigned int cpu) +static void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); - struct rq *rq, *core_rq = NULL; - int i; + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t; - core_rq = cpu_rq(cpu)->core; + sched_core_lock(cpu, &flags); - if (!core_rq) { - for_each_cpu(i, smt_mask) { - rq = cpu_rq(i); - if (rq->core && rq->core == rq) - core_rq = rq; + WARN_ON_ONCE(rq->core != rq); + + /* if we're the first, we'll be our own leader */ + if (cpumask_weight(smt_mask) == 1) + goto unlock; + + /* find the leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + rq = cpu_rq(t); + if (rq->core == rq) { + core_rq = rq; + break; } + } - if (!core_rq) - core_rq = cpu_rq(cpu); + if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ + goto unlock; - for_each_cpu(i, smt_mask) { - rq = cpu_rq(i); + /* install and validate core_rq */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); - WARN_ON_ONCE(rq->core && rq->core != core_rq); + if (t == cpu) rq->core = core_rq; - } + + WARN_ON_ONCE(rq->core != core_rq); + } + +unlock: + sched_core_unlock(cpu, &flags); +} + +static void sched_core_cpu_deactivate(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t; + + sched_core_lock(cpu, &flags); + + /* if we're the last man standing, nothing to do */ + if (cpumask_weight(smt_mask) == 1) { + WARN_ON_ONCE(rq->core != rq); + goto unlock; + } + + /* if we're not the leader, nothing to do */ + if (rq->core != rq) + goto unlock; + + /* find a new leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + core_rq = cpu_rq(t); + break; } + + if (WARN_ON_ONCE(!core_rq)) /* impossible */ + goto unlock; + + /* copy the shared state to the new leader */ + core_rq->core_task_seq = rq->core_task_seq; + core_rq->core_pick_seq = rq->core_pick_seq; + core_rq->core_cookie = rq->core_cookie; + core_rq->core_forceidle = rq->core_forceidle; + core_rq->core_forceidle_seq = rq->core_forceidle_seq; + + /* install new leader */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); + rq->core = core_rq; + } + +unlock: + sched_core_unlock(cpu, &flags); } + +static inline void sched_core_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->core != rq) + rq->core = rq; +} + #else /* !CONFIG_SCHED_CORE */ static inline void sched_core_cpu_starting(unsigned int cpu) {} +static inline void sched_core_cpu_deactivate(unsigned int cpu) {} +static inline void sched_core_cpu_dying(unsigned int cpu) {} static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) @@ -5775,6 +6106,24 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #endif /* CONFIG_SCHED_CORE */ /* + * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a + * preemption from blocking on an 'sleeping' spin/rwlock. Note that + * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to + * optimize the AND operation out and just check for zero. + */ +#define SM_NONE 0x0 +#define SM_PREEMPT 0x1 +#define SM_RTLOCK_WAIT 0x2 + +#ifndef CONFIG_PREEMPT_RT +# define SM_MASK_PREEMPT (~0U) +#else +# define SM_MASK_PREEMPT SM_PREEMPT +#endif + +/* * __schedule() is the main scheduler function. * * The main means of driving the scheduler and thus entering this function are: @@ -5813,7 +6162,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(bool preempt) +static void __sched notrace __schedule(unsigned int sched_mode) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -5826,13 +6175,13 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev, preempt); + schedule_debug(prev, !!sched_mode); if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(preempt); + rcu_note_context_switch(!!sched_mode); /* * Make sure that signal_pending_state()->signal_pending() below @@ -5866,7 +6215,7 @@ static void __sched notrace __schedule(bool preempt) * - ptrace_{,un}freeze_traced() can change ->state underneath us. */ prev_state = READ_ONCE(prev->__state); - if (!preempt && prev_state) { + if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNING); } else { @@ -5932,7 +6281,7 @@ static void __sched notrace __schedule(bool preempt) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(preempt, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -5953,7 +6302,7 @@ void __noreturn do_task_dead(void) /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; - __schedule(false); + __schedule(SM_NONE); BUG(); /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ @@ -6014,7 +6363,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); - __schedule(false); + __schedule(SM_NONE); sched_preempt_enable_no_resched(); } while (need_resched()); sched_update_worker(tsk); @@ -6042,7 +6391,7 @@ void __sched schedule_idle(void) */ WARN_ON_ONCE(current->__state); do { - __schedule(false); + __schedule(SM_NONE); } while (need_resched()); } @@ -6077,6 +6426,18 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +#ifdef CONFIG_PREEMPT_RT +void __sched notrace schedule_rtlock(void) +{ + do { + preempt_disable(); + __schedule(SM_RTLOCK_WAIT); + sched_preempt_enable_no_resched(); + } while (need_resched()); +} +NOKPROBE_SYMBOL(schedule_rtlock); +#endif + static void __sched notrace preempt_schedule_common(void) { do { @@ -6095,7 +6456,7 @@ static void __sched notrace preempt_schedule_common(void) */ preempt_disable_notrace(); preempt_latency_start(1); - __schedule(true); + __schedule(SM_PREEMPT); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); @@ -6174,7 +6535,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) * an infinite recursion. */ prev_ctx = exception_enter(); - __schedule(true); + __schedule(SM_PREEMPT); exception_exit(prev_ctx); preempt_latency_stop(1); @@ -6323,7 +6684,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) do { preempt_disable(); local_irq_enable(); - __schedule(true); + __schedule(SM_PREEMPT); local_irq_disable(); sched_preempt_enable_no_resched(); } while (need_resched()); @@ -7300,6 +7661,16 @@ err_size: return -E2BIG; } +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ + if (task_has_dl_policy(p)) + __getparam_dl(p, attr); + else if (task_has_rt_policy(p)) + attr->sched_priority = p->rt_priority; + else + attr->sched_nice = task_nice(p); +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. @@ -7361,6 +7732,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, rcu_read_unlock(); if (likely(p)) { + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); retval = sched_setattr(p, &attr); put_task_struct(p); } @@ -7509,12 +7882,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - if (task_has_dl_policy(p)) - __getparam_dl(p, &kattr); - else if (task_has_rt_policy(p)) - kattr.sched_priority = p->rt_priority; - else - kattr.sched_nice = task_nice(p); + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK /* @@ -7535,9 +7904,76 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +#ifdef CONFIG_SMP +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { + int ret = 0; + + /* + * If the task isn't a deadline task or admission control is + * disabled then we don't care about affinity changes. + */ + if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) + return 0; + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, mask)) + ret = -EBUSY; + rcu_read_unlock(); + return ret; +} +#endif + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) +{ + int retval; cpumask_var_t cpus_allowed, new_mask; + + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) + return -ENOMEM; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, mask, cpus_allowed); + + retval = dl_task_check_affinity(p, new_mask); + if (retval) + goto out_free_new_mask; +again: + retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); + if (retval) + goto out_free_new_mask; + + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset update. + * Just reset the cpumask to the cpuset's cpus_allowed. + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } + +out_free_new_mask: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ struct task_struct *p; int retval; @@ -7557,68 +7993,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) retval = -EINVAL; goto out_put_task; } - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; - } - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - retval = -EPERM; + if (!check_same_owner(p)) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); - goto out_free_new_mask; + retval = -EPERM; + goto out_put_task; } rcu_read_unlock(); } retval = security_task_setscheduler(p); if (retval) - goto out_free_new_mask; - - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); - - /* - * Since bandwidth control happens on root_domain basis, - * if admission test is enabled, we only admit -deadline - * tasks allowed to run on all the CPUs in the task's - * root_domain. - */ -#ifdef CONFIG_SMP - if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { - rcu_read_lock(); - if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { - retval = -EBUSY; - rcu_read_unlock(); - goto out_free_new_mask; - } - rcu_read_unlock(); - } -#endif -again: - retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); + goto out_put_task; - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } - } -out_free_new_mask: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); + retval = __sched_setaffinity(p, in_mask); out_put_task: put_task_struct(p); return retval; @@ -7761,6 +8151,17 @@ int __sched __cond_resched(void) preempt_schedule_common(); return 1; } + /* + * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * whether the current CPU is in an RCU read-side critical section, + * so the tick can report quiescent states even for CPUs looping + * in kernel context. In contrast, in non-preemptible kernels, + * RCU readers leave no in-memory hints, which means that CPU-bound + * processes executing in kernel context might never report an + * RCU quiescent state. Therefore, the following code causes + * cond_resched() to report a quiescent state, but only when RCU + * is in urgent need of one. + */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); #endif @@ -8707,6 +9108,8 @@ int sched_cpu_deactivate(unsigned int cpu) */ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_present); + + sched_core_cpu_deactivate(cpu); #endif if (!sched_smp_initialized) @@ -8811,6 +9214,7 @@ int sched_cpu_dying(unsigned int cpu) calc_load_migrate(rq); update_max_interval(); hrtick_clear(rq); + sched_core_cpu_dying(cpu); return 0; } #endif @@ -9022,7 +9426,7 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); #ifdef CONFIG_SCHED_CORE - rq->core = NULL; + rq->core = rq; rq->core_pick = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; @@ -9804,7 +10208,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cfs_constraints_mutex); ret = __cfs_schedulable(tg, period, quota); if (ret) @@ -9848,7 +10252,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); - put_online_cpus(); + cpus_read_unlock(); return ret; } @@ -10099,6 +10503,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->idle; +} + +static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 idle) +{ + return sched_group_set_idle(css_tg(css), idle); +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -10106,6 +10524,11 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, + { + .name = "idle", + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -10313,6 +10736,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, + { + .name = "idle", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { |