diff options
Diffstat (limited to 'kernel')
68 files changed, 3259 insertions, 890 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 416017301660..bd7c4147b9a8 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -99,3 +99,23 @@ config PREEMPT_DYNAMIC Interesting if you want the same pre-built kernel should be used for both Server and Desktop workloads. + +config SCHED_CORE + bool "Core Scheduling for SMT" + default y + depends on SCHED_SMT + help + This option permits Core Scheduling, a means of coordinated task + selection across SMT siblings. When enabled -- see + prctl(PR_SCHED_CORE) -- task selection ensures that all SMT siblings + will execute a task from the same 'core group', forcing idle when no + matching task is found. + + Use of this feature includes: + - mitigation of some (not all) SMT side channels; + - limiting SMT interference to improve determinism and/or performance. + + SCHED_CORE is default enabled when SCHED_SMT is enabled -- when + unused there should be no impact on performance. + + diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 94ba5163d4c5..c6a27574242d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6483,6 +6483,27 @@ struct bpf_sanitize_info { bool mask_to_left; }; +static struct bpf_verifier_state * +sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) +{ + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, next_idx, curr_idx, true); + if (branch && insn) { + regs = branch->frame[branch->curframe]->regs; + if (BPF_SRC(insn->code) == BPF_K) { + mark_reg_unknown(env, regs, insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->src_reg); + } + } + return branch; +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, @@ -6566,12 +6587,26 @@ do_sim: tmp = *dst_reg; *dst_reg = *ptr_reg; } - ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, + env->insn_idx); if (!ptr_is_dst_reg && ret) *dst_reg = tmp; return !ret ? REASON_STACK : 0; } +static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + + /* If we simulate paths under speculation, we don't update the + * insn as 'seen' such that when we verify unreachable paths in + * the non-speculative domain, sanitize_dead_code() can still + * rewrite/sanitize them. + */ + if (!vstate->speculative) + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; +} + static int sanitize_err(struct bpf_verifier_env *env, const struct bpf_insn *insn, int reason, const struct bpf_reg_state *off_reg, @@ -8750,14 +8785,28 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; } + if (pred == 1) { - /* only follow the goto, ignore fall-through */ + /* Only follow the goto, ignore fall-through. If needed, push + * the fall-through branch for simulation under speculative + * execution. + */ + if (!env->bypass_spec_v1 && + !sanitize_speculative_path(env, insn, *insn_idx + 1, + *insn_idx)) + return -EFAULT; *insn_idx += insn->off; return 0; } else if (pred == 0) { - /* only follow fall-through branch, since - * that's where the program will go + /* Only follow the fall-through branch, since that's where the + * program will go. If needed, push the goto branch for + * simulation under speculative execution. */ + if (!env->bypass_spec_v1 && + !sanitize_speculative_path(env, insn, + *insn_idx + insn->off + 1, + *insn_idx)) + return -EFAULT; return 0; } @@ -10630,7 +10679,7 @@ static int do_check(struct bpf_verifier_env *env) } regs = cur_regs(env); - env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; + sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { @@ -10857,7 +10906,7 @@ process_bpf_exit: return err; env->insn_idx++; - env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; + sanitize_mark_insn_seen(env); } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -11366,6 +11415,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; + u32 old_seen = old_data[off].seen; u32 prog_len; int i; @@ -11386,7 +11436,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); for (i = off; i < off + cnt - 1; i++) { - new_data[i].seen = env->pass_cnt; + /* Expand insni[off]'s seen count to the patched range. */ + new_data[i].seen = old_seen; new_data[i].zext_dst = insn_has_def32(env, insn + i); } env->insn_aux_data = new_data; @@ -12710,6 +12761,9 @@ static void free_states(struct bpf_verifier_env *env) * insn_aux_data was touched. These variables are compared to clear temporary * data from failed pass. For testing and experiments do_check_common() can be * run multiple times even when prior attempt to verify is unsuccessful. + * + * Note that special handling is needed on !env->bypass_spec_v1 if this is + * ever called outside of error path with subsequent program rejection. */ static void sanitize_insn_aux_data(struct bpf_verifier_env *env) { diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 1f274d7fc934..ee93b6e89587 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -713,7 +713,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { - switch (tsk->state) { + switch (READ_ONCE(tsk->__state)) { case TASK_RUNNING: stats->nr_running++; break; diff --git a/kernel/cpu.c b/kernel/cpu.c index d2e1692d7bdf..804b847912dc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1008,7 +1008,7 @@ static int takedown_cpu(unsigned int cpu) int err; /* Park the smpboot threads */ - kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); + kthread_park(st->thread); /* * Prevent irq alloc/free while the dying cpu reorganizes the @@ -1024,7 +1024,7 @@ static int takedown_cpu(unsigned int cpu) /* CPU refused to die */ irq_unlock_sparse(); /* Unpark the hotplug thread so we can rollback there */ - kthread_unpark(per_cpu_ptr(&cpuhp_state, cpu)->thread); + kthread_unpark(st->thread); return err; } BUG_ON(cpu_online(cpu)); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 825284baaf46..684a6061a13a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -464,6 +464,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(SECTION_SIZE_BITS); VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); diff --git a/kernel/cred.c b/kernel/cred.c index e1d274cd741b..e6fd2b3fc31f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -60,6 +60,7 @@ struct cred init_cred = { .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, + .ucounts = &init_ucounts, }; static inline void set_cred_subscribers(struct cred *cred, int n) @@ -119,6 +120,8 @@ static void put_cred_rcu(struct rcu_head *rcu) if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); + if (cred->ucounts) + put_ucounts(cred->ucounts); put_user_ns(cred->user_ns); kmem_cache_free(cred_jar, cred); } @@ -222,6 +225,7 @@ struct cred *cred_alloc_blank(void) #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; #endif + new->ucounts = get_ucounts(&init_ucounts); if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; @@ -284,6 +288,11 @@ struct cred *prepare_creds(void) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; + + new->ucounts = get_ucounts(new->ucounts); + if (!new->ucounts) + goto error; + validate_creds(new); return new; @@ -351,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) kdebug("share_creds(%p{%d,%d})", p->cred, atomic_read(&p->cred->usage), read_cred_subscribers(p->cred)); - atomic_inc(&p->cred->user->processes); + inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; } @@ -363,6 +372,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) ret = create_user_ns(new); if (ret < 0) goto error_put; + ret = set_cred_ucounts(new); + if (ret < 0) + goto error_put; } #ifdef CONFIG_KEYS @@ -384,8 +396,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) } #endif - atomic_inc(&new->user->processes); p->cred = p->real_cred = get_cred(new); + inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); alter_cred_subscribers(new, 2); validate_creds(new); return 0; @@ -485,12 +497,12 @@ int commit_creds(struct cred *new) * in set_user(). */ alter_cred_subscribers(new, 2); - if (new->user != old->user) - atomic_inc(&new->user->processes); + if (new->user != old->user || new->user_ns != old->user_ns) + inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user) - atomic_dec(&old->user->processes); + dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); alter_cred_subscribers(old, -2); /* send notifications */ @@ -653,6 +665,31 @@ int cred_fscmp(const struct cred *a, const struct cred *b) } EXPORT_SYMBOL(cred_fscmp); +int set_cred_ucounts(struct cred *new) +{ + struct task_struct *task = current; + const struct cred *old = task->real_cred; + struct ucounts *old_ucounts = new->ucounts; + + if (new->user == old->user && new->user_ns == old->user_ns) + return 0; + + /* + * This optimization is needed because alloc_ucounts() uses locks + * for table lookups. + */ + if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) + return 0; + + if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid))) + return -EAGAIN; + + if (old_ucounts) + put_ucounts(old_ucounts); + + return 0; +} + /* * initialise the credentials stuff */ @@ -719,6 +756,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; + new->ucounts = get_ucounts(new->ucounts); + if (!new->ucounts) + goto error; + put_cred(old); validate_creds(new); return new; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 1baa96a2ecb8..622410c45da1 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2488,7 +2488,6 @@ static void kdb_sysinfo(struct sysinfo *val) static int kdb_summary(int argc, const char **argv) { time64_t now; - struct tm tm; struct sysinfo val; if (argc) @@ -2502,13 +2501,7 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("domainname %s\n", init_uts_ns.name.domainname); now = __ktime_get_real_seconds(); - time64_to_tm(now, 0, &tm); - kdb_printf("date %04ld-%02d-%02d %02d:%02d:%02d " - "tz_minuteswest %d\n", - 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec, - sys_tz.tz_minuteswest); - + kdb_printf("date %ptTs tz_minuteswest %d\n", &now, sys_tz.tz_minuteswest); kdb_sysinfo(&val); kdb_printf("uptime "); if (val.uptime > (24*60*60)) { diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 91bb666d7c03..9f50d22d68e6 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -609,23 +609,25 @@ unsigned long kdb_task_state_string(const char *s) */ char kdb_task_state_char (const struct task_struct *p) { - int cpu; - char state; + unsigned int p_state; unsigned long tmp; + char state; + int cpu; if (!p || copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long))) return 'E'; cpu = kdb_process_cpu(p); - state = (p->state == 0) ? 'R' : - (p->state < 0) ? 'U' : - (p->state & TASK_UNINTERRUPTIBLE) ? 'D' : - (p->state & TASK_STOPPED) ? 'T' : - (p->state & TASK_TRACED) ? 'C' : + p_state = READ_ONCE(p->__state); + state = (p_state == 0) ? 'R' : + (p_state < 0) ? 'U' : + (p_state & TASK_UNINTERRUPTIBLE) ? 'D' : + (p_state & TASK_STOPPED) ? 'T' : + (p_state & TASK_TRACED) ? 'C' : (p->exit_state & EXIT_ZOMBIE) ? 'Z' : (p->exit_state & EXIT_DEAD) ? 'E' : - (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; + (p_state & TASK_INTERRUPTIBLE) ? 'S' : '?'; if (is_idle_task(p)) { /* Idle task. Is it really idle, apart from the kdb * interrupt? */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 27725754ac99..51530d5b15a8 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -7,30 +7,64 @@ #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/cputime.h> +#include <linux/sched/clock.h> #include <linux/slab.h> #include <linux/taskstats.h> -#include <linux/time.h> #include <linux/sysctl.h> #include <linux/delayacct.h> #include <linux/module.h> -int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ -EXPORT_SYMBOL_GPL(delayacct_on); +DEFINE_STATIC_KEY_FALSE(delayacct_key); +int delayacct_on __read_mostly; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; -static int __init delayacct_setup_disable(char *str) +static void set_delayacct(bool enabled) { - delayacct_on = 0; + if (enabled) { + static_branch_enable(&delayacct_key); + delayacct_on = 1; + } else { + delayacct_on = 0; + static_branch_disable(&delayacct_key); + } +} + +static int __init delayacct_setup_enable(char *str) +{ + delayacct_on = 1; return 1; } -__setup("nodelayacct", delayacct_setup_disable); +__setup("delayacct", delayacct_setup_enable); void delayacct_init(void) { delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); + set_delayacct(delayacct_on); } +#ifdef CONFIG_PROC_SYSCTL +int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int state = delayacct_on; + struct ctl_table t; + int err; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_delayacct(state); + return err; +} +#endif + void __delayacct_tsk_init(struct task_struct *tsk) { tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); @@ -42,10 +76,9 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ -static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, - u32 *count) +static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count) { - s64 ns = ktime_get_ns() - *start; + s64 ns = local_clock() - *start; unsigned long flags; if (ns > 0) { @@ -58,7 +91,7 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, void __delayacct_blkio_start(void) { - current->delays->blkio_start = ktime_get_ns(); + current->delays->blkio_start = local_clock(); } /* @@ -82,7 +115,7 @@ void __delayacct_blkio_end(struct task_struct *p) delayacct_end(&delays->lock, &delays->blkio_start, total, count); } -int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) +int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { u64 utime, stime, stimescaled, utimescaled; unsigned long long t2, t3; @@ -117,6 +150,9 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_run_virtual_total = (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; + if (!tsk->delays) + return 0; + /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ raw_spin_lock_irqsave(&tsk->delays->lock, flags); @@ -151,21 +187,20 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) void __delayacct_freepages_start(void) { - current->delays->freepages_start = ktime_get_ns(); + current->delays->freepages_start = local_clock(); } void __delayacct_freepages_end(void) { - delayacct_end( - ¤t->delays->lock, - ¤t->delays->freepages_start, - ¤t->delays->freepages_delay, - ¤t->delays->freepages_count); + delayacct_end(¤t->delays->lock, + ¤t->delays->freepages_start, + ¤t->delays->freepages_delay, + ¤t->delays->freepages_count); } void __delayacct_thrashing_start(void) { - current->delays->thrashing_start = ktime_get_ns(); + current->delays->thrashing_start = local_clock(); } void __delayacct_thrashing_end(void) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 8ca7d505d61c..e50df8d8f87e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -335,6 +335,14 @@ void __init swiotlb_exit(void) } /* + * Return the offset into a iotlb slot required to keep the device happy. + */ +static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) +{ + return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); +} + +/* * Bounce: copy the swiotlb buffer from or back to the original dma location */ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, @@ -346,10 +354,17 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); + unsigned int tlb_offset; if (orig_addr == INVALID_PHYS_ADDR) return; + tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) - + swiotlb_align_offset(dev, orig_addr); + + orig_addr += tlb_offset; + alloc_size -= tlb_offset; + if (size > alloc_size) { dev_WARN_ONCE(dev, 1, "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n", @@ -391,14 +406,6 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size #define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) /* - * Return the offset into a iotlb slot required to keep the device happy. - */ -static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) -{ - return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); -} - -/* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. */ static inline unsigned long get_max_slots(unsigned long boundary_mask) diff --git a/kernel/events/core.c b/kernel/events/core.c index fe88d6eea3c2..4576413b6230 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -132,6 +132,7 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) /** * cpu_function_call - call a function on the cpu + * @cpu: target cpu to queue this function * @func: the function to be called * @info: the function call argument * @@ -3821,9 +3822,16 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { struct perf_cpu_context *cpuctx; - struct pmu *pmu = ctx->pmu; + struct pmu *pmu; cpuctx = __get_cpu_context(ctx); + + /* + * HACK: for HETEROGENEOUS the task context might have switched to a + * different PMU, force (re)set the context, + */ + pmu = ctx->pmu = cpuctx->ctx.pmu; + if (cpuctx->task_ctx == ctx) { if (cpuctx->sched_cb_usage) __perf_pmu_sched_task(cpuctx, true); @@ -6669,10 +6677,10 @@ out: return data->aux_size; } -long perf_pmu_snapshot_aux(struct perf_buffer *rb, - struct perf_event *event, - struct perf_output_handle *handle, - unsigned long size) +static long perf_pmu_snapshot_aux(struct perf_buffer *rb, + struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size) { unsigned long flags; long ret; @@ -8682,13 +8690,12 @@ static void perf_event_switch(struct task_struct *task, }, }; - if (!sched_in && task->state == TASK_RUNNING) + if (!sched_in && task->on_rq) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; + } - perf_iterate_sb(perf_event_switch_output, - &switch_event, - NULL); + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } /* @@ -11919,6 +11926,7 @@ again: * @pid: target pid * @cpu: target cpu * @group_fd: group leader event fd + * @flags: perf event open flags */ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, @@ -12375,6 +12383,8 @@ err_fd: * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound * @task: task to profile (NULL for percpu) + * @overflow_handler: callback to trigger when we hit the event + * @context: context data could be used in overflow_handler callback */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b48d7039a015..835973444a1e 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -451,6 +451,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) * register_user_hw_breakpoint - register a hardware breakpoint for user space * @attr: breakpoint attributes * @triggered: callback to trigger when we hit the breakpoint + * @context: context data could be used in the triggered callback * @tsk: pointer to 'task_struct' of the process to which the address belongs */ struct perf_event * @@ -550,6 +551,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); * register_wide_hw_breakpoint - register a wide breakpoint in the kernel * @attr: breakpoint attributes * @triggered: callback to trigger when we hit the breakpoint + * @context: context data could be used in the triggered callback * * @return a set of per_cpu pointers to perf events */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6addc9780319..a481ef696143 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -453,6 +453,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * that have fixed length instructions. * * uprobe_write_opcode - write the opcode at a given virtual address. + * @auprobe: arch specific probepoint information. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. diff --git a/kernel/exit.c b/kernel/exit.c index fd1c04193e18..9a89e7f36acb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -162,7 +162,6 @@ static void __exit_signal(struct task_struct *tsk) flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); } - exit_task_sigqueue_cache(tsk); } static void delayed_put_task_struct(struct rcu_head *rhp) @@ -189,7 +188,7 @@ repeat: /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); - atomic_dec(&__task_cred(p)->user->processes); + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); cgroup_release(p); diff --git a/kernel/fork.c b/kernel/fork.c index dc06afd725cb..b4386ff6a641 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -425,7 +425,7 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk) static void release_task_stack(struct task_struct *tsk) { - if (WARN_ON(tsk->state != TASK_DEAD)) + if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) return; /* Better to leak the stack than to free prematurely */ account_kernel_stack(tsk, -1); @@ -742,6 +742,7 @@ void __put_task_struct(struct task_struct *tsk) exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); + sched_core_free(tsk); if (!profile_handoff_task(tsk)) free_task(tsk); @@ -824,9 +825,14 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; - for (i = 0; i < UCOUNT_COUNTS; i++) + for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK)); + #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); @@ -1977,8 +1983,7 @@ static __latent_entropy struct task_struct *copy_process( DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; - if (atomic_read(&p->real_cred->user->processes) >= - task_rlimit(p, RLIMIT_NPROC)) { + if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; @@ -1999,7 +2004,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); @@ -2008,7 +2013,6 @@ static __latent_entropy struct task_struct *copy_process( spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); - p->sigqueue_cache = NULL; p->utime = p->stime = p->gtime = 0; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME @@ -2250,6 +2254,8 @@ static __latent_entropy struct task_struct *copy_process( klp_copy_process(p); + sched_core_fork(p); + spin_lock(¤t->sighand->siglock); /* @@ -2337,6 +2343,7 @@ static __latent_entropy struct task_struct *copy_process( return p; bad_fork_cancel_cgroup: + sched_core_free(p); spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p, args); @@ -2385,10 +2392,10 @@ bad_fork_cleanup_threadgroup_lock: #endif delayacct_tsk_free(p); bad_fork_cleanup_count: - atomic_dec(&p->cred->user->processes); + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); exit_creds(p); bad_fork_free: - p->state = TASK_DEAD; + WRITE_ONCE(p->__state, TASK_DEAD); put_task_stack(p); delayed_free_task(p); fork_out: @@ -2408,7 +2415,7 @@ static inline void init_idle_pids(struct task_struct *idle) } } -struct task_struct *fork_idle(int cpu) +struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { @@ -2998,6 +3005,12 @@ int ksys_unshare(unsigned long unshare_flags) if (err) goto bad_unshare_cleanup_cred; + if (new_cred) { + err = set_cred_ucounts(new_cred); + if (err) + goto bad_unshare_cleanup_cred; + } + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* diff --git a/kernel/freezer.c b/kernel/freezer.c index dc520f01f99d..45ab36ffd0e7 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -58,7 +58,7 @@ bool __refrigerator(bool check_kthr_stop) /* Hmm, should we be allowed to suspend when there are realtime processes around? */ bool was_frozen = false; - long save = current->state; + unsigned int save = get_current_state(); pr_debug("%s entered refrigerator\n", current->comm); diff --git a/kernel/futex.c b/kernel/futex.c index 4938a00bc785..2ecb07575055 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -35,7 +35,6 @@ #include <linux/jhash.h> #include <linux/pagemap.h> #include <linux/syscalls.h> -#include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/memblock.h> #include <linux/fault-inject.h> @@ -650,7 +649,7 @@ again: key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); - key->shared.pgoff = basepage_index(tail); + key->shared.pgoff = page_to_pgoff(tail); rcu_read_unlock(); } @@ -1728,12 +1727,9 @@ retry_private: return ret; } - if (!(flags & FLAGS_SHARED)) { - cond_resched(); - goto retry_private; - } - cond_resched(); + if (!(flags & FLAGS_SHARED)) + goto retry_private; goto retry; } @@ -1874,7 +1870,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * If the caller intends to requeue more than 1 waiter to pifutex, * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, * as we have means to handle the possible fault. If not, don't set - * the bit unecessarily as it will force the subsequent unlock to enter + * the bit unnecessarily as it will force the subsequent unlock to enter * the kernel. */ top_waiter = futex_top_waiter(hb1, key1); @@ -2103,7 +2099,7 @@ retry_private: continue; /* - * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always * be paired with each other and no other futex ops. * * We should never be requeueing a futex_q with a pi_state, @@ -2318,7 +2314,7 @@ retry: } /* - * PI futexes can not be requeued and must remove themself from the + * PI futexes can not be requeued and must remove themselves from the * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. */ static void unqueue_me_pi(struct futex_q *q) @@ -2786,7 +2782,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, if (refill_pi_state_cache()) return -ENOMEM; - to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0); + to = futex_setup_timer(time, &timeout, flags, 0); retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); @@ -2903,7 +2899,7 @@ no_block: */ res = fixup_owner(uaddr, &q, !ret); /* - * If fixup_owner() returned an error, proprogate that. If it acquired + * If fixup_owner() returned an error, propagate that. If it acquired * the lock, clear our -ETIMEDOUT or -EINTR. */ if (res) @@ -3280,7 +3276,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ res = fixup_owner(uaddr2, &q, !ret); /* - * If fixup_owner() returned an error, proprogate that. If it + * If fixup_owner() returned an error, propagate that. If it * acquired the lock, clear -ETIMEDOUT or -EINTR. */ if (res) @@ -3678,7 +3674,7 @@ void futex_exec_release(struct task_struct *tsk) { /* * The state handling is done for consistency, but in the case of - * exec() there is no way to prevent futher damage as the PID stays + * exec() there is no way to prevent further damage as the PID stays * the same. But for the unlikely and arguably buggy case that a * futex is held on exec(), this provides at least as much state * consistency protection which is possible. @@ -3710,12 +3706,14 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (op & FUTEX_CLOCK_REALTIME) { flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && + cmd != FUTEX_LOCK_PI2) return -ENOSYS; } switch (cmd) { case FUTEX_LOCK_PI: + case FUTEX_LOCK_PI2: case FUTEX_UNLOCK_PI: case FUTEX_TRYLOCK_PI: case FUTEX_WAIT_REQUEUE_PI: @@ -3742,6 +3740,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: + flags |= FLAGS_CLOCKRT; + fallthrough; + case FUTEX_LOCK_PI2: return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: return futex_unlock_pi(uaddr, flags); @@ -3762,6 +3763,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd) switch (cmd) { case FUTEX_WAIT: case FUTEX_LOCK_PI: + case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: return true; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 396ebaebea3f..b0ce8b3f3822 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -196,7 +196,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) last_break = jiffies; } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (t->state == TASK_UNINTERRUPTIBLE) + if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); } unlock: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4c14356543d9..a847dd2044c8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -441,7 +441,8 @@ out_unlock: return ret; } -int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) +static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, + bool force) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -456,6 +457,36 @@ int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) return ret; } +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + * Fails if cpumask does not contain an online CPU + */ +int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) +{ + return __irq_set_affinity(irq, cpumask, false); +} +EXPORT_SYMBOL_GPL(irq_set_affinity); + +/** + * irq_force_affinity - Force the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + * Same as irq_set_affinity, but without checking the mask against + * online cpus. + * + * Solely for low level cpu hotplug code, where we need to make per + * cpu interrupts affine before the cpu becomes online. + */ +int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) +{ + return __irq_set_affinity(irq, cpumask, true); +} +EXPORT_SYMBOL_GPL(irq_force_affinity); + int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { unsigned long flags; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index ba39fbb1f8e7..bdb0681bece8 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -309,7 +309,7 @@ EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (jump_entry_code(entry) <= (unsigned long)end && - jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE > (unsigned long)start) + jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start) return 1; return 0; @@ -483,13 +483,14 @@ void __init jump_label_init(void) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; + bool in_init; /* rewrite NOPs */ if (jump_label_type(iter) == JUMP_LABEL_NOP) arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); - if (init_section_contains((void *)jump_entry_code(iter), 1)) - jump_entry_set_init(iter); + in_init = init_section_contains((void *)jump_entry_code(iter), 1); + jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) @@ -634,9 +635,10 @@ static int jump_label_add_module(struct module *mod) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; + bool in_init; - if (within_module_init(jump_entry_code(iter), mod)) - jump_entry_set_init(iter); + in_init = within_module_init(jump_entry_code(iter), mod); + jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 13dce3c664d6..56016e8e7461 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -460,7 +460,7 @@ static void set_other_info_task_blocking(unsigned long *flags, * We may be instrumenting a code-path where current->state is already * something other than TASK_RUNNING. */ - const bool is_running = current->state == TASK_RUNNING; + const bool is_running = task_is_running(current); /* * To avoid deadlock in case we are in an interrupt here and this is a * race with a task on the same CPU (KCSAN_INTERRUPT_WATCHER), provide a diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 745f08fdd7a6..e41385afe79d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1183,23 +1183,6 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, } NOKPROBE_SYMBOL(aggr_post_handler); -static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, - int trapnr) -{ - struct kprobe *cur = __this_cpu_read(kprobe_instance); - - /* - * if we faulted "during" the execution of a user specified - * probe handler, invoke just that probe's fault handler - */ - if (cur && cur->fault_handler) { - if (cur->fault_handler(cur, regs, trapnr)) - return 1; - } - return 0; -} -NOKPROBE_SYMBOL(aggr_fault_handler); - /* Walks the list and increments nmissed count for multiprobe case */ void kprobes_inc_nmissed_count(struct kprobe *p) { @@ -1330,7 +1313,6 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) ap->addr = p->addr; ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED; ap->pre_handler = aggr_pre_handler; - ap->fault_handler = aggr_fault_handler; /* We don't care the kprobe which has gone. */ if (p->post_handler && !kprobe_gone(p)) ap->post_handler = aggr_post_handler; @@ -2014,7 +1996,6 @@ int register_kretprobe(struct kretprobe *rp) rp->kp.pre_handler = pre_handler_kretprobe; rp->kp.post_handler = NULL; - rp->kp.fault_handler = NULL; /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { diff --git a/kernel/kthread.c b/kernel/kthread.c index fe3f2a40d61e..6b0a30a944b3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -68,16 +68,6 @@ enum KTHREAD_BITS { KTHREAD_SHOULD_PARK, }; -static inline void set_kthread_struct(void *kthread) -{ - /* - * We abuse ->set_child_tid to avoid the new member and because it - * can't be wrongly copied by copy_process(). We also rely on fact - * that the caller can't exec, so PF_KTHREAD can't be cleared. - */ - current->set_child_tid = (__force void __user *)kthread; -} - static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); @@ -103,6 +93,22 @@ static inline struct kthread *__to_kthread(struct task_struct *p) return kthread; } +void set_kthread_struct(struct task_struct *p) +{ + struct kthread *kthread; + + if (__to_kthread(p)) + return; + + kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); + /* + * We abuse ->set_child_tid to avoid the new member and because it + * can't be wrongly copied by copy_process(). We also rely on fact + * that the caller can't exec, so PF_KTHREAD can't be cleared. + */ + p->set_child_tid = (__force void __user *)kthread; +} + void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; @@ -272,8 +278,8 @@ static int kthread(void *_create) struct kthread *self; int ret; - self = kzalloc(sizeof(*self), GFP_KERNEL); - set_kthread_struct(self); + set_kthread_struct(current); + self = to_kthread(current); /* If user was SIGKILLed, I release the structure. */ done = xchg(&create->done, NULL); @@ -451,7 +457,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) +static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; @@ -467,7 +473,7 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas raw_spin_unlock_irqrestore(&p->pi_lock, flags); } -static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) +static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } @@ -1093,8 +1099,38 @@ void kthread_flush_work(struct kthread_work *work) EXPORT_SYMBOL_GPL(kthread_flush_work); /* - * This function removes the work from the worker queue. Also it makes sure - * that it won't get queued later via the delayed work's timer. + * Make sure that the timer is neither set nor running and could + * not manipulate the work list_head any longer. + * + * The function is called under worker->lock. The lock is temporary + * released but the timer can't be set again in the meantime. + */ +static void kthread_cancel_delayed_work_timer(struct kthread_work *work, + unsigned long *flags) +{ + struct kthread_delayed_work *dwork = + container_of(work, struct kthread_delayed_work, work); + struct kthread_worker *worker = work->worker; + + /* + * del_timer_sync() must be called to make sure that the timer + * callback is not running. The lock must be temporary released + * to avoid a deadlock with the callback. In the meantime, + * any queuing is blocked by setting the canceling counter. + */ + work->canceling++; + raw_spin_unlock_irqrestore(&worker->lock, *flags); + del_timer_sync(&dwork->timer); + raw_spin_lock_irqsave(&worker->lock, *flags); + work->canceling--; +} + +/* + * This function removes the work from the worker queue. + * + * It is called under worker->lock. The caller must make sure that + * the timer used by delayed work is not running, e.g. by calling + * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. @@ -1102,28 +1138,8 @@ EXPORT_SYMBOL_GPL(kthread_flush_work); * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ -static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, - unsigned long *flags) +static bool __kthread_cancel_work(struct kthread_work *work) { - /* Try to cancel the timer if exists. */ - if (is_dwork) { - struct kthread_delayed_work *dwork = - container_of(work, struct kthread_delayed_work, work); - struct kthread_worker *worker = work->worker; - - /* - * del_timer_sync() must be called to make sure that the timer - * callback is not running. The lock must be temporary released - * to avoid a deadlock with the callback. In the meantime, - * any queuing is blocked by setting the canceling counter. - */ - work->canceling++; - raw_spin_unlock_irqrestore(&worker->lock, *flags); - del_timer_sync(&dwork->timer); - raw_spin_lock_irqsave(&worker->lock, *flags); - work->canceling--; - } - /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. @@ -1176,11 +1192,23 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); - /* Do not fight with another command that is canceling this work. */ + /* + * Temporary cancel the work but do not fight with another command + * that is canceling the work as well. + * + * It is a bit tricky because of possible races with another + * mod_delayed_work() and cancel_delayed_work() callers. + * + * The timer must be canceled first because worker->lock is released + * when doing so. But the work can be removed from the queue (list) + * only when it can be queued again so that the return value can + * be used for reference counting. + */ + kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) goto out; + ret = __kthread_cancel_work(work); - ret = __kthread_cancel_work(work, true, &flags); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: @@ -1202,7 +1230,10 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); - ret = __kthread_cancel_work(work, is_dwork, &flags); + if (is_dwork) + kthread_cancel_delayed_work_timer(work, &flags); + + ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7641bd407239..e97d08001437 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -760,7 +760,7 @@ static void lockdep_print_held_locks(struct task_struct *p) * It's not reliable to print a task's held locks if it's not sleeping * and it's not the current task. */ - if (p->state == TASK_RUNNING && p != current) + if (p != current && task_is_running(p)) return; for (i = 0; i < depth; i++) { printk(" #%d: ", i); @@ -843,7 +843,7 @@ static int count_matching_names(struct lock_class *new_class) } /* used from NMI context -- must be lockless */ -static __always_inline struct lock_class * +static noinstr struct lock_class * look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; @@ -851,12 +851,14 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) struct lock_class *class; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + instrumentation_begin(); debug_locks_off(); printk(KERN_ERR "BUG: looking up invalid subclass: %u\n", subclass); printk(KERN_ERR "turning off the locking correctness validator.\n"); dump_stack(); + instrumentation_end(); return NULL; } @@ -2304,7 +2306,56 @@ static void print_lock_class_header(struct lock_class *class, int depth) } /* - * printk the shortest lock dependencies from @start to @end in reverse order: + * Dependency path printing: + * + * After BFS we get a lock dependency path (linked via ->parent of lock_list), + * printing out each lock in the dependency path will help on understanding how + * the deadlock could happen. Here are some details about dependency path + * printing: + * + * 1) A lock_list can be either forwards or backwards for a lock dependency, + * for a lock dependency A -> B, there are two lock_lists: + * + * a) lock_list in the ->locks_after list of A, whose ->class is B and + * ->links_to is A. In this case, we can say the lock_list is + * "A -> B" (forwards case). + * + * b) lock_list in the ->locks_before list of B, whose ->class is A + * and ->links_to is B. In this case, we can say the lock_list is + * "B <- A" (bacwards case). + * + * The ->trace of both a) and b) point to the call trace where B was + * acquired with A held. + * + * 2) A "helper" lock_list is introduced during BFS, this lock_list doesn't + * represent a certain lock dependency, it only provides an initial entry + * for BFS. For example, BFS may introduce a "helper" lock_list whose + * ->class is A, as a result BFS will search all dependencies starting with + * A, e.g. A -> B or A -> C. + * + * The notation of a forwards helper lock_list is like "-> A", which means + * we should search the forwards dependencies starting with "A", e.g A -> B + * or A -> C. + * + * The notation of a bacwards helper lock_list is like "<- B", which means + * we should search the backwards dependencies ending with "B", e.g. + * B <- A or B <- C. + */ + +/* + * printk the shortest lock dependencies from @root to @leaf in reverse order. + * + * We have a lock dependency path as follow: + * + * @root @leaf + * | | + * V V + * ->parent ->parent + * | lock_list | <--------- | lock_list | ... | lock_list | <--------- | lock_list | + * | -> L1 | | L1 -> L2 | ... |Ln-2 -> Ln-1| | Ln-1 -> Ln| + * + * , so it's natural that we start from @leaf and print every ->class and + * ->trace until we reach the @root. */ static void __used print_shortest_lock_dependencies(struct lock_list *leaf, @@ -2332,6 +2383,61 @@ print_shortest_lock_dependencies(struct lock_list *leaf, } while (entry && (depth >= 0)); } +/* + * printk the shortest lock dependencies from @leaf to @root. + * + * We have a lock dependency path (from a backwards search) as follow: + * + * @leaf @root + * | | + * V V + * ->parent ->parent + * | lock_list | ---------> | lock_list | ... | lock_list | ---------> | lock_list | + * | L2 <- L1 | | L3 <- L2 | ... | Ln <- Ln-1 | | <- Ln | + * + * , so when we iterate from @leaf to @root, we actually print the lock + * dependency path L1 -> L2 -> .. -> Ln in the non-reverse order. + * + * Another thing to notice here is that ->class of L2 <- L1 is L1, while the + * ->trace of L2 <- L1 is the call trace of L2, in fact we don't have the call + * trace of L1 in the dependency path, which is alright, because most of the + * time we can figure out where L1 is held from the call trace of L2. + */ +static void __used +print_shortest_lock_dependencies_backwards(struct lock_list *leaf, + struct lock_list *root) +{ + struct lock_list *entry = leaf; + const struct lock_trace *trace = NULL; + int depth; + + /*compute depth from generated tree by BFS*/ + depth = get_lock_depth(leaf); + + do { + print_lock_class_header(entry->class, depth); + if (trace) { + printk("%*s ... acquired at:\n", depth, ""); + print_lock_trace(trace, 2); + printk("\n"); + } + + /* + * Record the pointer to the trace for the next lock_list + * entry, see the comments for the function. + */ + trace = entry->trace; + + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad path found in chain graph\n", __func__); + break; + } + + entry = get_lock_parent(entry); + depth--; + } while (entry && (depth >= 0)); +} + static void print_irq_lock_scenario(struct lock_list *safe_entry, struct lock_list *unsafe_entry, @@ -2446,10 +2552,7 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); - prev_root->trace = save_trace(); - if (!prev_root->trace) - return; - print_shortest_lock_dependencies(backwards_entry, prev_root); + print_shortest_lock_dependencies_backwards(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); @@ -2667,8 +2770,18 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, * Step 3: we found a bad match! Now retrieve a lock from the backward * list whose usage mask matches the exclusive usage mask from the * lock found on the forward list. + * + * Note, we should only keep the LOCKF_ENABLED_IRQ_ALL bits, considering + * the follow case: + * + * When trying to add A -> B to the graph, we find that there is a + * hardirq-safe L, that L -> ... -> A, and another hardirq-unsafe M, + * that B -> ... -> M. However M is **softirq-safe**, if we use exact + * invert bits of M's usage_mask, we will find another lock N that is + * **softirq-unsafe** and N -> ... -> A, however N -> .. -> M will not + * cause a inversion deadlock. */ - backward_mask = original_mask(target_entry1->class->usage_mask); + backward_mask = original_mask(target_entry1->class->usage_mask & LOCKF_ENABLED_IRQ_ALL); ret = find_usage_backwards(&this, backward_mask, &target_entry); if (bfs_error(ret)) { @@ -2718,7 +2831,7 @@ static inline bool usage_skip(struct lock_list *entry, void *mask) * <target> or not. If it can, <src> -> <target> dependency is already * in the graph. * - * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if + * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not, return BFS_E* if * any error appears in the bfs search. */ static noinline enum bfs_result @@ -4577,7 +4690,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) u8 curr_inner; int depth; - if (!curr->lockdep_depth || !next_inner || next->trylock) + if (!next_inner || next->trylock) return 0; if (!next_outer) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 013e1b08a1bf..d2df5e68b503 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -923,7 +923,7 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, * Lock a mutex (possibly interruptible), slowpath: */ static __always_inline int __sched -__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, +__mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { @@ -1098,14 +1098,14 @@ err_early_kill: } static int __sched -__mutex_lock(struct mutex *lock, long state, unsigned int subclass, +__mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip) { return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false); } static int __sched -__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass, +__ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx) { diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 406818196a9f..b5d9bb5202c6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1135,7 +1135,7 @@ void __sched rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) * * Must be called with lock->wait_lock held and interrupts disabled */ -static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, +static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state, struct hrtimer_sleeper *timeout, struct rt_mutex_waiter *waiter) { @@ -1190,7 +1190,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, /* * Slow path lock function: */ -static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state, +static int __sched rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state, struct hrtimer_sleeper *timeout, enum rtmutex_chainwalk chwalk) { diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 809b0016d344..16bfbb10c74d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -889,7 +889,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) * Wait for the read lock to be granted */ static struct rw_semaphore __sched * -rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state) +rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state) { long adjustment = -RWSEM_READER_BIAS; long rcnt = (count >> RWSEM_READER_SHIFT); diff --git a/kernel/module.c b/kernel/module.c index 7e78dfabca97..927d46cb8eb9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -266,9 +266,18 @@ static void module_assert_mutex_or_preempt(void) #endif } +#ifdef CONFIG_MODULE_SIG static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); module_param(sig_enforce, bool_enable_only, 0644); +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} +#else +#define sig_enforce false +#endif + /* * Export sig_enforce kernel cmdline parameter to allow other subsystems rely * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. @@ -279,11 +288,6 @@ bool is_module_sig_enforced(void) } EXPORT_SYMBOL(is_module_sig_enforced); -void set_module_sig_enforced(void) -{ - sig_enforce = true; -} - /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 421c35571797..142a58d124d9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3531,3 +3531,119 @@ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif + +#ifdef CONFIG_SMP +static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); +static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); + +/** + * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant + * spinning lock is not owned by any CPU. + * + * Context: Any context. + */ +void __printk_wait_on_cpu_lock(void) +{ + do { + cpu_relax(); + } while (atomic_read(&printk_cpulock_owner) != -1); +} +EXPORT_SYMBOL(__printk_wait_on_cpu_lock); + +/** + * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant + * spinning lock. + * + * If no processor has the lock, the calling processor takes the lock and + * becomes the owner. If the calling processor is already the owner of the + * lock, this function succeeds immediately. + * + * Context: Any context. Expects interrupts to be disabled. + * Return: 1 on success, otherwise 0. + */ +int __printk_cpu_trylock(void) +{ + int cpu; + int old; + + cpu = smp_processor_id(); + + /* + * Guarantee loads and stores from this CPU when it is the lock owner + * are _not_ visible to the previous lock owner. This pairs with + * __printk_cpu_unlock:B. + * + * Memory barrier involvement: + * + * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then + * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B. + * + * Relies on: + * + * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * of the previous CPU + * matching + * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B + * of this CPU + */ + old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1, + cpu); /* LMM(__printk_cpu_trylock:A) */ + if (old == -1) { + /* + * This CPU is now the owner and begins loading/storing + * data: LMM(__printk_cpu_trylock:B) + */ + return 1; + + } else if (old == cpu) { + /* This CPU is already the owner. */ + atomic_inc(&printk_cpulock_nested); + return 1; + } + + return 0; +} +EXPORT_SYMBOL(__printk_cpu_trylock); + +/** + * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock. + * + * The calling processor must be the owner of the lock. + * + * Context: Any context. Expects interrupts to be disabled. + */ +void __printk_cpu_unlock(void) +{ + if (atomic_read(&printk_cpulock_nested)) { + atomic_dec(&printk_cpulock_nested); + return; + } + + /* + * This CPU is finished loading/storing data: + * LMM(__printk_cpu_unlock:A) + */ + + /* + * Guarantee loads and stores from this CPU when it was the + * lock owner are visible to the next lock owner. This pairs + * with __printk_cpu_trylock:A. + * + * Memory barrier involvement: + * + * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, + * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A. + * + * Relies on: + * + * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * of this CPU + * matching + * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B + * of the next CPU + */ + atomic_set_release(&printk_cpulock_owner, + -1); /* LMM(__printk_cpu_unlock:B) */ +} +EXPORT_SYMBOL(__printk_cpu_unlock); +#endif /* CONFIG_SMP */ diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 7a1414622051..94232186fccb 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -391,6 +391,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) /* No obstacles. */ return vprintk_default(fmt, args); } +EXPORT_SYMBOL(vprintk); void __init printk_safe_init(void) { @@ -411,4 +412,3 @@ void __init printk_safe_init(void) /* Flush pending messages that did not have scheduled IRQ works. */ printk_safe_flush(); } -EXPORT_SYMBOL(vprintk); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2997ca600d18..f8589bf8d7dc 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -197,7 +197,7 @@ static bool ptrace_freeze_traced(struct task_struct *task) spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) { - task->state = __TASK_TRACED; + WRITE_ONCE(task->__state, __TASK_TRACED); ret = true; } spin_unlock_irq(&task->sighand->siglock); @@ -207,7 +207,7 @@ static bool ptrace_freeze_traced(struct task_struct *task) static void ptrace_unfreeze_traced(struct task_struct *task) { - if (task->state != __TASK_TRACED) + if (READ_ONCE(task->__state) != __TASK_TRACED) return; WARN_ON(!task->ptrace || task->parent != current); @@ -217,11 +217,11 @@ static void ptrace_unfreeze_traced(struct task_struct *task) * Recheck state under the lock to close this race. */ spin_lock_irq(&task->sighand->siglock); - if (task->state == __TASK_TRACED) { + if (READ_ONCE(task->__state) == __TASK_TRACED) { if (__fatal_signal_pending(task)) wake_up_state(task, __TASK_TRACED); else - task->state = TASK_TRACED; + WRITE_ONCE(task->__state, TASK_TRACED); } spin_unlock_irq(&task->sighand->siglock); } @@ -256,7 +256,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) */ read_lock(&tasklist_lock); if (child->ptrace && child->parent == current) { - WARN_ON(child->state == __TASK_TRACED); + WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED); /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). @@ -273,7 +273,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) * ptrace_stop() changes ->state back to TASK_RUNNING, * so we should not worry about leaking __TASK_TRACED. */ - WARN_ON(child->state == __TASK_TRACED); + WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED); ret = -ESRCH; } } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 29d2f4c647d3..194b9c145c40 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1831,10 +1831,10 @@ rcu_torture_stats_print(void) srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); wtp = READ_ONCE(writer_task); - pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n", + pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n", rcu_torture_writer_state_getname(), rcu_torture_writer_state, gp_seq, flags, - wtp == NULL ? ~0UL : wtp->state, + wtp == NULL ? ~0U : wtp->__state, wtp == NULL ? -1 : (int)task_cpu(wtp)); if (!splatted && wtp) { sched_show_task(wtp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ad0156b86937..4d6962048c30 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2768,7 +2768,7 @@ EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); #ifdef CONFIG_SMP static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) { - return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : ""; + return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : ""; } #else // #ifdef CONFIG_SMP static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 59b95cc5cbdf..acb2288063b5 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -460,12 +460,12 @@ static void rcu_check_gp_kthread_starvation(void) if (rcu_is_gp_kthread_starving(&j)) { cpu = gpk ? task_cpu(gpk) : -1; - pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), data_race(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->state : ~0, cpu); + gpk ? gpk->__state : ~0, cpu); if (gpk) { pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); @@ -503,12 +503,12 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void) time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) && gpk && !READ_ONCE(gpk->on_rq)) { cpu = task_cpu(gpk); - pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx\n", + pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n", rcu_state.name, (jiffies - jiffies_fqs), (long)rcu_seq_current(&rcu_state.gp_seq), data_race(rcu_state.gp_flags), gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS, - gpk->state); + gpk->__state); pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n", cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu)); } @@ -735,9 +735,9 @@ void show_rcu_gp_kthreads(void) ja = j - data_race(rcu_state.gp_activity); jr = j - data_race(rcu_state.gp_req_activity); jw = j - data_race(rcu_state.gp_wake_time); - pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", + pr_info("%s: wait state: %s(%d) ->state: %#x delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, t ? t->state : 0x1ffffL, + rcu_state.gp_state, t ? t->__state : 0x1ffff, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), (long)data_race(rcu_state.gp_seq), (long)data_race(rcu_get_root()->gp_seq_needed), diff --git a/kernel/reboot.c b/kernel/reboot.c index a6ad5eb2fa73..f7440c0c7e43 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) "reboot: " fmt +#include <linux/atomic.h> #include <linux/ctype.h> #include <linux/export.h> #include <linux/kexec.h> @@ -518,6 +519,84 @@ void orderly_reboot(void) } EXPORT_SYMBOL_GPL(orderly_reboot); +/** + * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay + * @work: work_struct associated with the emergency poweroff function + * + * This function is called in very critical situations to force + * a kernel poweroff after a configurable timeout value. + */ +static void hw_failure_emergency_poweroff_func(struct work_struct *work) +{ + /* + * We have reached here after the emergency shutdown waiting period has + * expired. This means orderly_poweroff has not been able to shut off + * the system for some reason. + * + * Try to shut down the system immediately using kernel_power_off + * if populated + */ + pr_emerg("Hardware protection timed-out. Trying forced poweroff\n"); + kernel_power_off(); + + /* + * Worst of the worst case trigger emergency restart + */ + pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n"); + emergency_restart(); +} + +static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, + hw_failure_emergency_poweroff_func); + +/** + * hw_failure_emergency_poweroff - Trigger an emergency system poweroff + * + * This may be called from any critical situation to trigger a system shutdown + * after a given period of time. If time is negative this is not scheduled. + */ +static void hw_failure_emergency_poweroff(int poweroff_delay_ms) +{ + if (poweroff_delay_ms <= 0) + return; + schedule_delayed_work(&hw_failure_emergency_poweroff_work, + msecs_to_jiffies(poweroff_delay_ms)); +} + +/** + * hw_protection_shutdown - Trigger an emergency system poweroff + * + * @reason: Reason of emergency shutdown to be printed. + * @ms_until_forced: Time to wait for orderly shutdown before tiggering a + * forced shudown. Negative value disables the forced + * shutdown. + * + * Initiate an emergency system shutdown in order to protect hardware from + * further damage. Usage examples include a thermal protection or a voltage or + * current regulator failures. + * NOTE: The request is ignored if protection shutdown is already pending even + * if the previous request has given a large timeout for forced shutdown. + * Can be called from any context. + */ +void hw_protection_shutdown(const char *reason, int ms_until_forced) +{ + static atomic_t allow_proceed = ATOMIC_INIT(1); + + pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason); + + /* Shutdown should be initiated only once. */ + if (!atomic_dec_and_test(&allow_proceed)) + return; + + /* + * Queue a backup emergency shutdown in the event of + * orderly_poweroff failure + */ + hw_failure_emergency_poweroff(ms_until_forced); + orderly_poweroff(true); +} +EXPORT_SYMBOL_GPL(hw_protection_shutdown); + static int __init reboot_setup(char *str) { for (;;) { diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5fc9c9b70862..978fcfca5871 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o +obj-$(CONFIG_SCHED_CORE) += core_sched.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5226cc26a095..cf16f8fda9a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -84,6 +84,272 @@ unsigned int sysctl_sched_rt_period = 1000000; __read_mostly int scheduler_running; +#ifdef CONFIG_SCHED_CORE + +DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); + +/* kernel prio, less is more */ +static inline int __task_prio(struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) /* trumps deadline */ + return -2; + + if (rt_prio(p->prio)) /* includes deadline */ + return p->prio; /* [-1, 99] */ + + if (p->sched_class == &idle_sched_class) + return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ + + return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ +} + +/* + * l(a,b) + * le(a,b) := !l(b,a) + * g(a,b) := l(b,a) + * ge(a,b) := !l(a,b) + */ + +/* real prio, less is less */ +static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +{ + + int pa = __task_prio(a), pb = __task_prio(b); + + if (-pa < -pb) + return true; + + if (-pb < -pa) + return false; + + if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ + return !dl_time_before(a->dl.deadline, b->dl.deadline); + + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ + return cfs_prio_less(a, b, in_fi); + + return false; +} + +static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b) +{ + if (a->core_cookie < b->core_cookie) + return true; + + if (a->core_cookie > b->core_cookie) + return false; + + /* flip prio, so high prio is leftmost */ + if (prio_less(b, a, task_rq(a)->core->core_forceidle)) + return true; + + return false; +} + +#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node) + +static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b) +{ + return __sched_core_less(__node_2_sc(a), __node_2_sc(b)); +} + +static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) +{ + const struct task_struct *p = __node_2_sc(node); + unsigned long cookie = (unsigned long)key; + + if (cookie < p->core_cookie) + return -1; + + if (cookie > p->core_cookie) + return 1; + + return 0; +} + +void sched_core_enqueue(struct rq *rq, struct task_struct *p) +{ + rq->core->core_task_seq++; + + if (!p->core_cookie) + return; + + rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); +} + +void sched_core_dequeue(struct rq *rq, struct task_struct *p) +{ + rq->core->core_task_seq++; + + if (!sched_core_enqueued(p)) + return; + + rb_erase(&p->core_node, &rq->core_tree); + RB_CLEAR_NODE(&p->core_node); +} + +/* + * Find left-most (aka, highest priority) task matching @cookie. + */ +static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) +{ + struct rb_node *node; + + node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); + /* + * The idle task always matches any cookie! + */ + if (!node) + return idle_sched_class.pick_task(rq); + + return __node_2_sc(node); +} + +static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie) +{ + struct rb_node *node = &p->core_node; + + node = rb_next(node); + if (!node) + return NULL; + + p = container_of(node, struct task_struct, core_node); + if (p->core_cookie != cookie) + return NULL; + + return p; +} + +/* + * Magic required such that: + * + * raw_spin_rq_lock(rq); + * ... + * raw_spin_rq_unlock(rq); + * + * ends up locking and unlocking the _same_ lock, and all CPUs + * always agree on what rq has what lock. + * + * XXX entirely possible to selectively enable cores, don't bother for now. + */ + +static DEFINE_MUTEX(sched_core_mutex); +static atomic_t sched_core_count; +static struct cpumask sched_core_mask; + +static void __sched_core_flip(bool enabled) +{ + int cpu, t, i; + + cpus_read_lock(); + + /* + * Toggle the online cores, one by one. + */ + cpumask_copy(&sched_core_mask, cpu_online_mask); + for_each_cpu(cpu, &sched_core_mask) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + + i = 0; + local_irq_disable(); + for_each_cpu(t, smt_mask) { + /* supports up to SMT8 */ + raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); + } + + for_each_cpu(t, smt_mask) + cpu_rq(t)->core_enabled = enabled; + + for_each_cpu(t, smt_mask) + raw_spin_unlock(&cpu_rq(t)->__lock); + local_irq_enable(); + + cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); + } + + /* + * Toggle the offline CPUs. + */ + cpumask_copy(&sched_core_mask, cpu_possible_mask); + cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask); + + for_each_cpu(cpu, &sched_core_mask) + cpu_rq(cpu)->core_enabled = enabled; + + cpus_read_unlock(); +} + +static void sched_core_assert_empty(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree)); +} + +static void __sched_core_enable(void) +{ + static_branch_enable(&__sched_core_enabled); + /* + * Ensure all previous instances of raw_spin_rq_*lock() have finished + * and future ones will observe !sched_core_disabled(). + */ + synchronize_rcu(); + __sched_core_flip(true); + sched_core_assert_empty(); +} + +static void __sched_core_disable(void) +{ + sched_core_assert_empty(); + __sched_core_flip(false); + static_branch_disable(&__sched_core_enabled); +} + +void sched_core_get(void) +{ + if (atomic_inc_not_zero(&sched_core_count)) + return; + + mutex_lock(&sched_core_mutex); + if (!atomic_read(&sched_core_count)) + __sched_core_enable(); + + smp_mb__before_atomic(); + atomic_inc(&sched_core_count); + mutex_unlock(&sched_core_mutex); +} + +static void __sched_core_put(struct work_struct *work) +{ + if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) { + __sched_core_disable(); + mutex_unlock(&sched_core_mutex); + } +} + +void sched_core_put(void) +{ + static DECLARE_WORK(_work, __sched_core_put); + + /* + * "There can be only one" + * + * Either this is the last one, or we don't actually need to do any + * 'work'. If it is the last *again*, we rely on + * WORK_STRUCT_PENDING_BIT. + */ + if (!atomic_add_unless(&sched_core_count, -1, 1)) + schedule_work(&_work); +} + +#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } +static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_CORE */ + /* * part of the period that we allow rt tasks to run in us. * default: 0.95s @@ -184,6 +450,79 @@ int sysctl_sched_rt_runtime = 950000; * */ +void raw_spin_rq_lock_nested(struct rq *rq, int subclass) +{ + raw_spinlock_t *lock; + + /* Matches synchronize_rcu() in __sched_core_enable() */ + preempt_disable(); + if (sched_core_disabled()) { + raw_spin_lock_nested(&rq->__lock, subclass); + /* preempt_count *MUST* be > 1 */ + preempt_enable_no_resched(); + return; + } + + for (;;) { + lock = __rq_lockp(rq); + raw_spin_lock_nested(lock, subclass); + if (likely(lock == __rq_lockp(rq))) { + /* preempt_count *MUST* be > 1 */ + preempt_enable_no_resched(); + return; + } + raw_spin_unlock(lock); + } +} + +bool raw_spin_rq_trylock(struct rq *rq) +{ + raw_spinlock_t *lock; + bool ret; + + /* Matches synchronize_rcu() in __sched_core_enable() */ + preempt_disable(); + if (sched_core_disabled()) { + ret = raw_spin_trylock(&rq->__lock); + preempt_enable(); + return ret; + } + + for (;;) { + lock = __rq_lockp(rq); + ret = raw_spin_trylock(lock); + if (!ret || (likely(lock == __rq_lockp(rq)))) { + preempt_enable(); + return ret; + } + raw_spin_unlock(lock); + } +} + +void raw_spin_rq_unlock(struct rq *rq) +{ + raw_spin_unlock(rq_lockp(rq)); +} + +#ifdef CONFIG_SMP +/* + * double_rq_lock - safely lock two runqueues + */ +void double_rq_lock(struct rq *rq1, struct rq *rq2) +{ + lockdep_assert_irqs_disabled(); + + if (rq_order_less(rq2, rq1)) + swap(rq1, rq2); + + raw_spin_rq_lock(rq1); + if (__rq_lockp(rq1) == __rq_lockp(rq2)) + return; + + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); +} +#endif + /* * __task_rq_lock - lock the rq @p resides on. */ @@ -196,12 +535,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) for (;;) { rq = task_rq(p); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { rq_pin_lock(rq, rf); return rq; } - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); while (unlikely(task_on_rq_migrating(p))) cpu_relax(); @@ -220,7 +559,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) for (;;) { raw_spin_lock_irqsave(&p->pi_lock, rf->flags); rq = task_rq(p); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); /* * move_queued_task() task_rq_lock() * @@ -242,7 +581,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) rq_pin_lock(rq, rf); return rq; } - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); while (unlikely(task_on_rq_migrating(p))) @@ -312,7 +651,7 @@ void update_rq_clock(struct rq *rq) { s64 delta; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (rq->clock_update_flags & RQCF_ACT_SKIP) return; @@ -585,7 +924,6 @@ void wake_up_q(struct wake_q_head *head) struct task_struct *task; task = container_of(node, struct task_struct, wake_q); - BUG_ON(!task); /* Task can safely be re-inserted now: */ node = node->next; task->wake_q.next = NULL; @@ -611,7 +949,7 @@ void resched_curr(struct rq *rq) struct task_struct *curr = rq->curr; int cpu; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (test_tsk_need_resched(curr)) return; @@ -635,10 +973,10 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); if (cpu_online(cpu) || cpu == smp_processor_id()) resched_curr(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); } #ifdef CONFIG_SMP @@ -1065,9 +1403,10 @@ static void uclamp_sync_util_min_rt_default(void) static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { + /* Copy by value as we could modify it */ struct uclamp_se uc_req = p->uclamp_req[clamp_id]; #ifdef CONFIG_UCLAMP_TASK_GROUP - struct uclamp_se uc_max; + unsigned int tg_min, tg_max, value; /* * Tasks in autogroups or root task group will be @@ -1078,9 +1417,11 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) if (task_group(p) == &root_task_group) return uc_req; - uc_max = task_group(p)->uclamp[clamp_id]; - if (uc_req.value > uc_max.value || !uc_req.user_defined) - return uc_max; + tg_min = task_group(p)->uclamp[UCLAMP_MIN].value; + tg_max = task_group(p)->uclamp[UCLAMP_MAX].value; + value = uc_req.value; + value = clamp(value, tg_min, tg_max); + uclamp_se_set(&uc_req, value, false); #endif return uc_req; @@ -1137,7 +1478,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, struct uclamp_se *uc_se = &p->uclamp[clamp_id]; struct uclamp_bucket *bucket; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); /* Update task effective clamp */ p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); @@ -1177,7 +1518,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, unsigned int bkt_clamp; unsigned int rq_clamp; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); /* * If sched_uclamp_used was enabled after task @p was enqueued, @@ -1279,8 +1620,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) } static inline void -uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) +uclamp_update_active(struct task_struct *p) { + enum uclamp_id clamp_id; struct rq_flags rf; struct rq *rq; @@ -1300,9 +1642,11 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); + for_each_clamp_id(clamp_id) { + if (p->uclamp[clamp_id].active) { + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + } } task_rq_unlock(rq, p, &rf); @@ -1310,20 +1654,14 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) #ifdef CONFIG_UCLAMP_TASK_GROUP static inline void -uclamp_update_active_tasks(struct cgroup_subsys_state *css, - unsigned int clamps) +uclamp_update_active_tasks(struct cgroup_subsys_state *css) { - enum uclamp_id clamp_id; struct css_task_iter it; struct task_struct *p; css_task_iter_start(css, 0, &it); - while ((p = css_task_iter_next(&it))) { - for_each_clamp_id(clamp_id) { - if ((0x1 << clamp_id) & clamps) - uclamp_update_active(p, clamp_id); - } - } + while ((p = css_task_iter_next(&it))) + uclamp_update_active(p); css_task_iter_end(&it); } @@ -1590,27 +1928,38 @@ static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ +bool sched_task_on_rq(struct task_struct *p) +{ + return task_on_rq_queued(p); +} + static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); if (!(flags & ENQUEUE_RESTORE)) { - sched_info_queued(rq, p); + sched_info_enqueue(rq, p); psi_enqueue(p, flags & ENQUEUE_WAKEUP); } uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + + if (sched_core_enabled(rq)) + sched_core_enqueue(rq, p); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { + if (sched_core_enabled(rq)) + sched_core_dequeue(rq, p); + if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); if (!(flags & DEQUEUE_SAVE)) { - sched_info_dequeued(rq, p); + sched_info_dequeue(rq, p); psi_dequeue(p, flags & DEQUEUE_SLEEP); } @@ -1850,7 +2199,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int new_cpu) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); @@ -1916,7 +2265,6 @@ static int migration_cpu_stop(void *data) struct migration_arg *arg = data; struct set_affinity_pending *pending = arg->pending; struct task_struct *p = arg->task; - int dest_cpu = arg->dest_cpu; struct rq *rq = this_rq(); bool complete = false; struct rq_flags rf; @@ -1954,19 +2302,15 @@ static int migration_cpu_stop(void *data) if (pending) { p->migration_pending = NULL; complete = true; - } - if (dest_cpu < 0) { if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) goto out; - - dest_cpu = cpumask_any_distribute(&p->cpus_mask); } if (task_on_rq_queued(p)) - rq = __migrate_task(rq, &rf, p, dest_cpu); + rq = __migrate_task(rq, &rf, p, arg->dest_cpu); else - p->wake_cpu = dest_cpu; + p->wake_cpu = arg->dest_cpu; /* * XXX __migrate_task() can fail, at which point we might end @@ -2024,7 +2368,7 @@ int push_cpu_stop(void *arg) struct task_struct *p = arg; raw_spin_lock_irq(&p->pi_lock); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); if (task_rq(p) != rq) goto out_unlock; @@ -2054,7 +2398,7 @@ int push_cpu_stop(void *arg) out_unlock: rq->push_busy = false; - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); @@ -2107,7 +2451,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 * Because __kthread_bind() calls this on blocked tasks without * holding rq->lock. */ - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); } if (running) @@ -2249,7 +2593,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag init_completion(&my_pending.done); my_pending.arg = (struct migration_arg) { .task = p, - .dest_cpu = -1, /* any */ + .dest_cpu = dest_cpu, .pending = &my_pending, }; @@ -2257,6 +2601,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } else { pending = p->migration_pending; refcount_inc(&pending->refs); + /* + * Affinity has changed, but we've already installed a + * pending. migration_cpu_stop() *must* see this, else + * we risk a completion of the pending despite having a + * task on a disallowed CPU. + * + * Serialized by p->pi_lock, so this is safe. + */ + pending->arg.dest_cpu = dest_cpu; } } pending = p->migration_pending; @@ -2277,7 +2630,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag return -EINVAL; } - if (task_running(rq, p) || p->state == TASK_WAKING) { + if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { /* * MIGRATE_ENABLE gets here because 'p == current', but for * anything else we cannot do is_migration_disabled(), punt @@ -2420,19 +2773,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG + unsigned int state = READ_ONCE(p->__state); + /* * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. */ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !p->on_rq); + WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq); /* * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, * because schedstat_wait_{start,end} rebase migrating task's wait_start * time relying on p->on_rq. */ - WARN_ON_ONCE(p->state == TASK_RUNNING && + WARN_ON_ONCE(state == TASK_RUNNING && p->sched_class == &fair_sched_class && (p->on_rq && !task_on_rq_migrating(p))); @@ -2448,7 +2802,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * task_rq_lock(). */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock))); + lockdep_is_held(__rq_lockp(task_rq(p))))); #endif /* * Clearly, migrating tasks to offline CPUs is a fairly daft thing. @@ -2604,7 +2958,7 @@ out: * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) +unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) { int running, queued; struct rq_flags rf; @@ -2632,7 +2986,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * is actually now running somewhere else! */ while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) + if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) return 0; cpu_relax(); } @@ -2647,7 +3001,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; - if (!match_state || p->state == match_state) + if (!match_state || READ_ONCE(p->__state) == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); @@ -2956,7 +3310,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) { check_preempt_curr(rq, p, wake_flags); - p->state = TASK_RUNNING; + WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); #ifdef CONFIG_SMP @@ -2979,6 +3333,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, if (rq->avg_idle > max) rq->avg_idle = max; + rq->wake_stamp = jiffies; + rq->wake_avg_idle = rq->avg_idle / 2; + rq->idle_stamp = 0; } #endif @@ -2990,7 +3347,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, { int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (p->sched_contributes_to_load) rq->nr_uninterruptible--; @@ -3345,12 +3702,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - if (!(p->state & state)) + if (!(READ_ONCE(p->__state) & state)) goto out; success = 1; trace_sched_waking(p); - p->state = TASK_RUNNING; + WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); goto out; } @@ -3363,7 +3720,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); - if (!(p->state & state)) + if (!(READ_ONCE(p->__state) & state)) goto unlock; trace_sched_waking(p); @@ -3429,7 +3786,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * TASK_WAKING such that we can unlock p->pi_lock before doing the * enqueue, such as ttwu_queue_wakelist(). */ - p->state = TASK_WAKING; + WRITE_ONCE(p->__state, TASK_WAKING); /* * If the owning (remote) CPU is still in the middle of schedule() with @@ -3522,7 +3879,7 @@ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct t ret = func(p, arg); rq_unlock(rq, &rf); } else { - switch (p->state) { + switch (READ_ONCE(p->__state)) { case TASK_RUNNING: case TASK_WAKING: break; @@ -3648,7 +4005,6 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #ifdef CONFIG_SCHEDSTATS DEFINE_STATIC_KEY_FALSE(sched_schedstats); -static bool __initdata __sched_schedstats = false; static void set_schedstats(bool enabled) { @@ -3672,16 +4028,11 @@ static int __init setup_schedstats(char *str) if (!str) goto out; - /* - * This code is called before jump labels have been set up, so we can't - * change the static branch directly just yet. Instead set a temporary - * variable so init_schedstats() can do it later. - */ if (!strcmp(str, "enable")) { - __sched_schedstats = true; + set_schedstats(true); ret = 1; } else if (!strcmp(str, "disable")) { - __sched_schedstats = false; + set_schedstats(false); ret = 1; } out: @@ -3692,11 +4043,6 @@ out: } __setup("schedstats=", setup_schedstats); -static void __init init_schedstats(void) -{ - set_schedstats(__sched_schedstats); -} - #ifdef CONFIG_PROC_SYSCTL int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -3718,8 +4064,6 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, return err; } #endif /* CONFIG_PROC_SYSCTL */ -#else /* !CONFIG_SCHEDSTATS */ -static inline void init_schedstats(void) {} #endif /* CONFIG_SCHEDSTATS */ /* @@ -3735,7 +4079,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_NEW; + p->__state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -3841,7 +4185,7 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - p->state = TASK_RUNNING; + WRITE_ONCE(p->__state, TASK_RUNNING); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -4001,7 +4345,7 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head) void (*func)(struct rq *rq); struct callback_head *next; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); while (head) { func = (void (*)(struct rq *))head->func; @@ -4024,7 +4368,7 @@ static inline struct callback_head *splice_balance_callbacks(struct rq *rq) { struct callback_head *head = rq->balance_callback; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (head) rq->balance_callback = NULL; @@ -4041,9 +4385,9 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head) unsigned long flags; if (unlikely(head)) { - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); do_balance_callbacks(rq, head); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); } } @@ -4074,10 +4418,10 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf * do an early lockdep release here: */ rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, _THIS_IP_); + spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = next; + rq_lockp(rq)->owner = next; #endif } @@ -4088,9 +4432,9 @@ static inline void finish_lock_switch(struct rq *rq) * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); __balance_callbacks(rq); - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); } /* @@ -4203,10 +4547,11 @@ static struct rq *finish_task_switch(struct task_struct *prev) * running on another CPU and we could rave with its RUNNING -> DEAD * transition, resulting in a double drop. */ - prev_state = prev->state; + prev_state = READ_ONCE(prev->__state); vtime_task_switch(prev); perf_event_task_sched_in(prev, current); finish_task(prev); + tick_nohz_task_switch(); finish_lock_switch(rq); finish_arch_post_lock_switch(); kcov_finish_switch(current); @@ -4252,7 +4597,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) put_task_struct_rcu_user(prev); } - tick_nohz_task_switch(); return rq; } @@ -4348,9 +4692,9 @@ context_switch(struct rq *rq, struct task_struct *prev, * externally visible scheduler statistics: current number of runnable * threads, total number of context switches performed since bootup. */ -unsigned long nr_running(void) +unsigned int nr_running(void) { - unsigned long i, sum = 0; + unsigned int i, sum = 0; for_each_online_cpu(i) sum += cpu_rq(i)->nr_running; @@ -4395,7 +4739,7 @@ unsigned long long nr_context_switches(void) * it does become runnable. */ -unsigned long nr_iowait_cpu(int cpu) +unsigned int nr_iowait_cpu(int cpu) { return atomic_read(&cpu_rq(cpu)->nr_iowait); } @@ -4430,9 +4774,9 @@ unsigned long nr_iowait_cpu(int cpu) * Task CPU affinities can make all that even more 'interesting'. */ -unsigned long nr_iowait(void) +unsigned int nr_iowait(void) { - unsigned long i, sum = 0; + unsigned int i, sum = 0; for_each_possible_cpu(i) sum += nr_iowait_cpu(i); @@ -4897,7 +5241,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP - if (!preempt && prev->state && prev->non_block_count) { + if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) { printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", prev->comm, prev->pid, prev->non_block_count); dump_stack(); @@ -4943,7 +5287,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; @@ -4961,7 +5305,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (unlikely(p == RETRY_TASK)) goto restart; - /* Assumes fair_sched_class->next == idle_sched_class */ + /* Assume the next prioritized class is idle_sched_class */ if (!p) { put_prev_task(rq, prev); p = pick_next_task_idle(rq); @@ -4983,6 +5327,455 @@ restart: BUG(); } +#ifdef CONFIG_SCHED_CORE +static inline bool is_task_rq_idle(struct task_struct *t) +{ + return (task_rq(t)->idle == t); +} + +static inline bool cookie_equals(struct task_struct *a, unsigned long cookie) +{ + return is_task_rq_idle(a) || (a->core_cookie == cookie); +} + +static inline bool cookie_match(struct task_struct *a, struct task_struct *b) +{ + if (is_task_rq_idle(a) || is_task_rq_idle(b)) + return true; + + return a->core_cookie == b->core_cookie; +} + +// XXX fairness/fwd progress conditions +/* + * Returns + * - NULL if there is no runnable task for this class. + * - the highest priority task for this runqueue if it matches + * rq->core->core_cookie or its priority is greater than max. + * - Else returns idle_task. + */ +static struct task_struct * +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi) +{ + struct task_struct *class_pick, *cookie_pick; + unsigned long cookie = rq->core->core_cookie; + + class_pick = class->pick_task(rq); + if (!class_pick) + return NULL; + + if (!cookie) { + /* + * If class_pick is tagged, return it only if it has + * higher priority than max. + */ + if (max && class_pick->core_cookie && + prio_less(class_pick, max, in_fi)) + return idle_sched_class.pick_task(rq); + + return class_pick; + } + + /* + * If class_pick is idle or matches cookie, return early. + */ + if (cookie_equals(class_pick, cookie)) + return class_pick; + + cookie_pick = sched_core_find(rq, cookie); + + /* + * If class > max && class > cookie, it is the highest priority task on + * the core (so far) and it must be selected, otherwise we must go with + * the cookie pick in order to satisfy the constraint. + */ + if (prio_less(cookie_pick, class_pick, in_fi) && + (!max || prio_less(max, class_pick, in_fi))) + return class_pick; + + return cookie_pick; +} + +extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); + +static struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + struct task_struct *next, *max = NULL; + const struct sched_class *class; + const struct cpumask *smt_mask; + bool fi_before = false; + int i, j, cpu, occ = 0; + bool need_sync; + + if (!sched_core_enabled(rq)) + return __pick_next_task(rq, prev, rf); + + cpu = cpu_of(rq); + + /* Stopper task is switching into idle, no need core-wide selection. */ + if (cpu_is_offline(cpu)) { + /* + * Reset core_pick so that we don't enter the fastpath when + * coming online. core_pick would already be migrated to + * another cpu during offline. + */ + rq->core_pick = NULL; + return __pick_next_task(rq, prev, rf); + } + + /* + * If there were no {en,de}queues since we picked (IOW, the task + * pointers are all still valid), and we haven't scheduled the last + * pick yet, do so now. + * + * rq->core_pick can be NULL if no selection was made for a CPU because + * it was either offline or went offline during a sibling's core-wide + * selection. In this case, do a core-wide selection. + */ + if (rq->core->core_pick_seq == rq->core->core_task_seq && + rq->core->core_pick_seq != rq->core_sched_seq && + rq->core_pick) { + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); + + next = rq->core_pick; + if (next != prev) { + put_prev_task(rq, prev); + set_next_task(rq, next); + } + + rq->core_pick = NULL; + return next; + } + + put_prev_task_balance(rq, prev, rf); + + smt_mask = cpu_smt_mask(cpu); + need_sync = !!rq->core->core_cookie; + + /* reset state */ + rq->core->core_cookie = 0UL; + if (rq->core->core_forceidle) { + need_sync = true; + fi_before = true; + rq->core->core_forceidle = false; + } + + /* + * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq + * + * @task_seq guards the task state ({en,de}queues) + * @pick_seq is the @task_seq we did a selection on + * @sched_seq is the @pick_seq we scheduled + * + * However, preemptions can cause multiple picks on the same task set. + * 'Fix' this by also increasing @task_seq for every pick. + */ + rq->core->core_task_seq++; + + /* + * Optimize for common case where this CPU has no cookies + * and there are no cookied tasks running on siblings. + */ + if (!need_sync) { + for_each_class(class) { + next = class->pick_task(rq); + if (next) + break; + } + + if (!next->core_cookie) { + rq->core_pick = NULL; + /* + * For robustness, update the min_vruntime_fi for + * unconstrained picks as well. + */ + WARN_ON_ONCE(fi_before); + task_vruntime_update(rq, next, false); + goto done; + } + } + + for_each_cpu(i, smt_mask) { + struct rq *rq_i = cpu_rq(i); + + rq_i->core_pick = NULL; + + if (i != cpu) + update_rq_clock(rq_i); + } + + /* + * Try and select tasks for each sibling in descending sched_class + * order. + */ + for_each_class(class) { +again: + for_each_cpu_wrap(i, smt_mask, cpu) { + struct rq *rq_i = cpu_rq(i); + struct task_struct *p; + + if (rq_i->core_pick) + continue; + + /* + * If this sibling doesn't yet have a suitable task to + * run; ask for the most eligible task, given the + * highest priority task already selected for this + * core. + */ + p = pick_task(rq_i, class, max, fi_before); + if (!p) + continue; + + if (!is_task_rq_idle(p)) + occ++; + + rq_i->core_pick = p; + if (rq_i->idle == p && rq_i->nr_running) { + rq->core->core_forceidle = true; + if (!fi_before) + rq->core->core_forceidle_seq++; + } + + /* + * If this new candidate is of higher priority than the + * previous; and they're incompatible; we need to wipe + * the slate and start over. pick_task makes sure that + * p's priority is more than max if it doesn't match + * max's cookie. + * + * NOTE: this is a linear max-filter and is thus bounded + * in execution time. + */ + if (!max || !cookie_match(max, p)) { + struct task_struct *old_max = max; + + rq->core->core_cookie = p->core_cookie; + max = p; + + if (old_max) { + rq->core->core_forceidle = false; + for_each_cpu(j, smt_mask) { + if (j == i) + continue; + + cpu_rq(j)->core_pick = NULL; + } + occ = 1; + goto again; + } + } + } + } + + rq->core->core_pick_seq = rq->core->core_task_seq; + next = rq->core_pick; + rq->core_sched_seq = rq->core->core_pick_seq; + + /* Something should have been selected for current CPU */ + WARN_ON_ONCE(!next); + + /* + * Reschedule siblings + * + * NOTE: L1TF -- at this point we're no longer running the old task and + * sending an IPI (below) ensures the sibling will no longer be running + * their task. This ensures there is no inter-sibling overlap between + * non-matching user state. + */ + for_each_cpu(i, smt_mask) { + struct rq *rq_i = cpu_rq(i); + + /* + * An online sibling might have gone offline before a task + * could be picked for it, or it might be offline but later + * happen to come online, but its too late and nothing was + * picked for it. That's Ok - it will pick tasks for itself, + * so ignore it. + */ + if (!rq_i->core_pick) + continue; + + /* + * Update for new !FI->FI transitions, or if continuing to be in !FI: + * fi_before fi update? + * 0 0 1 + * 0 1 1 + * 1 0 1 + * 1 1 0 + */ + if (!(fi_before && rq->core->core_forceidle)) + task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle); + + rq_i->core_pick->core_occupation = occ; + + if (i == cpu) { + rq_i->core_pick = NULL; + continue; + } + + /* Did we break L1TF mitigation requirements? */ + WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick)); + + if (rq_i->curr == rq_i->core_pick) { + rq_i->core_pick = NULL; + continue; + } + + resched_curr(rq_i); + } + +done: + set_next_task(rq, next); + return next; +} + +static bool try_steal_cookie(int this, int that) +{ + struct rq *dst = cpu_rq(this), *src = cpu_rq(that); + struct task_struct *p; + unsigned long cookie; + bool success = false; + + local_irq_disable(); + double_rq_lock(dst, src); + + cookie = dst->core->core_cookie; + if (!cookie) + goto unlock; + + if (dst->curr != dst->idle) + goto unlock; + + p = sched_core_find(src, cookie); + if (p == src->idle) + goto unlock; + + do { + if (p == src->core_pick || p == src->curr) + goto next; + + if (!cpumask_test_cpu(this, &p->cpus_mask)) + goto next; + + if (p->core_occupation > dst->idle->core_occupation) + goto next; + + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(src, p, 0); + set_task_cpu(p, this); + activate_task(dst, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + + resched_curr(dst); + + success = true; + break; + +next: + p = sched_core_next(p, cookie); + } while (p); + +unlock: + double_rq_unlock(dst, src); + local_irq_enable(); + + return success; +} + +static bool steal_cookie_task(int cpu, struct sched_domain *sd) +{ + int i; + + for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { + if (i == cpu) + continue; + + if (need_resched()) + break; + + if (try_steal_cookie(cpu, i)) + return true; + } + + return false; +} + +static void sched_core_balance(struct rq *rq) +{ + struct sched_domain *sd; + int cpu = cpu_of(rq); + + preempt_disable(); + rcu_read_lock(); + raw_spin_rq_unlock_irq(rq); + for_each_domain(cpu, sd) { + if (need_resched()) + break; + + if (steal_cookie_task(cpu, sd)) + break; + } + raw_spin_rq_lock_irq(rq); + rcu_read_unlock(); + preempt_enable(); +} + +static DEFINE_PER_CPU(struct callback_head, core_balance_head); + +void queue_core_balance(struct rq *rq) +{ + if (!sched_core_enabled(rq)) + return; + + if (!rq->core->core_cookie) + return; + + if (!rq->nr_running) /* not forced idle */ + return; + + queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); +} + +static inline void sched_core_cpu_starting(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq, *core_rq = NULL; + int i; + + core_rq = cpu_rq(cpu)->core; + + if (!core_rq) { + for_each_cpu(i, smt_mask) { + rq = cpu_rq(i); + if (rq->core && rq->core == rq) + core_rq = rq; + } + + if (!core_rq) + core_rq = cpu_rq(cpu); + + for_each_cpu(i, smt_mask) { + rq = cpu_rq(i); + + WARN_ON_ONCE(rq->core && rq->core != core_rq); + rq->core = core_rq; + } + } +} +#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_cpu_starting(unsigned int cpu) {} + +static struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return __pick_next_task(rq, prev, rf); +} + +#endif /* CONFIG_SCHED_CORE */ + /* * __schedule() is the main scheduler function. * @@ -5074,10 +5867,10 @@ static void __sched notrace __schedule(bool preempt) * - we form a control dependency vs deactivate_task() below. * - ptrace_{,un}freeze_traced() can change ->state underneath us. */ - prev_state = prev->state; + prev_state = READ_ONCE(prev->__state); if (!preempt && prev_state) { if (signal_pending_state(prev_state, prev)) { - prev->state = TASK_RUNNING; + WRITE_ONCE(prev->__state, TASK_RUNNING); } else { prev->sched_contributes_to_load = (prev_state & TASK_UNINTERRUPTIBLE) && @@ -5150,7 +5943,7 @@ static void __sched notrace __schedule(bool preempt) rq_unpin_lock(rq, &rf); __balance_callbacks(rq); - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); } } @@ -5174,7 +5967,7 @@ static inline void sched_submit_work(struct task_struct *tsk) { unsigned int task_flags; - if (!tsk->state) + if (task_is_running(tsk)) return; task_flags = tsk->flags; @@ -5249,7 +6042,7 @@ void __sched schedule_idle(void) * current task can be in any other state. Note, idle is always in the * TASK_RUNNING state. */ - WARN_ON_ONCE(current->state); + WARN_ON_ONCE(current->__state); do { __schedule(false); } while (need_resched()); @@ -5692,7 +6485,7 @@ out_unlock: rq_unpin_lock(rq, &rf); __balance_callbacks(rq); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); preempt_enable(); } @@ -6389,7 +7182,6 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } -EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. @@ -7149,7 +7941,7 @@ again: if (curr->sched_class != p->sched_class) goto out_unlock; - if (task_running(p_rq, p) || p->state) + if (task_running(p_rq, p) || !task_is_running(p)) goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p); @@ -7352,7 +8144,7 @@ void sched_show_task(struct task_struct *p) pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); - if (p->state == TASK_RUNNING) + if (task_is_running(p)) pr_cont(" running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); @@ -7376,26 +8168,28 @@ EXPORT_SYMBOL_GPL(sched_show_task); static inline bool state_filter_match(unsigned long state_filter, struct task_struct *p) { + unsigned int state = READ_ONCE(p->__state); + /* no filter, everything matches */ if (!state_filter) return true; /* filter, but doesn't match */ - if (!(p->state & state_filter)) + if (!(state & state_filter)) return false; /* * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows * TASK_KILLABLE). */ - if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) + if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE) return false; return true; } -void show_state_filter(unsigned long state_filter) +void show_state_filter(unsigned int state_filter) { struct task_struct *g, *p; @@ -7434,19 +8228,32 @@ void show_state_filter(unsigned long state_filter) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void init_idle(struct task_struct *idle, int cpu) +void __init init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; __sched_fork(0, idle); + /* + * The idle task doesn't need the kthread struct to function, but it + * is dressed up as a per-CPU kthread and thus needs to play the part + * if we want to avoid special-casing it in code that deals with per-CPU + * kthreads. + */ + set_kthread_struct(idle); + raw_spin_lock_irqsave(&idle->pi_lock, flags); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); - idle->state = TASK_RUNNING; + idle->__state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - idle->flags |= PF_IDLE; + /* + * PF_KTHREAD should already be set at this point; regardless, make it + * look like a proper per-CPU kthread. + */ + idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; + kthread_set_per_cpu(idle, cpu); scs_task_reset(idle); kasan_unpoison_task_stack(idle); @@ -7480,7 +8287,7 @@ void init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP idle->on_cpu = 1; #endif - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -7646,7 +8453,7 @@ static void balance_push(struct rq *rq) { struct task_struct *push_task = rq->curr; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); SCHED_WARN_ON(rq->cpu != smp_processor_id()); /* @@ -7663,12 +8470,8 @@ static void balance_push(struct rq *rq) /* * Both the cpu-hotplug and stop task are in this case and are * required to complete the hotplug process. - * - * XXX: the idle task does not match kthread_is_per_cpu() due to - * histerical raisins. */ - if (rq->idle == push_task || - kthread_is_per_cpu(push_task) || + if (kthread_is_per_cpu(push_task) || is_migration_disabled(push_task)) { /* @@ -7684,9 +8487,9 @@ static void balance_push(struct rq *rq) */ if (!rq->nr_running && !rq_has_pinned_tasks(rq) && rcuwait_active(&rq->hotplug_wait)) { - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); rcuwait_wake_up(&rq->hotplug_wait); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); } return; } @@ -7696,7 +8499,7 @@ static void balance_push(struct rq *rq) * Temporarily drop rq->lock such that we can wake-up the stop task. * Both preemption and IRQs are still disabled. */ - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, this_cpu_ptr(&push_work)); /* @@ -7704,7 +8507,7 @@ static void balance_push(struct rq *rq) * schedule(). The next pick is obviously going to be the stop task * which kthread_is_per_cpu() and will push this task away. */ - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); } static void balance_push_set(int cpu, bool on) @@ -7948,6 +8751,7 @@ static void sched_rq_cpu_starting(unsigned int cpu) int sched_cpu_starting(unsigned int cpu) { + sched_core_cpu_starting(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); return 0; @@ -7994,7 +8798,7 @@ static void dump_rq_tasks(struct rq *rq, const char *loglvl) struct task_struct *g, *p; int cpu = cpu_of(rq); - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); for_each_process_thread(g, p) { @@ -8046,6 +8850,7 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); + current->flags &= ~PF_NO_SETAFFINITY; sched_init_granularity(); init_sched_rt_class(); @@ -8167,7 +8972,7 @@ void __init sched_init(void) struct rq *rq; rq = cpu_rq(i); - raw_spin_lock_init(&rq->lock); + raw_spin_lock_init(&rq->__lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; @@ -8215,6 +9020,8 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + rq->wake_stamp = jiffies; + rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -8232,6 +9039,16 @@ void __init sched_init(void) #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); + +#ifdef CONFIG_SCHED_CORE + rq->core = NULL; + rq->core_pick = NULL; + rq->core_enabled = 0; + rq->core_tree = RB_ROOT; + rq->core_forceidle = false; + + rq->core_cookie = 0UL; +#endif } set_load_weight(&init_task, false); @@ -8258,8 +9075,6 @@ void __init sched_init(void) #endif init_sched_fair_class(); - init_schedstats(); - psi_init(); init_uclamp(); @@ -8277,15 +9092,15 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { + unsigned int state = get_current_state(); /* * Blocking primitives will set (and therefore destroy) current->state, * since we will exit with TASK_RUNNING make sure we enter with it, * otherwise we will destroy state. */ - WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, + WARN_ONCE(state != TASK_RUNNING && current->task_state_change, "do not call blocking ops when !TASK_RUNNING; " - "state=%lx set at [<%p>] %pS\n", - current->state, + "state=%x set at [<%p>] %pS\n", state, (void *)current->task_state_change, (void *)current->task_state_change); @@ -8681,7 +9496,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new group */ + mutex_lock(&uclamp_mutex); + rcu_read_lock(); cpu_util_update_eff(css); + rcu_read_unlock(); + mutex_unlock(&uclamp_mutex); #endif return 0; @@ -8742,7 +9561,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) * has happened. This would lead to problems with PELT, due to * move wanting to detach+attach while we're not attached yet. */ - if (task->state == TASK_NEW) + if (READ_ONCE(task->__state) == TASK_NEW) ret = -EINVAL; raw_spin_unlock_irq(&task->pi_lock); @@ -8771,6 +9590,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) enum uclamp_id clamp_id; unsigned int clamps; + lockdep_assert_held(&uclamp_mutex); + SCHED_WARN_ON(!rcu_read_lock_held()); + css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL; @@ -8803,7 +9625,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) } /* Immediately update descendants RUNNABLE tasks */ - uclamp_update_active_tasks(css, clamps); + uclamp_update_active_tasks(css); } } @@ -8962,7 +9784,8 @@ static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); -static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, + u64 burst) { int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; @@ -8992,6 +9815,10 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (quota != RUNTIME_INF && quota > max_cfs_runtime) return -EINVAL; + if (quota != RUNTIME_INF && (burst > quota || + burst + quota > max_cfs_runtime)) + return -EINVAL; + /* * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). @@ -9013,6 +9840,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; + cfs_b->burst = burst; __refill_cfs_bandwidth_runtime(cfs_b); @@ -9046,9 +9874,10 @@ out_unlock: static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { - u64 quota, period; + u64 quota, period, burst; period = ktime_to_ns(tg->cfs_bandwidth.period); + burst = tg->cfs_bandwidth.burst; if (cfs_quota_us < 0) quota = RUNTIME_INF; else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) @@ -9056,7 +9885,7 @@ static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) else return -EINVAL; - return tg_set_cfs_bandwidth(tg, period, quota); + return tg_set_cfs_bandwidth(tg, period, quota, burst); } static long tg_get_cfs_quota(struct task_group *tg) @@ -9074,15 +9903,16 @@ static long tg_get_cfs_quota(struct task_group *tg) static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { - u64 quota, period; + u64 quota, period, burst; if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) return -EINVAL; period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; + burst = tg->cfs_bandwidth.burst; - return tg_set_cfs_bandwidth(tg, period, quota); + return tg_set_cfs_bandwidth(tg, period, quota, burst); } static long tg_get_cfs_period(struct task_group *tg) @@ -9095,6 +9925,30 @@ static long tg_get_cfs_period(struct task_group *tg) return cfs_period_us; } +static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) +{ + u64 quota, period, burst; + + if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + + burst = (u64)cfs_burst_us * NSEC_PER_USEC; + period = ktime_to_ns(tg->cfs_bandwidth.period); + quota = tg->cfs_bandwidth.quota; + + return tg_set_cfs_bandwidth(tg, period, quota, burst); +} + +static long tg_get_cfs_burst(struct task_group *tg) +{ + u64 burst_us; + + burst_us = tg->cfs_bandwidth.burst; + do_div(burst_us, NSEC_PER_USEC); + + return burst_us; +} + static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -9119,6 +9973,18 @@ static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, return tg_set_cfs_period(css_tg(css), cfs_period_us); } +static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_cfs_burst(css_tg(css)); +} + +static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_burst_us) +{ + return tg_set_cfs_burst(css_tg(css), cfs_burst_us); +} + struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -9272,6 +10138,11 @@ static struct cftype cpu_legacy_files[] = { .write_u64 = cpu_cfs_period_write_u64, }, { + .name = "cfs_burst_us", + .read_u64 = cpu_cfs_burst_read_u64, + .write_u64 = cpu_cfs_burst_write_u64, + }, + { .name = "stat", .seq_show = cpu_cfs_stat_show, }, @@ -9436,12 +10307,13 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, { struct task_group *tg = css_tg(of_css(of)); u64 period = tg_get_cfs_period(tg); + u64 burst = tg_get_cfs_burst(tg); u64 quota; int ret; ret = cpu_period_quota_parse(buf, &period, "a); if (!ret) - ret = tg_set_cfs_bandwidth(tg, period, quota); + ret = tg_set_cfs_bandwidth(tg, period, quota, burst); return ret ?: nbytes; } #endif @@ -9468,6 +10340,12 @@ static struct cftype cpu_files[] = { .seq_show = cpu_max_show, .write = cpu_max_write, }, + { + .name = "max.burst", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_cfs_burst_read_u64, + .write_u64 = cpu_cfs_burst_write_u64, + }, #endif #ifdef CONFIG_UCLAMP_TASK_GROUP { diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c new file mode 100644 index 000000000000..9a80e9a474c0 --- /dev/null +++ b/kernel/sched/core_sched.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/prctl.h> +#include "sched.h" + +/* + * A simple wrapper around refcount. An allocated sched_core_cookie's + * address is used to compute the cookie of the task. + */ +struct sched_core_cookie { + refcount_t refcnt; +}; + +unsigned long sched_core_alloc_cookie(void) +{ + struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL); + if (!ck) + return 0; + + refcount_set(&ck->refcnt, 1); + sched_core_get(); + + return (unsigned long)ck; +} + +void sched_core_put_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr && refcount_dec_and_test(&ptr->refcnt)) { + kfree(ptr); + sched_core_put(); + } +} + +unsigned long sched_core_get_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr) + refcount_inc(&ptr->refcnt); + + return cookie; +} + +/* + * sched_core_update_cookie - replace the cookie on a task + * @p: the task to update + * @cookie: the new cookie + * + * Effectively exchange the task cookie; caller is responsible for lifetimes on + * both ends. + * + * Returns: the old cookie + */ +unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie) +{ + unsigned long old_cookie; + struct rq_flags rf; + struct rq *rq; + bool enqueued; + + rq = task_rq_lock(p, &rf); + + /* + * Since creating a cookie implies sched_core_get(), and we cannot set + * a cookie until after we've created it, similarly, we cannot destroy + * a cookie until after we've removed it, we must have core scheduling + * enabled here. + */ + SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); + + enqueued = sched_core_enqueued(p); + if (enqueued) + sched_core_dequeue(rq, p); + + old_cookie = p->core_cookie; + p->core_cookie = cookie; + + if (enqueued) + sched_core_enqueue(rq, p); + + /* + * If task is currently running, it may not be compatible anymore after + * the cookie change, so enter the scheduler on its CPU to schedule it + * away. + */ + if (task_running(rq, p)) + resched_curr(rq); + + task_rq_unlock(rq, p, &rf); + + return old_cookie; +} + +static unsigned long sched_core_clone_cookie(struct task_struct *p) +{ + unsigned long cookie, flags; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + cookie = sched_core_get_cookie(p->core_cookie); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return cookie; +} + +void sched_core_fork(struct task_struct *p) +{ + RB_CLEAR_NODE(&p->core_node); + p->core_cookie = sched_core_clone_cookie(current); +} + +void sched_core_free(struct task_struct *p) +{ + sched_core_put_cookie(p->core_cookie); +} + +static void __sched_core_set(struct task_struct *p, unsigned long cookie) +{ + cookie = sched_core_get_cookie(cookie); + cookie = sched_core_update_cookie(p, cookie); + sched_core_put_cookie(cookie); +} + +/* Called from prctl interface: PR_SCHED_CORE */ +int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr) +{ + unsigned long cookie = 0, id = 0; + struct task_struct *task, *p; + struct pid *grp; + int err = 0; + + if (!static_branch_likely(&sched_smt_present)) + return -ENODEV; + + if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 || + (cmd != PR_SCHED_CORE_GET && uaddr)) + return -EINVAL; + + rcu_read_lock(); + if (pid == 0) { + task = current; + } else { + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + return -ESRCH; + } + } + get_task_struct(task); + rcu_read_unlock(); + + /* + * Check if this process has the right to modify the specified + * process. Use the regular "ptrace_may_access()" checks. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out; + } + + switch (cmd) { + case PR_SCHED_CORE_GET: + if (type != PIDTYPE_PID || uaddr & 7) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + if (cookie) { + /* XXX improve ? */ + ptr_to_hashval((void *)cookie, &id); + } + err = put_user(id, (u64 __user *)uaddr); + goto out; + + case PR_SCHED_CORE_CREATE: + cookie = sched_core_alloc_cookie(); + if (!cookie) { + err = -ENOMEM; + goto out; + } + break; + + case PR_SCHED_CORE_SHARE_TO: + cookie = sched_core_clone_cookie(current); + break; + + case PR_SCHED_CORE_SHARE_FROM: + if (type != PIDTYPE_PID) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + __sched_core_set(current, cookie); + goto out; + + default: + err = -EINVAL; + goto out; + }; + + if (type == PIDTYPE_PID) { + __sched_core_set(task, cookie); + goto out; + } + + read_lock(&tasklist_lock); + grp = task_pid_type(task, type); + + do_each_pid_thread(grp, type, p) { + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out_tasklist; + } + } while_each_pid_thread(grp, type, p); + + do_each_pid_thread(grp, type, p) { + __sched_core_set(p, cookie); + } while_each_pid_thread(grp, type, p); +out_tasklist: + read_unlock(&tasklist_lock); + +out: + sched_core_put_cookie(cookie); + put_task_struct(task); + return err; +} + diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 104a1bade14f..893eece65bfd 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -112,7 +112,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif if (index == CPUACCT_STAT_NSTATS) { @@ -126,7 +126,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, } #ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_unlock_irq(cpu_rq(cpu)); #endif return data; @@ -141,14 +141,14 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif for (i = 0; i < CPUACCT_STAT_NSTATS; i++) cpuusage->usages[i] = val; #ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_unlock_irq(cpu_rq(cpu)); #endif } @@ -253,13 +253,13 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) * Take rq->lock to make 64-bit read safe on 32-bit * platforms. */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif seq_printf(m, " %llu", cpuusage->usages[index]); #ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_unlock_irq(cpu_rq(cpu)); #endif } seq_puts(m, "\n"); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 4f09afd2f321..57124614363d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -151,6 +151,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; + util = map_util_perf(util); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9a2989749b8d..aaacd6cfd42f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -157,7 +157,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); @@ -170,7 +170,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) @@ -184,7 +184,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ } @@ -194,7 +194,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) @@ -348,10 +348,10 @@ static void task_non_contending(struct task_struct *p) if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || p->state == TASK_DEAD) { + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - if (p->state == TASK_DEAD) + if (READ_ONCE(p->__state) == TASK_DEAD) sub_rq_bw(&p->dl, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); @@ -987,7 +987,7 @@ static int start_dl_timer(struct task_struct *p) ktime_t now, act; s64 delta; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); /* * We want the timer to fire at the deadline, but considering @@ -1097,9 +1097,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * If the runqueue is no longer available, migrate the * task elsewhere. This necessarily changes rq. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + lockdep_unpin_lock(__rq_lockp(rq), rf.cookie); rq = dl_task_offline_migration(rq, p); - rf.cookie = lockdep_pin_lock(&rq->lock); + rf.cookie = lockdep_pin_lock(__rq_lockp(rq)); update_rq_clock(rq); /* @@ -1355,10 +1355,10 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); - if (!dl_task(p) || p->state == TASK_DEAD) { + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - if (p->state == TASK_DEAD && dl_se->dl_non_contending) { + if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) { sub_running_bw(&p->dl, dl_rq_of_se(&p->dl)); sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; @@ -1722,7 +1722,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused { struct rq *rq; - if (p->state != TASK_WAKING) + if (READ_ONCE(p->__state) != TASK_WAKING) return; rq = task_rq(p); @@ -1731,7 +1731,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * from try_to_wake_up(). Hence, p->pi_lock is locked, but * rq->lock is not... So, lock it */ - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); if (p->dl.dl_non_contending) { sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; @@ -1746,7 +1746,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused put_task_struct(p); } sub_rq_bw(&p->dl, &rq->dl); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); } static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) @@ -1852,7 +1852,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -static struct task_struct *pick_next_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -1864,7 +1864,18 @@ static struct task_struct *pick_next_task_dl(struct rq *rq) dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); p = dl_task_of(dl_se); - set_next_task_dl(rq, p, true); + + return p; +} + +static struct task_struct *pick_next_task_dl(struct rq *rq) +{ + struct task_struct *p; + + p = pick_task_dl(rq); + if (p) + set_next_task_dl(rq, p, true); + return p; } @@ -2291,10 +2302,10 @@ skip: double_unlock_balance(this_rq, src_rq); if (push_task) { - raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, push_task, &src_rq->push_work); - raw_spin_lock(&this_rq->lock); + raw_spin_rq_lock(this_rq); } } @@ -2486,6 +2497,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); + } else { + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); } } @@ -2539,6 +2552,7 @@ DEFINE_SCHED_CLASS(dl) = { #ifdef CONFIG_SMP .balance = balance_dl, + .pick_task = pick_task_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c5aacbd492a1..0c5ec2776ddf 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -576,7 +576,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); if (rb_first_cached(&cfs_rq->tasks_timeline)) MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); @@ -584,7 +584,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) max_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2c8a9352590d..e6d1dd4e9d68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -268,33 +268,11 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (!path) @@ -455,33 +433,9 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #define for_each_sched_entity(se) \ for (; se; se = NULL) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (path) @@ -1039,11 +993,14 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { struct task_struct *tsk = task_of(se); + unsigned int state; - if (tsk->state & TASK_INTERRUPTIBLE) + /* XXX racy against TTWU */ + state = READ_ONCE(tsk->__state); + if (state & TASK_INTERRUPTIBLE) __schedstat_set(se->statistics.sleep_start, rq_clock(rq_of(cfs_rq))); - if (tsk->state & TASK_UNINTERRUPTIBLE) + if (state & TASK_UNINTERRUPTIBLE) __schedstat_set(se->statistics.block_start, rq_clock(rq_of(cfs_rq))); } @@ -1107,7 +1064,7 @@ struct numa_group { static struct numa_group *deref_task_numa_group(struct task_struct *p) { return rcu_dereference_check(p->numa_group, p == current || - (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); + (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu))); } static struct numa_group *deref_curr_numa_group(struct task_struct *p) @@ -3139,7 +3096,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (1) - * \Sum grq->load.weight + * \Sum grq->load.weight * * Now, because computing that sum is prohibitively expensive to compute (been * there, done that) we approximate it with this average stuff. The average @@ -3153,7 +3110,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->avg.load_avg * ge->load.weight = ------------------------------ (3) - * tg->load_avg + * tg->load_avg * * Where: tg->load_avg ~= \Sum grq->avg.load_avg * @@ -3169,7 +3126,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- = tg->weight (4) - * grp->load.weight + * grp->load.weight * * That is, the sum collapses because all other CPUs are idle; the UP scenario. * @@ -3188,7 +3145,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (6) - * tg_load_avg' + * tg_load_avg' * * Where: * @@ -3298,6 +3255,61 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) #ifdef CONFIG_SMP #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list + * immediately before a parent cfs_rq, and cfs_rqs are removed from the list + * bottom-up, we only have to test whether the cfs_rq before us on the list + * is our child. + * If cfs_rq is not on the list, test whether a child needs its to be added to + * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details). + */ +static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) +{ + struct cfs_rq *prev_cfs_rq; + struct list_head *prev; + + if (cfs_rq->on_list) { + prev = cfs_rq->leaf_cfs_rq_list.prev; + } else { + struct rq *rq = rq_of(cfs_rq); + + prev = rq->tmp_alone_branch; + } + + prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); + + return (prev_cfs_rq->tg->parent == cfs_rq->tg); +} + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->avg.runnable_sum) + return false; + + if (child_cfs_rq_on_list(cfs_rq)) + return false; + + /* + * _avg must be null when _sum are null because _avg = _sum / divider + * Make sure that rounding and/or propagation of PELT values never + * break this. + */ + SCHED_WARN_ON(cfs_rq->avg.load_avg || + cfs_rq->avg.util_avg || + cfs_rq->avg.runnable_avg); + + return true; +} + /** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed @@ -3548,9 +3560,12 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq load_sum = (s64)se_weight(se) * runnable_sum; load_avg = div_s64(load_sum, divider); + se->avg.load_sum = runnable_sum; + delta = load_avg - se->avg.load_avg; + if (!delta) + return; - se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; add_positive(&cfs_rq->avg.load_avg, delta); @@ -4091,6 +4106,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) #else /* CONFIG_SMP */ +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + return true; +} + #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 @@ -4425,6 +4445,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { + clear_buddies(cfs_rq, se); + /* 'current' is not kept within the tree. */ if (se->on_rq) { /* @@ -4484,7 +4506,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) * Avoid running the skip buddy, if running something else can * be done without getting too unfair. */ - if (cfs_rq->skip == se) { + if (cfs_rq->skip && cfs_rq->skip == se) { struct sched_entity *second; if (se == curr) { @@ -4511,8 +4533,6 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) se = cfs_rq->last; } - clear_buddies(cfs_rq, se); - return se; } @@ -4634,8 +4654,11 @@ static inline u64 sched_cfs_bandwidth_slice(void) */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - if (cfs_b->quota != RUNTIME_INF) - cfs_b->runtime = cfs_b->quota; + if (unlikely(cfs_b->quota == RUNTIME_INF)) + return; + + cfs_b->runtime += cfs_b->quota; + cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst); } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4749,8 +4772,8 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; - /* Add cfs_rq with already running entity in the list */ - if (cfs_rq->nr_running >= 1) + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running) list_add_leaf_cfs_rq(cfs_rq); } @@ -4996,6 +5019,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u throttled = !list_empty(&cfs_b->throttled_cfs_rq); cfs_b->nr_periods += overrun; + /* Refill extra burst quota even if cfs_b->idle */ + __refill_cfs_bandwidth_runtime(cfs_b); + /* * idle depends on !throttled (for the case of a large deficit), and if * we're going inactive then everything else can be deferred @@ -5003,8 +5029,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u if (cfs_b->idle && !throttled) goto out_deactivate; - __refill_cfs_bandwidth_runtime(cfs_b); - if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; @@ -5254,6 +5278,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (new < max_cfs_quota_period) { cfs_b->period = ns_to_ktime(new); cfs_b->quota *= 2; + cfs_b->burst *= 2; pr_warn_ratelimited( "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", @@ -5285,6 +5310,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); + cfs_b->burst = 0; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); @@ -5334,7 +5360,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq) { struct task_group *tg; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -5353,7 +5379,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct task_group *tg; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -5941,11 +5967,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + struct rq *rq = cpu_rq(i); + + if (!sched_core_cookie_match(rq, p)) + continue; + if (sched_idle_cpu(i)) return i; if (available_idle_cpu(i)) { - struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { /* @@ -6031,9 +6061,10 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; } -static inline int __select_idle_cpu(int cpu) +static inline int __select_idle_cpu(int cpu, struct task_struct *p) { - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && + sched_cpu_cookie_match(cpu_rq(cpu), p)) return cpu; return -1; @@ -6103,7 +6134,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu int cpu; if (!static_branch_likely(&sched_smt_present)) - return __select_idle_cpu(core); + return __select_idle_cpu(core, p); for_each_cpu(cpu, cpu_smt_mask(core)) { if (!available_idle_cpu(cpu)) { @@ -6159,7 +6190,7 @@ static inline bool test_idle_cores(int cpu, bool def) static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - return __select_idle_cpu(core); + return __select_idle_cpu(core, p); } static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) @@ -6178,9 +6209,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct rq *this_rq = this_rq(); int this = smp_processor_id(); struct sched_domain *this_sd; - u64 time; + u64 time = 0; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -6190,12 +6222,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; + unsigned long now = jiffies; /* - * Due to large variance we need a large fuzz factor; - * hackbench in particularly is sensitive here. + * If we're busy, the assumption that the last idle period + * predicts the future is flawed; age away the remaining + * predicted idle time. */ - avg_idle = this_rq()->avg_idle / 512; + if (unlikely(this_rq->wake_stamp < now)) { + while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) { + this_rq->wake_stamp++; + this_rq->wake_avg_idle >>= 1; + } + } + + avg_idle = this_rq->wake_avg_idle; avg_cost = this_sd->avg_scan_cost + 1; span_avg = sd->span_weight * avg_idle; @@ -6216,7 +6257,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } else { if (!--nr) return -1; - idle_cpu = __select_idle_cpu(cpu); + idle_cpu = __select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) break; } @@ -6227,6 +6268,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (sched_feat(SIS_PROP) && !has_idle_core) { time = cpu_clock(this) - time; + + /* + * Account for the scan cost of wakeups against the average + * idle time. + */ + this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time); + update_avg(&this_sd->avg_scan_cost, time); } @@ -6294,6 +6342,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) task_util = uclamp_task_util(p); } + /* + * per-cpu select_idle_mask usage + */ + lockdep_assert_irqs_disabled(); + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && asym_fits_capacity(task_util, target)) return target; @@ -6569,8 +6622,11 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) struct cpumask *pd_mask = perf_domain_span(pd); unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); unsigned long max_util = 0, sum_util = 0; + unsigned long _cpu_cap = cpu_cap; int cpu; + _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask)); + /* * The capacity state of CPUs of the current rd can be driven by CPUs * of another rd if they belong to the same pd. So, account for the @@ -6606,8 +6662,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * is already enough to scale the EM reported power * consumption at the (eventually clamped) cpu_capacity. */ - sum_util += effective_cpu_util(cpu, util_running, cpu_cap, - ENERGY_UTIL, NULL); + cpu_util = effective_cpu_util(cpu, util_running, cpu_cap, + ENERGY_UTIL, NULL); + + sum_util += min(cpu_util, _cpu_cap); /* * Performance domain frequency: utilization clamping @@ -6618,10 +6676,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) */ cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap, FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); + max_util = max(max_util, min(cpu_util, _cpu_cap)); } - return em_cpu_energy(pd->em_pd, max_util, sum_util); + return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap); } /* @@ -6667,15 +6725,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int cpu, best_energy_cpu = prev_cpu, target = -1; unsigned long cpu_cap, util, base_energy = 0; - int cpu, best_energy_cpu = prev_cpu; struct sched_domain *sd; struct perf_domain *pd; rcu_read_lock(); pd = rcu_dereference(rd->pd); if (!pd || READ_ONCE(rd->overutilized)) - goto fail; + goto unlock; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -6685,7 +6743,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) - goto fail; + goto unlock; + + target = prev_cpu; sync_entity_load_avg(&p->se); if (!task_util_est(p)) @@ -6693,13 +6753,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long cur_delta, spare_cap, max_spare_cap = 0; + bool compute_prev_delta = false; unsigned long base_energy_pd; int max_spare_cap_cpu = -1; - /* Compute the 'base' energy of the pd, without @p */ - base_energy_pd = compute_energy(p, -1, pd); - base_energy += base_energy_pd; - for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; @@ -6720,26 +6777,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!fits_capacity(util, cpu_cap)) continue; - /* Always use prev_cpu as a candidate. */ if (cpu == prev_cpu) { - prev_delta = compute_energy(p, prev_cpu, pd); - prev_delta -= base_energy_pd; - best_delta = min(best_delta, prev_delta); - } - - /* - * Find the CPU with the maximum spare capacity in - * the performance domain - */ - if (spare_cap > max_spare_cap) { + /* Always use prev_cpu as a candidate. */ + compute_prev_delta = true; + } else if (spare_cap > max_spare_cap) { + /* + * Find the CPU with the maximum spare capacity + * in the performance domain. + */ max_spare_cap = spare_cap; max_spare_cap_cpu = cpu; } } - /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { + if (max_spare_cap_cpu < 0 && !compute_prev_delta) + continue; + + /* Compute the 'base' energy of the pd, without @p */ + base_energy_pd = compute_energy(p, -1, pd); + base_energy += base_energy_pd; + + /* Evaluate the energy impact of using prev_cpu. */ + if (compute_prev_delta) { + prev_delta = compute_energy(p, prev_cpu, pd); + if (prev_delta < base_energy_pd) + goto unlock; + prev_delta -= base_energy_pd; + best_delta = min(best_delta, prev_delta); + } + + /* Evaluate the energy impact of using max_spare_cap_cpu. */ + if (max_spare_cap_cpu >= 0) { cur_delta = compute_energy(p, max_spare_cap_cpu, pd); + if (cur_delta < base_energy_pd) + goto unlock; cur_delta -= base_energy_pd; if (cur_delta < best_delta) { best_delta = cur_delta; @@ -6747,25 +6818,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } } } -unlock: rcu_read_unlock(); /* * Pick the best CPU if prev_cpu cannot be used, or if it saves at * least 6% of the energy used by prev_cpu. */ - if (prev_delta == ULONG_MAX) - return best_energy_cpu; - - if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) - return best_energy_cpu; + if ((prev_delta == ULONG_MAX) || + (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) + target = best_energy_cpu; - return prev_cpu; + return target; -fail: +unlock: rcu_read_unlock(); - return -1; + return target; } /* @@ -6777,8 +6845,6 @@ fail: * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. * * Returns the target CPU number. - * - * preempt must be disabled. */ static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) @@ -6791,6 +6857,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF; + /* + * required for stable ->cpus_allowed + */ + lockdep_assert_held(&p->pi_lock); if (wake_flags & WF_TTWU) { record_wakee(p); @@ -6855,7 +6925,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) * min_vruntime -- the latter is done by enqueue_entity() when placing * the task on the new runqueue. */ - if (p->state == TASK_WAKING) { + if (READ_ONCE(p->__state) == TASK_WAKING) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 min_vruntime; @@ -6880,7 +6950,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' * rq->lock and can modify state directly. */ - lockdep_assert_held(&task_rq(p)->lock); + lockdep_assert_rq_held(task_rq(p)); detach_entity_cfs_rq(&p->se); } else { @@ -7084,6 +7154,39 @@ preempt: set_last_buddy(se); } +#ifdef CONFIG_SMP +static struct task_struct *pick_task_fair(struct rq *rq) +{ + struct sched_entity *se; + struct cfs_rq *cfs_rq; + +again: + cfs_rq = &rq->cfs; + if (!cfs_rq->nr_running) + return NULL; + + do { + struct sched_entity *curr = cfs_rq->curr; + + /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ + if (curr) { + if (curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto again; + } + + se = pick_next_entity(cfs_rq, curr); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + return task_of(se); +} +#endif + struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -7507,7 +7610,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); if (p->sched_class != &fair_sched_class) return 0; @@ -7529,6 +7632,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (sysctl_sched_migration_cost == -1) return 1; + + /* + * Don't migrate task if the task's cookie does not match + * with the destination CPU's core cookie. + */ + if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p)) + return 1; + if (sysctl_sched_migration_cost == 0) return 0; @@ -7605,7 +7716,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); /* * We do not migrate tasks that are: @@ -7694,7 +7805,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) */ static void detach_task(struct task_struct *p, struct lb_env *env) { - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); @@ -7710,7 +7821,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); list_for_each_entry_reverse(p, &env->src_rq->cfs_tasks, se.group_node) { @@ -7746,7 +7857,7 @@ static int detach_tasks(struct lb_env *env) struct task_struct *p; int detached = 0; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); /* * Source run queue has been emptied by another CPU, clear @@ -7876,7 +7987,7 @@ next: */ static void attach_task(struct rq *rq, struct task_struct *p) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); @@ -7996,23 +8107,6 @@ static bool __update_blocked_others(struct rq *rq, bool *done) #ifdef CONFIG_FAIR_GROUP_SCHED -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load.weight) - return false; - - if (cfs_rq->avg.load_sum) - return false; - - if (cfs_rq->avg.util_sum) - return false; - - if (cfs_rq->avg.runnable_sum) - return false; - - return true; -} - static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq, *pos; @@ -8859,6 +8953,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) p->cpus_ptr)) continue; + /* Skip over this group if no cookie matched */ + if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group)) + continue; + local_group = cpumask_test_cpu(this_cpu, sched_group_span(group)); @@ -9787,7 +9885,7 @@ more_balance: if (need_active_balance(&env)) { unsigned long flags; - raw_spin_lock_irqsave(&busiest->lock, flags); + raw_spin_rq_lock_irqsave(busiest, flags); /* * Don't kick the active_load_balance_cpu_stop, @@ -9795,8 +9893,7 @@ more_balance: * moved to this_cpu: */ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { - raw_spin_unlock_irqrestore(&busiest->lock, - flags); + raw_spin_rq_unlock_irqrestore(busiest, flags); goto out_one_pinned; } @@ -9813,7 +9910,7 @@ more_balance: busiest->push_cpu = this_cpu; active_balance = 1; } - raw_spin_unlock_irqrestore(&busiest->lock, flags); + raw_spin_rq_unlock_irqrestore(busiest, flags); if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), @@ -10598,6 +10695,14 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) u64 curr_cost = 0; update_misfit_status(NULL, this_rq); + + /* + * There is a task waiting to run. No need to search for one. + * Return 0; the task will be enqueued when switching to idle. + */ + if (this_rq->ttwu_pending) + return 0; + /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -10630,7 +10735,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) goto out; } - raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); update_blocked_averages(this_cpu); rcu_read_lock(); @@ -10663,12 +10768,13 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) * Stop searching for tasks to pull if there are * now runnable tasks on this rq. */ - if (pulled_task || this_rq->nr_running > 0) + if (pulled_task || this_rq->nr_running > 0 || + this_rq->ttwu_pending) break; } rcu_read_unlock(); - raw_spin_lock(&this_rq->lock); + raw_spin_rq_lock(this_rq); if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; @@ -10761,6 +10867,119 @@ static void rq_offline_fair(struct rq *rq) #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_CORE +static inline bool +__entity_slice_used(struct sched_entity *se, int min_nr_tasks) +{ + u64 slice = sched_slice(cfs_rq_of(se), se); + u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; + + return (rtime * min_nr_tasks > slice); +} + +#define MIN_NR_TASKS_DURING_FORCEIDLE 2 +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) +{ + if (!sched_core_enabled(rq)) + return; + + /* + * If runqueue has only one task which used up its slice and + * if the sibling is forced idle, then trigger schedule to + * give forced idle task a chance. + * + * sched_slice() considers only this active rq and it gets the + * whole slice. But during force idle, we have siblings acting + * like a single runqueue and hence we need to consider runnable + * tasks on this CPU and the forced idle CPU. Ideally, we should + * go through the forced idle rq, but that would be a perf hit. + * We can assume that the forced idle CPU has at least + * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check + * if we need to give up the CPU. + */ + if (rq->core->core_forceidle && rq->cfs.nr_running == 1 && + __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) + resched_curr(rq); +} + +/* + * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. + */ +static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (forceidle) { + if (cfs_rq->forceidle_seq == fi_seq) + break; + cfs_rq->forceidle_seq = fi_seq; + } + + cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; + } +} + +void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) +{ + struct sched_entity *se = &p->se; + + if (p->sched_class != &fair_sched_class) + return; + + se_fi_update(se, rq->core->core_forceidle_seq, in_fi); +} + +bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +{ + struct rq *rq = task_rq(a); + struct sched_entity *sea = &a->se; + struct sched_entity *seb = &b->se; + struct cfs_rq *cfs_rqa; + struct cfs_rq *cfs_rqb; + s64 delta; + + SCHED_WARN_ON(task_rq(b)->core != rq->core); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Find an se in the hierarchy for tasks a and b, such that the se's + * are immediate siblings. + */ + while (sea->cfs_rq->tg != seb->cfs_rq->tg) { + int sea_depth = sea->depth; + int seb_depth = seb->depth; + + if (sea_depth >= seb_depth) + sea = parent_entity(sea); + if (sea_depth <= seb_depth) + seb = parent_entity(seb); + } + + se_fi_update(sea, rq->core->core_forceidle_seq, in_fi); + se_fi_update(seb, rq->core->core_forceidle_seq, in_fi); + + cfs_rqa = sea->cfs_rq; + cfs_rqb = seb->cfs_rq; +#else + cfs_rqa = &task_rq(a)->cfs; + cfs_rqb = &task_rq(b)->cfs; +#endif + + /* + * Find delta after normalizing se's vruntime with its cfs_rq's + * min_vruntime_fi, which would have been updated in prior calls + * to se_fi_update(). + */ + delta = (s64)(sea->vruntime - seb->vruntime) + + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + + return delta > 0; +} +#else +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} +#endif + /* * scheduler tick hitting a task of our scheduling class. * @@ -10784,6 +11003,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) update_misfit_status(curr, rq); update_overutilized_status(task_rq(curr)); + + task_tick_core(rq, curr); } /* @@ -10869,7 +11090,7 @@ static inline bool vruntime_normalized(struct task_struct *p) * waiting for actually being woken up by sched_ttwu_pending(). */ if (!se->sum_exec_runtime || - (p->state == TASK_WAKING && p->sched_remote_wakeup)) + (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup)) return true; return false; @@ -11155,9 +11376,9 @@ void unregister_fair_sched_group(struct task_group *tg) rq = cpu_rq(cpu); - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); } } @@ -11279,6 +11500,7 @@ DEFINE_SCHED_CLASS(fair) = { #ifdef CONFIG_SMP .balance = balance_fair, + .pick_task = pick_task_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 7ca3d3d86c2a..912b47aa99d8 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -437,8 +437,16 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir { update_idle_core(rq); schedstat_inc(rq->sched_goidle); + queue_core_balance(rq); } +#ifdef CONFIG_SMP +static struct task_struct *pick_task_idle(struct rq *rq) +{ + return rq->idle; +} +#endif + struct task_struct *pick_next_task_idle(struct rq *rq) { struct task_struct *next = rq->idle; @@ -455,10 +463,10 @@ struct task_struct *pick_next_task_idle(struct rq *rq) static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) { - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); - raw_spin_lock_irq(&rq->lock); + raw_spin_rq_lock_irq(rq); } /* @@ -506,6 +514,7 @@ DEFINE_SCHED_CLASS(idle) = { #ifdef CONFIG_SMP .balance = balance_idle, + .pick_task = pick_task_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 5a6ea03f9882..7f06eaf12818 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -81,11 +81,9 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) { cpumask_var_t non_housekeeping_mask; cpumask_var_t tmp; - int err; alloc_bootmem_cpumask_var(&non_housekeeping_mask); - err = cpulist_parse(str, non_housekeeping_mask); - if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) { + if (cpulist_parse(str, non_housekeeping_mask) < 0) { pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); free_bootmem_cpumask_var(non_housekeeping_mask); return 0; diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 1c79896f1bc0..954b229868d9 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -81,7 +81,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) long nr_active, delta = 0; nr_active = this_rq->nr_running - adjust; - nr_active += (long)this_rq->nr_uninterruptible; + nr_active += (int)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index cfe94ffd2b38..e06071bf3472 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -132,7 +132,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq) static inline u64 rq_clock_pelt(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); assert_clock_updated(rq); return rq->clock_pelt - rq->lost_idle_time; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index cc25a3cff41f..58b36d17a09a 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -182,6 +182,8 @@ struct psi_group psi_system = { static void psi_avgs_work(struct work_struct *work); +static void poll_timer_fn(struct timer_list *t); + static void group_init(struct psi_group *group) { int cpu; @@ -201,6 +203,8 @@ static void group_init(struct psi_group *group) memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; group->polling_until = 0; + init_waitqueue_head(&group->poll_wait); + timer_setup(&group->poll_timer, poll_timer_fn, 0); rcu_assign_pointer(group->poll_task, NULL); } @@ -1157,9 +1161,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return ERR_CAST(task); } atomic_set(&group->poll_wakeup, 0); - init_waitqueue_head(&group->poll_wait); wake_up_process(task); - timer_setup(&group->poll_timer, poll_timer_fn, 0); rcu_assign_pointer(group->poll_task, task); } @@ -1211,6 +1213,7 @@ static void psi_trigger_destroy(struct kref *ref) group->poll_task, lockdep_is_held(&group->trigger_lock)); rcu_assign_pointer(group->poll_task, NULL); + del_timer(&group->poll_timer); } } @@ -1223,17 +1226,14 @@ static void psi_trigger_destroy(struct kref *ref) */ synchronize_rcu(); /* - * Destroy the kworker after releasing trigger_lock to prevent a + * Stop kthread 'psimon' after releasing trigger_lock to prevent a * deadlock while waiting for psi_poll_work to acquire trigger_lock */ if (task_to_destroy) { /* * After the RCU grace period has expired, the worker * can no longer be found through group->poll_task. - * But it might have been already scheduled before - * that - deschedule it cleanly before destroying it. */ - del_timer_sync(&group->poll_timer); kthread_stop(task_to_destroy); } kfree(t); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c286e5ba3c94..3daf42a0f462 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -888,7 +888,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (skip) continue; - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); update_rq_clock(rq); if (rt_rq->rt_time) { @@ -926,7 +926,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (enqueue) sched_rt_rq_enqueue(rt_rq); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); } if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) @@ -1626,7 +1626,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq) { struct task_struct *p; @@ -1634,7 +1634,17 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) return NULL; p = _pick_next_task_rt(rq); - set_next_task_rt(rq, p, true); + + return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct task_struct *p = pick_task_rt(rq); + + if (p) + set_next_task_rt(rq, p, true); + return p; } @@ -1894,10 +1904,10 @@ retry: */ push_task = get_push_task(rq); if (push_task) { - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, push_cpu_stop, push_task, &rq->push_work); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); } return 0; @@ -2122,10 +2132,10 @@ void rto_push_irq_work_func(struct irq_work *work) * When it gets updated, a check is made if a push is possible. */ if (has_pushable_tasks(rq)) { - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); while (push_rt_task(rq, true)) ; - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); } raw_spin_lock(&rd->rto_lock); @@ -2243,10 +2253,10 @@ skip: double_unlock_balance(this_rq, src_rq); if (push_task) { - raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, push_task, &src_rq->push_work); - raw_spin_lock(&this_rq->lock); + raw_spin_rq_lock(this_rq); } } @@ -2331,13 +2341,20 @@ void __init init_sched_rt_class(void) static void switched_to_rt(struct rq *rq, struct task_struct *p) { /* - * If we are already running, then there's nothing - * that needs to be done. But if we are not running - * we may need to preempt the current running task. - * If that current running task is also an RT task + * If we are running, update the avg_rt tracking, as the running time + * will now on be accounted into the latter. + */ + if (task_current(rq, p)) { + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + return; + } + + /* + * If we are not running we may need to preempt the current + * running task. If that current running task is also an RT task * then see if we can move to another run queue. */ - if (task_on_rq_queued(p) && rq->curr != p) { + if (task_on_rq_queued(p)) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); @@ -2483,6 +2500,7 @@ DEFINE_SCHED_CLASS(rt) = { #ifdef CONFIG_SMP .balance = balance_rt, + .pick_task = pick_task_rt, .select_task_rq = select_task_rq_rt, .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a189bec13729..c80d42e9589b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -366,6 +366,7 @@ struct cfs_bandwidth { ktime_t period; u64 quota; u64 runtime; + u64 burst; s64 hierarchical_quota; u8 idle; @@ -526,6 +527,11 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; +#ifdef CONFIG_SCHED_CORE + unsigned int forceidle_seq; + u64 min_vruntime_fi; +#endif + #ifndef CONFIG_64BIT u64 min_vruntime_copy; #endif @@ -631,8 +637,8 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - unsigned long rt_nr_total; + unsigned int rt_nr_migratory; + unsigned int rt_nr_total; int overloaded; struct plist_head pushable_tasks; @@ -646,7 +652,7 @@ struct rt_rq { raw_spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; + unsigned int rt_nr_boosted; struct rq *rq; struct task_group *tg; @@ -663,7 +669,7 @@ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ struct rb_root_cached root; - unsigned long dl_nr_running; + unsigned int dl_nr_running; #ifdef CONFIG_SMP /* @@ -677,7 +683,7 @@ struct dl_rq { u64 next; } earliest_dl; - unsigned long dl_nr_migratory; + unsigned int dl_nr_migratory; int overloaded; /* @@ -905,7 +911,7 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); */ struct rq { /* runqueue lock: */ - raw_spinlock_t lock; + raw_spinlock_t __lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -955,7 +961,7 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned long nr_uninterruptible; + unsigned int nr_uninterruptible; struct task_struct __rcu *curr; struct task_struct *idle; @@ -1017,6 +1023,9 @@ struct rq { u64 idle_stamp; u64 avg_idle; + unsigned long wake_stamp; + u64 wake_avg_idle; + /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; @@ -1075,6 +1084,22 @@ struct rq { #endif unsigned int push_busy; struct cpu_stop_work push_work; + +#ifdef CONFIG_SCHED_CORE + /* per rq */ + struct rq *core; + struct task_struct *core_pick; + unsigned int core_enabled; + unsigned int core_sched_seq; + struct rb_root core_tree; + + /* shared state */ + unsigned int core_task_seq; + unsigned int core_pick_seq; + unsigned long core_cookie; + unsigned char core_forceidle; + unsigned int core_forceidle_seq; +#endif }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1113,6 +1138,206 @@ static inline bool is_migration_disabled(struct task_struct *p) #endif } +struct sched_group; +#ifdef CONFIG_SCHED_CORE +static inline struct cpumask *sched_group_span(struct sched_group *sg); + +DECLARE_STATIC_KEY_FALSE(__sched_core_enabled); + +static inline bool sched_core_enabled(struct rq *rq) +{ + return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled; +} + +static inline bool sched_core_disabled(void) +{ + return !static_branch_unlikely(&__sched_core_enabled); +} + +/* + * Be careful with this function; not for general use. The return value isn't + * stable unless you actually hold a relevant rq->__lock. + */ +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + if (sched_core_enabled(rq)) + return &rq->core->__lock; + + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + if (rq->core_enabled) + return &rq->core->__lock; + + return &rq->__lock; +} + +bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi); + +/* + * Helpers to check if the CPU's core cookie matches with the task's cookie + * when core scheduling is enabled. + * A special case is that the task's cookie always matches with CPU's core + * cookie if the CPU is in an idle core. + */ +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + return rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + bool idle_core = true; + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { + if (!available_idle_cpu(cpu)) { + idle_core = false; + break; + } + } + + /* + * A CPU in an idle core is always the best choice for tasks with + * cookies. + */ + return idle_core || rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) { + if (sched_core_cookie_match(rq, p)) + return true; + } + return false; +} + +extern void queue_core_balance(struct rq *rq); + +static inline bool sched_core_enqueued(struct task_struct *p) +{ + return !RB_EMPTY_NODE(&p->core_node); +} + +extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p); + +extern void sched_core_get(void); +extern void sched_core_put(void); + +extern unsigned long sched_core_alloc_cookie(void); +extern void sched_core_put_cookie(unsigned long cookie); +extern unsigned long sched_core_get_cookie(unsigned long cookie); +extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie); + +#else /* !CONFIG_SCHED_CORE */ + +static inline bool sched_core_enabled(struct rq *rq) +{ + return false; +} + +static inline bool sched_core_disabled(void) +{ + return true; +} + +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline void queue_core_balance(struct rq *rq) +{ +} + +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + return true; +} +#endif /* CONFIG_SCHED_CORE */ + +static inline void lockdep_assert_rq_held(struct rq *rq) +{ + lockdep_assert_held(__rq_lockp(rq)); +} + +extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); +extern bool raw_spin_rq_trylock(struct rq *rq); +extern void raw_spin_rq_unlock(struct rq *rq); + +static inline void raw_spin_rq_lock(struct rq *rq) +{ + raw_spin_rq_lock_nested(rq, 0); +} + +static inline void raw_spin_rq_lock_irq(struct rq *rq) +{ + local_irq_disable(); + raw_spin_rq_lock(rq); +} + +static inline void raw_spin_rq_unlock_irq(struct rq *rq) +{ + raw_spin_rq_unlock(rq); + local_irq_enable(); +} + +static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) +{ + unsigned long flags; + local_irq_save(flags); + raw_spin_rq_lock(rq); + return flags; +} + +static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags) +{ + raw_spin_rq_unlock(rq); + local_irq_restore(flags); +} + +#define raw_spin_rq_lock_irqsave(rq, flags) \ +do { \ + flags = _raw_spin_rq_lock_irqsave(rq); \ +} while (0) + #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); @@ -1134,6 +1359,57 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline struct task_struct *task_of(struct sched_entity *se) +{ + SCHED_WARN_ON(!entity_is_task(se)); + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +#else + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} +#endif + extern void update_rq_clock(struct rq *rq); static inline u64 __rq_clock_broken(struct rq *rq) @@ -1179,7 +1455,7 @@ static inline void assert_clock_updated(struct rq *rq) static inline u64 rq_clock(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); assert_clock_updated(rq); return rq->clock; @@ -1187,7 +1463,7 @@ static inline u64 rq_clock(struct rq *rq) static inline u64 rq_clock_task(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); assert_clock_updated(rq); return rq->clock_task; @@ -1213,7 +1489,7 @@ static inline u64 rq_clock_thermal(struct rq *rq) static inline void rq_clock_skip_update(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rq->clock_update_flags |= RQCF_REQ_SKIP; } @@ -1223,7 +1499,7 @@ static inline void rq_clock_skip_update(struct rq *rq) */ static inline void rq_clock_cancel_skipupdate(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rq->clock_update_flags &= ~RQCF_REQ_SKIP; } @@ -1254,7 +1530,7 @@ extern struct callback_head balance_push_callback; */ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); #ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); @@ -1272,12 +1548,12 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) rf->clock_update_flags = RQCF_UPDATED; #endif - lockdep_unpin_lock(&rq->lock, rf->cookie); + lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); } static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { - lockdep_repin_lock(&rq->lock, rf->cookie); + lockdep_repin_lock(__rq_lockp(rq), rf->cookie); #ifdef CONFIG_SCHED_DEBUG /* @@ -1298,7 +1574,7 @@ static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); } static inline void @@ -1307,7 +1583,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(p->pi_lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } @@ -1315,7 +1591,7 @@ static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock_irqsave(&rq->lock, rf->flags); + raw_spin_rq_lock_irqsave(rq, rf->flags); rq_pin_lock(rq, rf); } @@ -1323,7 +1599,7 @@ static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock_irq(&rq->lock); + raw_spin_rq_lock_irq(rq); rq_pin_lock(rq, rf); } @@ -1331,7 +1607,7 @@ static inline void rq_lock(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); rq_pin_lock(rq, rf); } @@ -1339,7 +1615,7 @@ static inline void rq_relock(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); rq_repin_lock(rq, rf); } @@ -1348,7 +1624,7 @@ rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock_irqrestore(&rq->lock, rf->flags); + raw_spin_rq_unlock_irqrestore(rq, rf->flags); } static inline void @@ -1356,7 +1632,7 @@ rq_unlock_irq(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); } static inline void @@ -1364,7 +1640,7 @@ rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); } static inline struct rq * @@ -1429,7 +1705,7 @@ queue_balance_callback(struct rq *rq, struct callback_head *head, void (*func)(struct rq *rq)) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; @@ -1844,6 +2120,9 @@ struct sched_class { #ifdef CONFIG_SMP int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + + struct task_struct * (*pick_task)(struct rq *rq); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); @@ -1893,7 +2172,6 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) static inline void set_next_task(struct rq *rq, struct task_struct *next) { - WARN_ON_ONCE(rq->curr != next); next->sched_class->set_next_task(rq, next, false); } @@ -1969,7 +2247,7 @@ static inline struct task_struct *get_push_task(struct rq *rq) { struct task_struct *p = rq->curr; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (rq->push_busy) return NULL; @@ -2181,10 +2459,38 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif + #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPTION -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); +static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) +{ +#ifdef CONFIG_SCHED_CORE + /* + * In order to not have {0,2},{1,3} turn into into an AB-BA, + * order by core-id first and cpu-id second. + * + * Notably: + * + * double_rq_lock(0,3); will take core-0, core-1 lock + * double_rq_lock(1,2); will take core-1, core-0 lock + * + * when only cpu-id is considered. + */ + if (rq1->core->cpu < rq2->core->cpu) + return true; + if (rq1->core->cpu > rq2->core->cpu) + return false; + + /* + * __sched_core_flip() relies on SMT having cpu-id lock order. + */ +#endif + return rq1->cpu < rq2->cpu; +} + +extern void double_rq_lock(struct rq *rq1, struct rq *rq2); + +#ifdef CONFIG_PREEMPTION /* * fair double_lock_balance: Safely acquires both rq->locks in a fair @@ -2199,7 +2505,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); double_rq_lock(this_rq, busiest); return 1; @@ -2218,20 +2524,21 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - int ret = 0; - - if (unlikely(!raw_spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - raw_spin_unlock(&this_rq->lock); - raw_spin_lock(&busiest->lock); - raw_spin_lock_nested(&this_rq->lock, - SINGLE_DEPTH_NESTING); - ret = 1; - } else - raw_spin_lock_nested(&busiest->lock, - SINGLE_DEPTH_NESTING); + if (__rq_lockp(this_rq) == __rq_lockp(busiest)) + return 0; + + if (likely(raw_spin_rq_trylock(busiest))) + return 0; + + if (rq_order_less(this_rq, busiest)) { + raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); + return 0; } - return ret; + + raw_spin_rq_unlock(this_rq); + double_rq_lock(this_rq, busiest); + + return 1; } #endif /* CONFIG_PREEMPTION */ @@ -2241,11 +2548,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) */ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) { - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work well under rq->lock */ - raw_spin_unlock(&this_rq->lock); - BUG_ON(1); - } + lockdep_assert_irqs_disabled(); return _double_lock_balance(this_rq, busiest); } @@ -2253,8 +2556,9 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - raw_spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); + if (__rq_lockp(this_rq) != __rq_lockp(busiest)) + raw_spin_rq_unlock(busiest); + lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_); } static inline void double_lock(spinlock_t *l1, spinlock_t *l2) @@ -2285,31 +2589,6 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) } /* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - raw_spin_lock(&rq1->lock); - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); - } else { - raw_spin_lock(&rq2->lock); - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); - } - } -} - -/* * double_rq_unlock - safely unlock two runqueues * * Note this does not restore interrupts like task_rq_unlock, @@ -2319,11 +2598,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - raw_spin_unlock(&rq1->lock); - if (rq1 != rq2) - raw_spin_unlock(&rq2->lock); + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_unlock(rq2); else __release(rq2->lock); + raw_spin_rq_unlock(rq1); } extern void set_rq_online (struct rq *rq); @@ -2344,7 +2623,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) { BUG_ON(!irqs_disabled()); BUG_ON(rq1 != rq2); - raw_spin_lock(&rq1->lock); + raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ } @@ -2359,7 +2638,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq2->lock) { BUG_ON(rq1 != rq2); - raw_spin_unlock(&rq1->lock); + raw_spin_rq_unlock(rq1); __release(rq2->lock); } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index dc218e9f4558..d8f8eb0c655b 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -25,7 +25,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) } static inline void -rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { if (rq) rq->rq_sched_info.run_delay += delta; @@ -42,7 +42,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) #else /* !CONFIG_SCHEDSTATS: */ static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } -static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { } static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } # define schedstat_enabled() 0 # define __schedstat_inc(var) do { } while (0) @@ -150,29 +150,24 @@ static inline void psi_sched_switch(struct task_struct *prev, #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO -static inline void sched_info_reset_dequeued(struct task_struct *t) -{ - t->sched_info.last_queued = 0; -} - /* * We are interested in knowing how long it was from the *first* time a * task was queued to the time that it finally hit a CPU, we call this routine * from dequeue_task() to account for possible rq->clock skew across CPUs. The * delta taken on each CPU would annul the skew. */ -static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) +static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(rq), delta = 0; + unsigned long long delta = 0; - if (sched_info_on()) { - if (t->sched_info.last_queued) - delta = now - t->sched_info.last_queued; - } - sched_info_reset_dequeued(t); + if (!t->sched_info.last_queued) + return; + + delta = rq_clock(rq) - t->sched_info.last_queued; + t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; - rq_sched_info_dequeued(rq, delta); + rq_sched_info_dequeue(rq, delta); } /* @@ -182,11 +177,14 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) */ static void sched_info_arrive(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(rq), delta = 0; + unsigned long long now, delta = 0; + + if (!t->sched_info.last_queued) + return; - if (t->sched_info.last_queued) - delta = now - t->sched_info.last_queued; - sched_info_reset_dequeued(t); + now = rq_clock(rq); + delta = now - t->sched_info.last_queued; + t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; @@ -197,14 +195,12 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) /* * This function is only called from enqueue_task(), but also only updates * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. + * sched_info_dequeue() will clear that stamp when appropriate. */ -static inline void sched_info_queued(struct rq *rq, struct task_struct *t) +static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t) { - if (sched_info_on()) { - if (!t->sched_info.last_queued) - t->sched_info.last_queued = rq_clock(rq); - } + if (!t->sched_info.last_queued) + t->sched_info.last_queued = rq_clock(rq); } /* @@ -212,7 +208,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) * due, typically, to expiring its time slice (this may also be called when * switching to the idle task). Now we can calculate how long we ran. * Also, if the process is still in the TASK_RUNNING state, call - * sched_info_queued() to mark that it has now again started waiting on + * sched_info_enqueue() to mark that it has now again started waiting on * the runqueue. */ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) @@ -221,8 +217,8 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) rq_sched_info_depart(rq, delta); - if (t->state == TASK_RUNNING) - sched_info_queued(rq, t); + if (task_is_running(t)) + sched_info_enqueue(rq, t); } /* @@ -231,7 +227,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { /* * prev now departs the CPU. It's not interesting to record @@ -245,18 +241,8 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct sched_info_arrive(rq, next); } -static inline void -sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) -{ - if (sched_info_on()) - __sched_info_switch(rq, prev, next); -} - #else /* !CONFIG_SCHED_INFO: */ -# define sched_info_queued(rq, t) do { } while (0) -# define sched_info_reset_dequeued(t) do { } while (0) -# define sched_info_dequeued(rq, t) do { } while (0) -# define sched_info_depart(rq, t) do { } while (0) -# define sched_info_arrive(rq, next) do { } while (0) +# define sched_info_enqueue(rq, t) do { } while (0) +# define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 55f39125c0e1..f988ebe3febb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -34,15 +34,24 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir stop->se.exec_start = rq_clock_task(rq); } -static struct task_struct *pick_next_task_stop(struct rq *rq) +static struct task_struct *pick_task_stop(struct rq *rq) { if (!sched_stop_runnable(rq)) return NULL; - set_next_task_stop(rq, rq->stop, true); return rq->stop; } +static struct task_struct *pick_next_task_stop(struct rq *rq) +{ + struct task_struct *p = pick_task_stop(rq); + + if (p) + set_next_task_stop(rq, p, true); + + return p; +} + static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { @@ -123,6 +132,7 @@ DEFINE_SCHED_CLASS(stop) = { #ifdef CONFIG_SMP .balance = balance_stop, + .pick_task = pick_task_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 55a0a243e871..b77ad49dc14f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -467,7 +467,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) struct root_domain *old_rd = NULL; unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); if (rq->rd) { old_rd = rq->rd; @@ -493,7 +493,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); if (old_rd) call_rcu(&old_rd->rcu, free_rootdomain); @@ -675,7 +675,7 @@ static void update_top_cache_domain(int cpu) sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); - sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY); + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); } @@ -1267,6 +1267,116 @@ next: } /* + * Asymmetric CPU capacity bits + */ +struct asym_cap_data { + struct list_head link; + unsigned long capacity; + unsigned long cpus[]; +}; + +/* + * Set of available CPUs grouped by their corresponding capacities + * Each list entry contains a CPU mask reflecting CPUs that share the same + * capacity. + * The lifespan of data is unlimited. + */ +static LIST_HEAD(asym_cap_list); + +#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) + +/* + * Verify whether there is any CPU capacity asymmetry in a given sched domain. + * Provides sd_flags reflecting the asymmetry scope. + */ +static inline int +asym_cpu_capacity_classify(const struct cpumask *sd_span, + const struct cpumask *cpu_map) +{ + struct asym_cap_data *entry; + int count = 0, miss = 0; + + /* + * Count how many unique CPU capacities this domain spans across + * (compare sched_domain CPUs mask with ones representing available + * CPUs capacities). Take into account CPUs that might be offline: + * skip those. + */ + list_for_each_entry(entry, &asym_cap_list, link) { + if (cpumask_intersects(sd_span, cpu_capacity_span(entry))) + ++count; + else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry))) + ++miss; + } + + WARN_ON_ONCE(!count && !list_empty(&asym_cap_list)); + + /* No asymmetry detected */ + if (count < 2) + return 0; + /* Some of the available CPU capacity values have not been detected */ + if (miss) + return SD_ASYM_CPUCAPACITY; + + /* Full asymmetry */ + return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL; + +} + +static inline void asym_cpu_capacity_update_data(int cpu) +{ + unsigned long capacity = arch_scale_cpu_capacity(cpu); + struct asym_cap_data *entry = NULL; + + list_for_each_entry(entry, &asym_cap_list, link) { + if (capacity == entry->capacity) + goto done; + } + + entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL); + if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n")) + return; + entry->capacity = capacity; + list_add(&entry->link, &asym_cap_list); +done: + __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); +} + +/* + * Build-up/update list of CPUs grouped by their capacities + * An update requires explicit request to rebuild sched domains + * with state indicating CPU topology changes. + */ +static void asym_cpu_capacity_scan(void) +{ + struct asym_cap_data *entry, *next; + int cpu; + + list_for_each_entry(entry, &asym_cap_list, link) + cpumask_clear(cpu_capacity_span(entry)); + + for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN)) + asym_cpu_capacity_update_data(cpu); + + list_for_each_entry_safe(entry, next, &asym_cap_list, link) { + if (cpumask_empty(cpu_capacity_span(entry))) { + list_del(&entry->link); + kfree(entry); + } + } + + /* + * Only one capacity value has been detected i.e. this system is symmetric. + * No need to keep this data around. + */ + if (list_is_singular(&asym_cap_list)) { + entry = list_first_entry(&asym_cap_list, typeof(*entry), link); + list_del(&entry->link); + kfree(entry); + } +} + +/* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ @@ -1399,11 +1509,12 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, - struct sched_domain *child, int dflags, int cpu) + struct sched_domain *child, int cpu) { struct sd_data *sdd = &tl->data; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); int sd_id, sd_weight, sd_flags = 0; + struct cpumask *sd_span; #ifdef CONFIG_NUMA /* @@ -1420,9 +1531,6 @@ sd_init(struct sched_domain_topology_level *tl, "wrong sd_flags in topology description\n")) sd_flags &= TOPOLOGY_SD_FLAGS; - /* Apply detected topology flags */ - sd_flags |= dflags; - *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, @@ -1454,13 +1562,19 @@ sd_init(struct sched_domain_topology_level *tl, #endif }; - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - sd_id = cpumask_first(sched_domain_span(sd)); + sd_span = sched_domain_span(sd); + cpumask_and(sd_span, cpu_map, tl->mask(cpu)); + sd_id = cpumask_first(sd_span); + + sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); + + WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) == + (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), + "CPU capacity asymmetry not supported on SMT\n"); /* * Convert topological properties into behaviour. */ - /* Don't attempt to spread across CPUs of different capacities. */ if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) sd->child->flags &= ~SD_PREFER_SIBLING; @@ -1926,9 +2040,9 @@ static void __sdt_free(const struct cpumask *cpu_map) static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int dflags, int cpu) + struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu); + struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); if (child) { sd->level = child->level + 1; @@ -1991,65 +2105,6 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl, } /* - * Find the sched_domain_topology_level where all CPU capacities are visible - * for all CPUs. - */ -static struct sched_domain_topology_level -*asym_cpu_capacity_level(const struct cpumask *cpu_map) -{ - int i, j, asym_level = 0; - bool asym = false; - struct sched_domain_topology_level *tl, *asym_tl = NULL; - unsigned long cap; - - /* Is there any asymmetry? */ - cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); - - for_each_cpu(i, cpu_map) { - if (arch_scale_cpu_capacity(i) != cap) { - asym = true; - break; - } - } - - if (!asym) - return NULL; - - /* - * Examine topology from all CPU's point of views to detect the lowest - * sched_domain_topology_level where a highest capacity CPU is visible - * to everyone. - */ - for_each_cpu(i, cpu_map) { - unsigned long max_capacity = arch_scale_cpu_capacity(i); - int tl_id = 0; - - for_each_sd_topology(tl) { - if (tl_id < asym_level) - goto next_level; - - for_each_cpu_and(j, tl->mask(i), cpu_map) { - unsigned long capacity; - - capacity = arch_scale_cpu_capacity(j); - - if (capacity <= max_capacity) - continue; - - max_capacity = capacity; - asym_level = tl_id; - asym_tl = tl; - } -next_level: - tl_id++; - } - } - - return asym_tl; -} - - -/* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs */ @@ -2061,7 +2116,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; - struct sched_domain_topology_level *tl_asym; bool has_asym = false; if (WARN_ON(cpumask_empty(cpu_map))) @@ -2071,24 +2125,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (alloc_state != sa_rootdomain) goto error; - tl_asym = asym_cpu_capacity_level(cpu_map); - /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; - int dflags = 0; sd = NULL; for_each_sd_topology(tl) { - if (tl == tl_asym) { - dflags |= SD_ASYM_CPUCAPACITY; - has_asym = true; - } if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) goto error; - sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); + sd = build_sched_domain(tl, cpu_map, attr, sd, i); + + has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; @@ -2217,6 +2266,7 @@ int sched_init_domains(const struct cpumask *cpu_map) zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); arch_update_cpu_topology(); + asym_cpu_capacity_scan(); ndoms_cur = 1; doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) @@ -2299,6 +2349,9 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], /* Let the architecture update CPU core mappings: */ new_topology = arch_update_cpu_topology(); + /* Trigger rebuilding CPU capacity asymmetry data */ + if (new_topology) + asym_cpu_capacity_scan(); if (!doms_new) { WARN_ON_ONCE(dattr_new); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 9f58049ac16d..057e17f3215d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -107,6 +107,7 @@ struct seccomp_knotif { * installing process should allocate the fd as normal. * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC * is allowed. + * @ioctl_flags: The flags used for the seccomp_addfd ioctl. * @ret: The return value of the installing process. It is set to the fd num * upon success (>= 0). * @completion: Indicates that the installing process has completed fd @@ -118,6 +119,7 @@ struct seccomp_kaddfd { struct file *file; int fd; unsigned int flags; + __u32 ioctl_flags; union { bool setfd; @@ -1065,18 +1067,37 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } -static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n) { + int fd; + /* * Remove the notification, and reset the list pointers, indicating * that it has been handled. */ list_del_init(&addfd->list); if (!addfd->setfd) - addfd->ret = receive_fd(addfd->file, addfd->flags); + fd = receive_fd(addfd->file, addfd->flags); else - addfd->ret = receive_fd_replace(addfd->fd, addfd->file, - addfd->flags); + fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + addfd->ret = fd; + + if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) { + /* If we fail reset and return an error to the notifier */ + if (fd < 0) { + n->state = SECCOMP_NOTIFY_SENT; + } else { + /* Return the FD we just added */ + n->flags = 0; + n->error = 0; + n->val = fd; + } + } + + /* + * Mark the notification as completed. From this point, addfd mem + * might be invalidated and we can't safely read it anymore. + */ complete(&addfd->completion); } @@ -1120,7 +1141,7 @@ static int seccomp_do_user_notification(int this_syscall, struct seccomp_kaddfd, list); /* Check if we were woken up by a addfd message */ if (addfd) - seccomp_handle_addfd(addfd); + seccomp_handle_addfd(addfd, &n); } while (n.state != SECCOMP_NOTIFY_REPLIED); @@ -1581,7 +1602,7 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, if (addfd.newfd_flags & ~O_CLOEXEC) return -EINVAL; - if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD) + if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND)) return -EINVAL; if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) @@ -1591,6 +1612,7 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, if (!kaddfd.file) return -EBADF; + kaddfd.ioctl_flags = addfd.flags; kaddfd.flags = addfd.newfd_flags; kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD; kaddfd.fd = addfd.newfd; @@ -1616,6 +1638,23 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, goto out_unlock; } + if (addfd.flags & SECCOMP_ADDFD_FLAG_SEND) { + /* + * Disallow queuing an atomic addfd + send reply while there are + * some addfd requests still to process. + * + * There is no clear reason to support it and allows us to keep + * the loop on the other side straight-forward. + */ + if (!list_empty(&knotif->addfd)) { + ret = -EBUSY; + goto out_unlock; + } + + /* Allow exactly only one reply */ + knotif->state = SECCOMP_NOTIFY_REPLIED; + } + list_add(&kaddfd.list, &knotif->addfd); complete(&knotif->ready); mutex_unlock(&filter->notify_lock); diff --git a/kernel/signal.c b/kernel/signal.c index f7c6ffcbd044..de0920353d30 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -412,8 +412,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, int override_rlimit, const unsigned int sigqueue_flags) { struct sigqueue *q = NULL; - struct user_struct *user; - int sigpending; + struct ucounts *ucounts = NULL; + long sigpending; /* * Protect access to @t credentials. This can go away when all @@ -424,77 +424,38 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, * changes from/to zero. */ rcu_read_lock(); - user = __task_cred(t)->user; - sigpending = atomic_inc_return(&user->sigpending); + ucounts = task_ucounts(t); + sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1); if (sigpending == 1) - get_uid(user); + ucounts = get_ucounts(ucounts); rcu_read_unlock(); - if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { - /* - * Preallocation does not hold sighand::siglock so it can't - * use the cache. The lockless caching requires that only - * one consumer and only one producer run at a time. - */ - q = READ_ONCE(t->sigqueue_cache); - if (!q || sigqueue_flags) - q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); - else - WRITE_ONCE(t->sigqueue_cache, NULL); + if (override_rlimit || (sigpending < LONG_MAX && sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { + q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); } else { print_dropped_signal(sig); } if (unlikely(q == NULL)) { - if (atomic_dec_and_test(&user->sigpending)) - free_uid(user); + if (ucounts && dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) + put_ucounts(ucounts); } else { INIT_LIST_HEAD(&q->list); q->flags = sigqueue_flags; - q->user = user; + q->ucounts = ucounts; } - return q; } -void exit_task_sigqueue_cache(struct task_struct *tsk) -{ - /* Race free because @tsk is mopped up */ - struct sigqueue *q = tsk->sigqueue_cache; - - if (q) { - tsk->sigqueue_cache = NULL; - /* - * Hand it back to the cache as the task might - * be self reaping which would leak the object. - */ - kmem_cache_free(sigqueue_cachep, q); - } -} - -static void sigqueue_cache_or_free(struct sigqueue *q) -{ - /* - * Cache one sigqueue per task. This pairs with the consumer side - * in __sigqueue_alloc() and needs READ/WRITE_ONCE() to prevent the - * compiler from store tearing and to tell KCSAN that the data race - * is intentional when run without holding current->sighand->siglock, - * which is fine as current obviously cannot run __sigqueue_free() - * concurrently. - */ - if (!READ_ONCE(current->sigqueue_cache)) - WRITE_ONCE(current->sigqueue_cache, q); - else - kmem_cache_free(sigqueue_cachep, q); -} - static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - if (atomic_dec_and_test(&q->user->sigpending)) - free_uid(q->user); - sigqueue_cache_or_free(q); + if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) { + put_ucounts(q->ucounts); + q->ucounts = NULL; + } + kmem_cache_free(sigqueue_cachep, q); } void flush_sigqueue(struct sigpending *queue) @@ -4719,7 +4680,7 @@ void kdb_send_sig(struct task_struct *t, int sig) } new_t = kdb_prev_t != t; kdb_prev_t = t; - if (t->state != TASK_RUNNING && new_t) { + if (!task_is_running(t) && new_t) { spin_unlock(&t->sighand->siglock); kdb_printf("Process is not RUNNING, sending a signal from " "kdb risks deadlock\n" diff --git a/kernel/smpboot.c b/kernel/smpboot.c index f25208e8df83..e4163042c4d6 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -33,7 +33,6 @@ struct task_struct *idle_thread_get(unsigned int cpu) if (!tsk) return ERR_PTR(-ENOMEM); - init_idle(tsk, cpu); return tsk; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 4992853ef53d..f3a012179f47 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -76,7 +76,7 @@ static void wakeup_softirqd(void) /* Interrupts are disabled: no need to stop preemption */ struct task_struct *tsk = __this_cpu_read(ksoftirqd); - if (tsk && tsk->state != TASK_RUNNING) + if (tsk) wake_up_process(tsk); } @@ -92,8 +92,7 @@ static bool ksoftirqd_running(unsigned long pending) if (pending & SOFTIRQ_NOW_MASK) return false; - return tsk && (tsk->state == TASK_RUNNING) && - !__kthread_should_park(tsk); + return tsk && task_is_running(tsk) && !__kthread_should_park(tsk); } #ifdef CONFIG_TRACE_IRQFLAGS diff --git a/kernel/sys.c b/kernel/sys.c index 3a583a29815f..ef1a78f5d71c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -479,7 +479,7 @@ static int set_user(struct cred *new) * for programs doing set*uid()+execve() by harmlessly deferring the * failure to the execve() stage. */ - if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && + if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && new_user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else @@ -558,6 +558,10 @@ long __sys_setreuid(uid_t ruid, uid_t euid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -616,6 +620,10 @@ long __sys_setuid(uid_t uid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -691,6 +699,10 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -2550,6 +2562,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = set_syscall_user_dispatch(arg2, arg3, arg4, (char __user *) arg5); break; +#ifdef CONFIG_SCHED_CORE + case PR_SCHED_CORE: + error = sched_core_share_pid(arg2, arg3, arg4, arg5); + break; +#endif default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d4a78e08f6d8..8c8c220637ce 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -71,6 +71,7 @@ #include <linux/coredump.h> #include <linux/latencytop.h> #include <linux/pid.h> +#include <linux/delayacct.h> #include "../lib/kstrtox.h" @@ -1747,6 +1748,17 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_TASK_DELAY_ACCT + { + .procname = "task_delayacct", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_delayacct, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_TASK_DELAY_ACCT */ #ifdef CONFIG_NUMA_BALANCING { .procname = "numa_balancing", diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 83e158d016ba..7df71ef0e1fd 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -117,13 +117,14 @@ config NO_HZ_FULL the task mostly runs in userspace and has few kernel activity. You need to fill up the nohz_full boot parameter with the - desired range of dynticks CPUs. + desired range of dynticks CPUs to use it. This is implemented at + the expense of some overhead in user <-> kernel transitions: + syscalls, exceptions and interrupts. - This is implemented at the expense of some overhead in user <-> kernel - transitions: syscalls, exceptions and interrupts. Even when it's - dynamically off. + By default, without passing the nohz_full parameter, this behaves just + like NO_HZ_IDLE. - Say N. + If you're a distro say Y. endchoice diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3bb96a8b49c9..29a5e54e6e10 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -523,7 +523,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p) if (CPUCLOCK_PERTHREAD(timer->it_clock)) tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); else - tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); + tick_dep_set_signal(p, TICK_DEP_BIT_POSIX_TIMER); } /* @@ -1358,7 +1358,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, if (*newval < *nextevt) *nextevt = *newval; - tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); + tick_dep_set_signal(tsk, TICK_DEP_BIT_POSIX_TIMER); } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6784f27a3099..6bffe5af8cb1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -323,6 +323,46 @@ void tick_nohz_full_kick_cpu(int cpu) irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } +static void tick_nohz_kick_task(struct task_struct *tsk) +{ + int cpu; + + /* + * If the task is not running, run_posix_cpu_timers() + * has nothing to elapse, IPI can then be spared. + * + * activate_task() STORE p->tick_dep_mask + * STORE p->on_rq + * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) + * LOCK rq->lock LOAD p->on_rq + * smp_mb__after_spin_lock() + * tick_nohz_task_switch() + * LOAD p->tick_dep_mask + */ + if (!sched_task_on_rq(tsk)) + return; + + /* + * If the task concurrently migrates to another CPU, + * we guarantee it sees the new tick dependency upon + * schedule. + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() STORE p->tick_dep_mask + * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) + * LOAD p->tick_dep_mask LOAD p->cpu + */ + cpu = task_cpu(tsk); + + preempt_disable(); + if (cpu_online(cpu)) + tick_nohz_full_kick_cpu(cpu); + preempt_enable(); +} + /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. @@ -405,19 +445,8 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { - if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) { - if (tsk == current) { - preempt_disable(); - tick_nohz_full_kick(); - preempt_enable(); - } else { - /* - * Some future tick_nohz_full_kick_task() - * should optimize this. - */ - tick_nohz_full_kick_all(); - } - } + if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) + tick_nohz_kick_task(tsk); } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); @@ -431,9 +460,20 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. */ -void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit) +void tick_nohz_dep_set_signal(struct task_struct *tsk, + enum tick_dep_bits bit) { - tick_nohz_dep_set_all(&sig->tick_dep_mask, bit); + int prev; + struct signal_struct *sig = tsk->signal; + + prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); + if (!prev) { + struct task_struct *t; + + lockdep_assert_held(&tsk->sighand->siglock); + __for_each_thread(sig, t) + tick_nohz_kick_task(t); + } } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) @@ -448,13 +488,10 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi */ void __tick_nohz_task_switch(void) { - unsigned long flags; struct tick_sched *ts; - local_irq_save(flags); - if (!tick_nohz_full_cpu(smp_processor_id())) - goto out; + return; ts = this_cpu_ptr(&tick_cpu_sched); @@ -463,8 +500,6 @@ void __tick_nohz_task_switch(void) atomic_read(¤t->signal->tick_dep_mask)) tick_nohz_full_kick(); } -out: - local_irq_restore(flags); } /* Get the boot-time nohz CPU list from the kernel parameters. */ @@ -922,27 +957,31 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) * Cancel the scheduled timer and restore the tick */ ts->tick_stopped = 0; - ts->idle_exittime = now; - tick_nohz_restart(ts, now); } -static void tick_nohz_full_update_tick(struct tick_sched *ts) +static void __tick_nohz_full_update_tick(struct tick_sched *ts, + ktime_t now) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); - if (!tick_nohz_full_cpu(cpu)) + if (can_stop_full_tick(cpu, ts)) + tick_nohz_stop_sched_tick(ts, cpu); + else if (ts->tick_stopped) + tick_nohz_restart_sched_tick(ts, now); +#endif +} + +static void tick_nohz_full_update_tick(struct tick_sched *ts) +{ + if (!tick_nohz_full_cpu(smp_processor_id())) return; if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (can_stop_full_tick(cpu, ts)) - tick_nohz_stop_sched_tick(ts, cpu); - else if (ts->tick_stopped) - tick_nohz_restart_sched_tick(ts, ktime_get()); -#endif + __tick_nohz_full_update_tick(ts, ktime_get()); } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) @@ -1189,11 +1228,13 @@ unsigned long tick_nohz_get_idle_calls(void) return ts->idle_calls; } -static void tick_nohz_account_idle_ticks(struct tick_sched *ts) +static void tick_nohz_account_idle_time(struct tick_sched *ts, + ktime_t now) { -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unsigned long ticks; + ts->idle_exittime = now; + if (vtime_accounting_enabled_this_cpu()) return; /* @@ -1207,21 +1248,27 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) */ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); -#endif } -static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now) +void tick_nohz_idle_restart_tick(void) { - tick_nohz_restart_sched_tick(ts, now); - tick_nohz_account_idle_ticks(ts); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->tick_stopped) { + ktime_t now = ktime_get(); + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_time(ts, now); + } } -void tick_nohz_idle_restart_tick(void) +static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) { - struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + if (tick_nohz_full_cpu(smp_processor_id())) + __tick_nohz_full_update_tick(ts, now); + else + tick_nohz_restart_sched_tick(ts, now); - if (ts->tick_stopped) - __tick_nohz_idle_restart_tick(ts, ktime_get()); + tick_nohz_account_idle_time(ts, now); } /** @@ -1253,7 +1300,7 @@ void tick_nohz_idle_exit(void) tick_nohz_stop_idle(ts, now); if (tick_stopped) - __tick_nohz_idle_restart_tick(ts, now); + tick_nohz_idle_update_tick(ts, now); local_irq_enable(); } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d111adf4a0cb..467087d7bdb6 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1879,7 +1879,7 @@ signed long __sched schedule_timeout(signed long timeout) printk(KERN_ERR "schedule_timeout: wrong timeout " "value %lx\n", timeout); dump_stack(); - current->state = TASK_RUNNING; + __set_current_state(TASK_RUNNING); goto out; } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9299057feb56..d23a09d3eb37 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2198,9 +2198,6 @@ struct saved_cmdlines_buffer { }; static struct saved_cmdlines_buffer *savedcmd; -/* temporary disable recording */ -static atomic_t trace_record_taskinfo_disabled __read_mostly; - static inline char *get_saved_cmdlines(int idx) { return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; @@ -2486,8 +2483,6 @@ static bool tracing_record_taskinfo_skip(int flags) { if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) return true; - if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on()) - return true; if (!__this_cpu_read(trace_taskinfo_save)) return true; return false; @@ -3998,9 +3993,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-EBUSY); #endif - if (!iter->snapshot) - atomic_inc(&trace_record_taskinfo_disabled); - if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; @@ -4043,9 +4035,6 @@ static void s_stop(struct seq_file *m, void *p) return; #endif - if (!iter->snapshot) - atomic_dec(&trace_record_taskinfo_disabled); - trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index c1637f90c8a3..4702efb00ff2 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -115,9 +115,9 @@ u64 notrace trace_clock_global(void) prev_time = READ_ONCE(trace_clock_struct.prev_time); now = sched_clock_cpu(this_cpu); - /* Make sure that now is always greater than prev_time */ + /* Make sure that now is always greater than or equal to prev_time */ if ((s64)(now - prev_time) < 0) - now = prev_time + 1; + now = prev_time; /* * If in an NMI context then dont risk lockups and simply return @@ -131,7 +131,7 @@ u64 notrace trace_clock_global(void) /* Reread prev_time in case it was already updated */ prev_time = READ_ONCE(trace_clock_struct.prev_time); if ((s64)(now - prev_time) < 0) - now = prev_time + 1; + now = prev_time; trace_clock_struct.prev_time = now; diff --git a/kernel/ucount.c b/kernel/ucount.c index 8d8874f1c35e..87799e2379bd 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -8,6 +8,12 @@ #include <linux/kmemleak.h> #include <linux/user_namespace.h> +struct ucounts init_ucounts = { + .ns = &init_user_ns, + .uid = GLOBAL_ROOT_UID, + .count = ATOMIC_INIT(1), +}; + #define UCOUNTS_HASHTABLE_BITS 10 static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; static DEFINE_SPINLOCK(ucounts_lock); @@ -78,6 +84,10 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_fanotify_groups"), UCOUNT_ENTRY("max_fanotify_marks"), #endif + { }, + { }, + { }, + { }, { } }; #endif /* CONFIG_SYSCTL */ @@ -129,7 +139,24 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc return NULL; } -static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) +static void hlist_add_ucounts(struct ucounts *ucounts) +{ + struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + spin_lock_irq(&ucounts_lock); + hlist_add_head(&ucounts->node, hashent); + spin_unlock_irq(&ucounts_lock); +} + +struct ucounts *get_ucounts(struct ucounts *ucounts) +{ + if (ucounts && atomic_add_negative(1, &ucounts->count)) { + put_ucounts(ucounts); + ucounts = NULL; + } + return ucounts; +} + +struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); struct ucounts *ucounts, *new; @@ -145,7 +172,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) new->ns = ns; new->uid = uid; - new->count = 0; + atomic_set(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -153,40 +180,35 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) kfree(new); } else { hlist_add_head(&new->node, hashent); - ucounts = new; + spin_unlock_irq(&ucounts_lock); + return new; } } - if (ucounts->count == INT_MAX) - ucounts = NULL; - else - ucounts->count += 1; spin_unlock_irq(&ucounts_lock); + ucounts = get_ucounts(ucounts); return ucounts; } -static void put_ucounts(struct ucounts *ucounts) +void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - spin_lock_irqsave(&ucounts_lock, flags); - ucounts->count -= 1; - if (!ucounts->count) + if (atomic_dec_and_test(&ucounts->count)) { + spin_lock_irqsave(&ucounts_lock, flags); hlist_del_init(&ucounts->node); - else - ucounts = NULL; - spin_unlock_irqrestore(&ucounts_lock, flags); - - kfree(ucounts); + spin_unlock_irqrestore(&ucounts_lock, flags); + kfree(ucounts); + } } -static inline bool atomic_inc_below(atomic_t *v, int u) +static inline bool atomic_long_inc_below(atomic_long_t *v, int u) { - int c, old; - c = atomic_read(v); + long c, old; + c = atomic_long_read(v); for (;;) { if (unlikely(c >= u)) return false; - old = atomic_cmpxchg(v, c, c+1); + old = atomic_long_cmpxchg(v, c, c+1); if (likely(old == c)) return true; c = old; @@ -198,19 +220,19 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, { struct ucounts *ucounts, *iter, *bad; struct user_namespace *tns; - ucounts = get_ucounts(ns, uid); + ucounts = alloc_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { - int max; + long max; tns = iter->ns; max = READ_ONCE(tns->ucount_max[type]); - if (!atomic_inc_below(&iter->ucount[type], max)) + if (!atomic_long_inc_below(&iter->ucount[type], max)) goto fail; } return ucounts; fail: bad = iter; for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) - atomic_dec(&iter->ucount[type]); + atomic_long_dec(&iter->ucount[type]); put_ucounts(ucounts); return NULL; @@ -220,12 +242,54 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) { struct ucounts *iter; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - int dec = atomic_dec_if_positive(&iter->ucount[type]); + long dec = atomic_long_dec_if_positive(&iter->ucount[type]); WARN_ON_ONCE(dec < 0); } put_ucounts(ucounts); } +long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +{ + struct ucounts *iter; + long ret = 0; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long max = READ_ONCE(iter->ns->ucount_max[type]); + long new = atomic_long_add_return(v, &iter->ucount[type]); + if (new < 0 || new > max) + ret = LONG_MAX; + else if (iter == ucounts) + ret = new; + } + return ret; +} + +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +{ + struct ucounts *iter; + long new = -1; /* Silence compiler warning */ + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long dec = atomic_long_add_return(-v, &iter->ucount[type]); + WARN_ON_ONCE(dec < 0); + if (iter == ucounts) + new = dec; + } + return (new == 0); +} + +bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max) +{ + struct ucounts *iter; + if (get_ucounts_value(ucounts, type) > max) + return true; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + max = READ_ONCE(iter->ns->ucount_max[type]); + if (get_ucounts_value(iter, type) > max) + return true; + } + return false; +} + static __init int user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL @@ -241,6 +305,8 @@ static __init int user_namespace_sysctl_init(void) BUG_ON(!user_header); BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif + hlist_add_ucounts(&init_ucounts); + inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1); return 0; } subsys_initcall(user_namespace_sysctl_init); diff --git a/kernel/user.c b/kernel/user.c index a2478cddf536..c82399c1618a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -98,9 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), - .processes = ATOMIC_INIT(1), - .sigpending = ATOMIC_INIT(0), - .locked_shm = 0, .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8d62863721b0..ef82d401dde8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -119,9 +119,13 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); - for (i = 0; i < UCOUNT_COUNTS; i++) { + for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ @@ -1340,6 +1344,9 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns) put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); + if (set_cred_ucounts(cred) < 0) + return -EINVAL; + return 0; } |