Diffstat (limited to 'kernel')
79 files changed, 3824 insertions, 2025 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index a8d923b5481b..48c5376d290a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o +obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ @@ -111,7 +112,6 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o -obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_RSEQ) += rseq.o obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8191a7db2777..66088a9e9b9e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -890,7 +890,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, - struct bpf_insn *to_buff) + struct bpf_insn *to_buff, + bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_int(); @@ -1005,6 +1006,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + if (emit_zext) + *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; @@ -1088,7 +1091,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); - rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, + clone->aux->verifier_zext); if (!rewritten) continue; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5d141f16f6fa..272071e9112f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1707,20 +1707,26 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (err) goto free_used_maps; - err = bpf_prog_new_fd(prog); - if (err < 0) { - /* failed to allocate fd. - * bpf_prog_put() is needed because the above - * bpf_prog_alloc_id() has published the prog - * to the userspace and the userspace may - * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. - */ - bpf_prog_put(prog); - return err; - } - + /* Upon success of bpf_prog_alloc_id(), the BPF prog is + * effectively publicly exposed. However, retrieving via + * bpf_prog_get_fd_by_id() will take another reference, + * therefore it cannot be gone underneath us. + * + * Only for the time /after/ successful bpf_prog_new_fd() + * and before returning to userspace, we might just hold + * one reference and any parallel close on that fd could + * rip everything out. Hence, below notifications must + * happen before bpf_prog_new_fd(). + * + * Also, any failure handling from this point onwards must + * be using bpf_prog_put() given the program is exposed. 
+ */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); + + err = bpf_prog_new_fd(prog); + if (err < 0) + bpf_prog_put(prog); return err; free_used_maps: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5900cbb966b1..c36a719fee6d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -985,9 +985,6 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; - - /* constant backtracking is enabled for root only for now */ - reg->precise = capable(CAP_SYS_ADMIN) ? false : true; } /* Mark a register as having a completely unknown (scalar) value. */ @@ -1014,7 +1011,11 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, __mark_reg_not_init(regs + regno); return; } - __mark_reg_unknown(regs + regno); + regs += regno; + __mark_reg_unknown(regs); + /* constant backtracking is enabled for root without bpf2bpf calls */ + regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? + true : false; } static void __mark_reg_not_init(struct bpf_reg_state *reg) @@ -1771,16 +1772,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bitmap_from_u64(mask, stack_mask); for_each_set_bit(i, mask, 64) { if (i >= func->allocated_stack / BPF_REG_SIZE) { - /* This can happen if backtracking - * is propagating stack precision where - * caller has larger stack frame - * than callee, but backtrack_insn() should - * have returned -ENOTSUPP. + /* the sequence of instructions: + * 2: (bf) r3 = r10 + * 3: (7b) *(u64 *)(r3 -8) = r0 + * 4: (79) r4 = *(u64 *)(r10 -8) + * doesn't contain jmps. It's backtracked + * as a single block. + * During backtracking insn 3 is not recognized as + * stack access, so at the end of backtracking + * stack slot fp-8 is still marked in stack_mask. + * However the parent state may not have accessed + * fp-8 and it's "unallocated" stack space. + * In such case fallback to conservative. */ - verbose(env, "BUG spi %d stack_size %d\n", - i, func->allocated_stack); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; + mark_all_scalars_precise(env, st); + return 0; } if (func->stack[i].slot_type[0] != STACK_SPILL) { @@ -8616,8 +8622,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - u8 shift = (off & (size_default - 1)) * 8; - + u8 shift = bpf_ctx_narrow_load_shift(off, size, + size_default); if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 753afbca549f..a7ce73a2c401 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1891,7 +1891,7 @@ static int cgroup_reconfigure(struct fs_context *fc) */ static bool use_task_css_set_links __read_mostly; -static void cgroup_enable_task_cg_lists(void) +void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; @@ -5255,8 +5255,16 @@ static struct cgroup *cgroup_create(struct cgroup *parent) * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; - if (cgrp->freezer.e_freeze) + if (cgrp->freezer.e_freeze) { + /* + * Set the CGRP_FREEZE flag, so when a process will be + * attached to the child cgroup, it will become frozen. + * At this point the new cgroup is unpopulated, so we can + * consider it frozen immediately. 
+ */ + set_bit(CGRP_FREEZE, &cgrp->flags); set_bit(CGRP_FROZEN, &cgrp->flags); + } spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5aa37531ce76..c52bc91f882b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -45,6 +45,7 @@ #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/deadline.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/seq_file.h> @@ -332,7 +333,18 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ -static DEFINE_MUTEX(cpuset_mutex); +DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); + +void cpuset_read_lock(void) +{ + percpu_down_read(&cpuset_rwsem); +} + +void cpuset_read_unlock(void) +{ + percpu_up_read(&cpuset_rwsem); +} + static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; @@ -894,6 +906,67 @@ done: return ndoms; } +static void update_tasks_root_domain(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + + while ((task = css_task_iter_next(&it))) + dl_add_task_root_domain(task); + + css_task_iter_end(&it); +} + +static void rebuild_root_domains(void) +{ + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; + + percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + + cgroup_enable_task_cg_lists(); + + rcu_read_lock(); + + /* + * Clear default root domain DL accounting, it will be computed again + * if a task belongs to it. + */ + dl_clear_root_domain(&def_root_domain); + + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + + if (cpumask_empty(cs->effective_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + css_get(&cs->css); + + rcu_read_unlock(); + + update_tasks_root_domain(cs); + + rcu_read_lock(); + css_put(&cs->css); + } + rcu_read_unlock(); +} + +static void +partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); + rebuild_root_domains(); + mutex_unlock(&sched_domains_mutex); +} + /* * Rebuild scheduler domains. * @@ -911,8 +984,8 @@ static void rebuild_sched_domains_locked(void) cpumask_var_t *doms; int ndoms; - lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); + lockdep_assert_cpus_held(); + percpu_rwsem_assert_held(&cpuset_rwsem); /* * We have raced with CPU hotplug. 
Don't do anything to avoid @@ -921,19 +994,17 @@ static void rebuild_sched_domains_locked(void) */ if (!top_cpuset.nr_subparts_cpus && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; if (top_cpuset.nr_subparts_cpus && !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); + partition_and_rebuild_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ static void rebuild_sched_domains_locked(void) @@ -943,9 +1014,11 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); rebuild_sched_domains_locked(); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); } /** @@ -1051,7 +1124,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ bool part_error = false; /* Partition error? */ - lockdep_assert_held(&cpuset_mutex); + percpu_rwsem_assert_held(&cpuset_rwsem); /* * The parent must be a partition root. @@ -2039,7 +2112,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); cs = css_cs(css); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; @@ -2063,7 +2136,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) cs->attach_in_progress++; ret = 0; out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); return ret; } @@ -2073,9 +2146,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); css_cs(css)->attach_in_progress--; - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /* @@ -2098,7 +2171,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* prepare for attach */ if (cs == &top_cpuset) @@ -2152,7 +2225,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /* The various types of files and directories in a cpuset file system */ @@ -2183,7 +2256,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -2219,7 +2293,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return retval; } @@ -2230,7 +2305,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2243,7 +2319,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, 
struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return retval; } @@ -2282,7 +2359,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2306,7 +2384,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_cpuset(trialcs); out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2437,13 +2516,15 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, return -EINVAL; css_get(&cs->css); - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; retval = update_prstate(cs, val); out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); css_put(&cs->css); return retval ?: nbytes; } @@ -2649,7 +2730,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) @@ -2700,7 +2782,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return 0; } @@ -2719,7 +2802,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - mutex_lock(&cpuset_mutex); + get_online_cpus(); + percpu_down_write(&cpuset_rwsem); if (is_partition_root(cs)) update_prstate(cs, 0); @@ -2738,7 +2822,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -2750,7 +2835,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void cpuset_bind(struct cgroup_subsys_state *root_css) { - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { @@ -2763,7 +2848,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) } spin_unlock_irq(&callback_lock); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /* @@ -2805,6 +2890,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = { int __init cpuset_init(void) { + BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); @@ -2876,7 +2963,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); /* * Move tasks to the nearest ancestor with execution resources, @@ -2886,7 +2973,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (is_empty) remove_tasks_in_empty_cpuset(cs); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); } static void @@ -2936,14 
+3023,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* * We have raced with task attaching. We wait until attaching * is finished, so we won't attach a task to an empty cpuset. */ if (cs->attach_in_progress) { - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); goto retry; } @@ -3011,7 +3098,7 @@ update_tasks: hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /** @@ -3041,7 +3128,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); @@ -3091,7 +3178,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) update_tasks_nodemask(&top_cpuset); } - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { diff --git a/kernel/configs.c b/kernel/configs.c index b062425ccf8d..c09ea4c995e1 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * kernel/configs.c * Echo the kernel .config file used to build the kernel @@ -6,21 +7,6 @@ * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net> * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com> * Copyright (C) 2002 Hewlett-Packard Company - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/kernel.h> diff --git a/kernel/cpu.c b/kernel/cpu.c index 05778e32674a..e1967e9eddc2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2298,6 +2298,9 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); +atomic_t __num_online_cpus __read_mostly; +EXPORT_SYMBOL(__num_online_cpus); + void init_cpu_present(const struct cpumask *src) { cpumask_copy(&__cpu_present_mask, src); @@ -2313,6 +2316,27 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(&__cpu_online_mask, src); } +void set_cpu_online(unsigned int cpu, bool online) +{ + /* + * atomic_inc/dec() is required to handle the horrid abuse of this + * function by the reboot and kexec code which invoke it from + * IPI/NMI broadcasts when shutting down CPUs. Invocation from + * regular CPU hotplug is properly serialized. + * + * Note, that the fact that __num_online_cpus is of type atomic_t + * does not protect readers which are not serialized against + * concurrent hotplug operations. 
+ */ + if (online) { + if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask)) + atomic_inc(&__num_online_cpus); + } else { + if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask)) + atomic_dec(&__num_online_cpus); + } +} + /* * Activate the first processor. */ diff --git a/kernel/cred.c b/kernel/cred.c index f9a0ce66c9c3..c0a4c12d38b2 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -144,7 +144,10 @@ void __put_cred(struct cred *cred) BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); - call_rcu(&cred->rcu, put_cred_rcu); + if (cred->non_rcu) + put_cred_rcu(&cred->rcu); + else + call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); @@ -261,6 +264,7 @@ struct cred *prepare_creds(void) old = task->cred; memcpy(new, old, sizeof(struct cred)); + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_group_info(new->group_info); @@ -544,7 +548,19 @@ const struct cred *override_creds(const struct cred *new) validate_creds(old); validate_creds(new); - get_cred(new); + + /* + * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. + * + * That means that we do not clear the 'non_rcu' flag, since + * we are only installing the cred into the thread-synchronous + * '->cred' pointer, not the '->real_cred' pointer that is + * visible to other threads under RCU. + * + * Also note that we did validate_creds() manually, not depending + * on the validation in 'get_cred()'. + */ + get_new_cred((struct cred *)new); alter_cred_subscribers(new, 1); rcu_assign_pointer(current->cred, new); alter_cred_subscribers(old, -1); @@ -681,6 +697,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) validate_creds(old); *new = *old; + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_uid(new->user); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index bfc0c17f2a3d..69cfb4345388 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -230,9 +230,7 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { - int node = dev ? 
dev_to_node(dev) : NUMA_NO_NODE; - size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT; - size_t align = get_order(PAGE_ALIGN(size)); + size_t count = size >> PAGE_SHIFT; struct page *page = NULL; struct cma *cma = NULL; @@ -243,13 +241,12 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) /* CMA can be used only in the context which permits sleeping */ if (cma && gfpflags_allow_blocking(gfp)) { - align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); - page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN); + size_t align = get_order(size); + size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); + + page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); } - /* Fallback allocation of normal pages */ - if (!page) - page = alloc_pages_node(node, gfp, align); return page; } @@ -266,7 +263,8 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) */ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) { - if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT)) + if (!cma_release(dev_get_cma_area(dev), page, + PAGE_ALIGN(size) >> PAGE_SHIFT)) __free_pages(page, get_order(size)); } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 59bdceea3737..8402b29c280f 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -47,9 +47,6 @@ u64 dma_direct_get_required_mask(struct device *dev) { u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); - if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma) - max_dma = dev->bus_dma_mask; - return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } @@ -88,6 +85,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { + size_t alloc_size = PAGE_ALIGN(size); + int node = dev_to_node(dev); struct page *page = NULL; u64 phys_mask; @@ -98,8 +97,14 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp &= ~__GFP_ZERO; gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_mask); + page = dma_alloc_contiguous(dev, alloc_size, gfp); + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + dma_free_contiguous(dev, page, alloc_size); + page = NULL; + } again: - page = dma_alloc_contiguous(dev, size, gfp); + if (!page) + page = alloc_pages_node(node, gfp, get_order(alloc_size)); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); page = NULL; @@ -130,10 +135,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) { /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); + *dma_handle = phys_to_dma(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ return page; } @@ -178,7 +185,8 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ __dma_direct_free_pages(dev, size, cpu_addr); return; @@ -297,7 +305,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, dma_direct_sync_single_for_cpu(dev, addr, size, dir); if 
(unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); } EXPORT_SYMBOL(dma_direct_unmap_page); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 1f628e7ac709..b0038ca3aa92 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -116,11 +116,16 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, int ret; if (!dev_is_dma_coherent(dev)) { + unsigned long pfn; + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) return -ENXIO; - page = pfn_to_page(arch_dma_coherent_to_pfn(dev, cpu_addr, - dma_addr)); + /* If the PFN is not valid, we do not have a struct page */ + pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); + if (!pfn_valid(pfn)) + return -ENXIO; + page = pfn_to_page(pfn); } else { page = virt_to_page(cpu_addr); } @@ -145,6 +150,23 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, } EXPORT_SYMBOL(dma_get_sgtable_attrs); +#ifdef CONFIG_MMU +/* + * Return the page attributes used for mapping dma_alloc_* memory, either in + * kernel space if remapping is needed, or to userspace through dma_mmap_*. + */ +pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) +{ + if (dev_is_dma_coherent(dev) || + (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && + (attrs & DMA_ATTR_NON_CONSISTENT))) + return prot; + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT)) + return arch_dma_mmap_pgprot(dev, prot, attrs); + return pgprot_noncached(prot); +} +#endif /* CONFIG_MMU */ + /* * Create userspace mapping for the DMA-coherent memory. */ @@ -159,7 +181,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long pfn; int ret = -ENXIO; - vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs); + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; @@ -170,7 +192,11 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, if (!dev_is_dma_coherent(dev)) { if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) return -ENXIO; + + /* If the PFN is not valid, we do not have a struct page */ pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); + if (!pfn_valid(pfn)) + return -ENXIO; } else { pfn = page_to_pfn(virt_to_page(cpu_addr)); } diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index a594aec07882..ffe78f0b2fe4 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -218,7 +218,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, VM_USERMAP, - arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), + dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { __dma_direct_free_pages(dev, size, page); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9de232229063..796a44f8ef5a 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -444,7 +444,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, size_t size, + phys_addr_t orig_addr, + size_t mapping_size, + size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { @@ -464,6 +466,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, pr_warn_once("%s is active and system is using DMA bounce buffers\n", sme_active() ? 
"SME" : "SEV"); + if (mapping_size > alloc_size) { + dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", + mapping_size, alloc_size); + return (phys_addr_t)DMA_MAPPING_ERROR; + } + mask = dma_get_seg_boundary(hwdev); tbl_dma_addr &= mask; @@ -471,8 +479,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; /* - * Carefully handle integer overflow which can occur when mask == ~0UL. - */ + * Carefully handle integer overflow which can occur when mask == ~0UL. + */ max_slots = mask + 1 ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); @@ -481,8 +489,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. */ - nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - if (size >= PAGE_SIZE) + nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + if (alloc_size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else stride = 1; @@ -547,7 +555,7 @@ not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - size, io_tlb_nslabs, tmp_io_tlb_used); + alloc_size, io_tlb_nslabs, tmp_io_tlb_used); return (phys_addr_t)DMA_MAPPING_ERROR; found: io_tlb_used += nslots; @@ -562,7 +570,7 @@ found: io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); + swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } @@ -571,11 +579,11 @@ found: * tlb_addr is the physical address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; @@ -585,7 +593,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, if (orig_addr != INVALID_PHYS_ADDR && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); + swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -665,14 +673,14 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, /* Oh well, have to allocate and map a bounce buffer. 
*/ *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), - *phys, size, dir, attrs); + *phys, size, size, dir, attrs); if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) return false; /* Ensure that the address returned is DMA'ble */ *dma_addr = __phys_to_dma(dev, *phys); if (unlikely(!dma_capable(dev, *dma_addr, size))) { - swiotlb_tbl_unmap_single(dev, *phys, size, dir, + swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return false; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 026a14541a38..1c414b8866b4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1887,6 +1887,89 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) ctx->generation++; } +static int +perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) +{ + if (!has_aux(aux_event)) + return 0; + + if (!event->pmu->aux_output_match) + return 0; + + return event->pmu->aux_output_match(aux_event); +} + +static void put_event(struct perf_event *event); +static void event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx); + +static void perf_put_aux_event(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event *iter; + + /* + * If event uses aux_event tear down the link + */ + if (event->aux_event) { + iter = event->aux_event; + event->aux_event = NULL; + put_event(iter); + return; + } + + /* + * If the event is an aux_event, tear down all links to + * it from other events. + */ + for_each_sibling_event(iter, event->group_leader) { + if (iter->aux_event != event) + continue; + + iter->aux_event = NULL; + put_event(event); + + /* + * If it's ACTIVE, schedule it out and put it into ERROR + * state so that we don't try to schedule it again. Note + * that perf_event_enable() will clear the ERROR status. + */ + event_sched_out(iter, cpuctx, ctx); + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + } +} + +static int perf_get_aux_event(struct perf_event *event, + struct perf_event *group_leader) +{ + /* + * Our group leader must be an aux event if we want to be + * an aux_output. This way, the aux event will precede its + * aux_output events in the group, and therefore will always + * schedule first. + */ + if (!group_leader) + return 0; + + if (!perf_aux_output_match(event, group_leader)) + return 0; + + if (!atomic_long_inc_not_zero(&group_leader->refcount)) + return 0; + + /* + * Link aux_outputs to their aux event; this is undone in + * perf_group_detach() by perf_put_aux_event(). When the + * group in torn down, the aux_output events loose their + * link to the aux_event and can't schedule any more. + */ + event->aux_event = group_leader; + + return 1; +} + static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; @@ -1902,6 +1985,8 @@ static void perf_group_detach(struct perf_event *event) event->attach_state &= ~PERF_ATTACH_GROUP; + perf_put_aux_event(event); + /* * If this is a sibling, remove it from its group. 
*/ @@ -4089,10 +4174,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task) return NULL; __perf_event_init_context(ctx); - if (task) { - ctx->task = task; - get_task_struct(task); - } + if (task) + ctx->task = get_task_struct(task); ctx->pmu = pmu; return ctx; @@ -10355,8 +10438,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ - get_task_struct(task); - event->hw.target = task; + event->hw.target = get_task_struct(task); } event->clock = &local_clock; @@ -10426,6 +10508,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + if (event->attr.aux_output && + !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { + err = -EOPNOTSUPP; + goto err_pmu; + } + err = exclusive_event_init(event); if (err) goto err_pmu; @@ -11082,6 +11170,8 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (event->attr.aux_output && !perf_get_aux_event(event, group_leader)) + goto err_locked; /* * Must be under the same ctx::mutex as perf_install_in_context(), @@ -11274,7 +11364,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index c5cd852fe86b..3cc8416ec844 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -413,7 +413,7 @@ static int hw_breakpoint_parse(struct perf_event *bp, int register_perf_hw_breakpoint(struct perf_event *bp) { - struct arch_hw_breakpoint hw; + struct arch_hw_breakpoint hw = { }; int err; err = reserve_bp_slot(bp); @@ -461,7 +461,7 @@ int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { - struct arch_hw_breakpoint hw; + struct arch_hw_breakpoint hw = { }; int err; err = hw_breakpoint_parse(bp, attr, &hw); diff --git a/kernel/exit.c b/kernel/exit.c index 4436158a6d30..22ab6a4bdc51 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -734,9 +734,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) autoreap = true; } - tsk->exit_state = autoreap ? 
EXIT_DEAD : EXIT_ZOMBIE; - if (tsk->exit_state == EXIT_DEAD) + if (autoreap) { + tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); + } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) @@ -1553,6 +1554,23 @@ end: return retval; } +static struct pid *pidfd_get_pid(unsigned int fd) +{ + struct fd f; + struct pid *pid; + + f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + + pid = pidfd_pid(f.file); + if (!IS_ERR(pid)) + get_pid(pid); + + fdput(f); + return pid; +} + static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { @@ -1575,19 +1593,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; + + pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; - if (upid <= 0) + if (upid < 0) return -EINVAL; + + if (upid) + pid = find_get_pid(upid); + else + pid = get_task_pid(current, PIDTYPE_PGID); + break; + case P_PIDFD: + type = PIDTYPE_PID; + if (upid < 0) + return -EINVAL; + + pid = pidfd_get_pid(upid); + if (IS_ERR(pid)) + return PTR_ERR(pid); break; default: return -EINVAL; } - if (type < PIDTYPE_MAX) - pid = find_get_pid(upid); - wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options; diff --git a/kernel/fork.c b/kernel/fork.c index d8ae0f1b4148..1d1cd06edbc1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -726,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); cgroup_free(tsk); - task_numa_free(tsk); + task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); @@ -768,6 +768,7 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static void task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. */ @@ -782,6 +783,7 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size) else *offset += offsetof(struct task_struct, thread); } +#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ void __init fork_init(void) { @@ -1690,6 +1692,14 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ } +struct pid *pidfd_pid(const struct file *file) +{ + if (file->f_op == &pidfd_fops) + return file->private_data; + + return ERR_PTR(-EBADF); +} + static int pidfd_release(struct inode *inode, struct file *file) { struct pid *pid = file->private_data; @@ -2338,6 +2348,8 @@ struct mm_struct *copy_init_mm(void) * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. + * + * args->exit_signal is expected to be checked for sanity by the caller. 
*/ long _do_fork(struct kernel_clone_args *args) { @@ -2562,6 +2574,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, if (copy_from_user(&args, uargs, size)) return -EFAULT; + /* + * Verify that higher 32bits of exit_signal are unset and that + * it is a valid signal + */ + if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || + !valid_signal(args.exit_signal))) + return -EINVAL; + *kargs = (struct kernel_clone_args){ .flags = args.flags, .pidfd = u64_to_user_ptr(args.pidfd), diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4352b08ae48d..4d89ad4fae3b 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/cpu.h> +#include <linux/sort.h> static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, unsigned int cpus_per_vec) @@ -94,6 +95,155 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } +struct node_vectors { + unsigned id; + + union { + unsigned nvectors; + unsigned ncpus; + }; +}; + +static int ncpus_cmp_func(const void *l, const void *r) +{ + const struct node_vectors *ln = l; + const struct node_vectors *rn = r; + + return ln->ncpus - rn->ncpus; +} + +/* + * Allocate vector number for each node, so that for each node: + * + * 1) the allocated number is >= 1 + * + * 2) the allocated numbver is <= active CPU number of this node + * + * The actual allocated total vectors may be less than @numvecs when + * active total CPU number is less than @numvecs. + * + * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' + * for each node. + */ +static void alloc_nodes_vectors(unsigned int numvecs, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + const nodemask_t nodemsk, + struct cpumask *nmsk, + struct node_vectors *node_vectors) +{ + unsigned n, remaining_ncpus = 0; + + for (n = 0; n < nr_node_ids; n++) { + node_vectors[n].id = n; + node_vectors[n].ncpus = UINT_MAX; + } + + for_each_node_mask(n, nodemsk) { + unsigned ncpus; + + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + ncpus = cpumask_weight(nmsk); + + if (!ncpus) + continue; + remaining_ncpus += ncpus; + node_vectors[n].ncpus = ncpus; + } + + numvecs = min_t(unsigned, remaining_ncpus, numvecs); + + sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]), + ncpus_cmp_func, NULL); + + /* + * Allocate vectors for each node according to the ratio of this + * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is + * bigger than number of active numa nodes. Always start the + * allocation from the node with minimized nr_cpus. + * + * This way guarantees that each active node gets allocated at + * least one vector, and the theory is simple: over-allocation + * is only done when this node is assigned by one vector, so + * other nodes will be allocated >= 1 vector, since 'numvecs' is + * bigger than number of numa nodes. 
+ * + * One perfect invariant is that number of allocated vectors for + * each node is <= CPU count of this node: + * + * 1) suppose there are two nodes: A and B + * ncpu(X) is CPU count of node X + * vecs(X) is the vector count allocated to node X via this + * algorithm + * + * ncpu(A) <= ncpu(B) + * ncpu(A) + ncpu(B) = N + * vecs(A) + vecs(B) = V + * + * vecs(A) = max(1, round_down(V * ncpu(A) / N)) + * vecs(B) = V - vecs(A) + * + * both N and V are integer, and 2 <= V <= N, suppose + * V = N - delta, and 0 <= delta <= N - 2 + * + * 2) obviously vecs(A) <= ncpu(A) because: + * + * if vecs(A) is 1, then vecs(A) <= ncpu(A) given + * ncpu(A) >= 1 + * + * otherwise, + * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N + * + * 3) prove how vecs(B) <= ncpu(B): + * + * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be + * over-allocated, so vecs(B) <= ncpu(B), + * + * otherwise: + * + * vecs(A) = + * round_down(V * ncpu(A) / N) = + * round_down((N - delta) * ncpu(A) / N) = + * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= + * round_down((N * ncpu(A) - delta * N) / N) = + * cpu(A) - delta + * + * then: + * + * vecs(A) - V >= ncpu(A) - delta - V + * => + * V - vecs(A) <= V + delta - ncpu(A) + * => + * vecs(B) <= N - ncpu(A) + * => + * vecs(B) <= cpu(B) + * + * For nodes >= 3, it can be thought as one node and another big + * node given that is exactly what this algorithm is implemented, + * and we always re-calculate 'remaining_ncpus' & 'numvecs', and + * finally for each node X: vecs(X) <= ncpu(X). + * + */ + for (n = 0; n < nr_node_ids; n++) { + unsigned nvectors, ncpus; + + if (node_vectors[n].ncpus == UINT_MAX) + continue; + + WARN_ON_ONCE(numvecs == 0); + + ncpus = node_vectors[n].ncpus; + nvectors = max_t(unsigned, 1, + numvecs * ncpus / remaining_ncpus); + WARN_ON_ONCE(nvectors > ncpus); + + node_vectors[n].nvectors = nvectors; + + remaining_ncpus -= ncpus; + numvecs -= nvectors; + } +} + static int __irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, unsigned int firstvec, @@ -102,10 +252,11 @@ static int __irq_build_affinity_masks(unsigned int startvec, struct cpumask *nmsk, struct irq_affinity_desc *masks) { - unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0; + unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; unsigned int last_affv = firstvec + numvecs; unsigned int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; + struct node_vectors *node_vectors; if (!cpumask_weight(cpu_mask)) return 0; @@ -126,42 +277,56 @@ static int __irq_build_affinity_masks(unsigned int startvec, return numvecs; } - for_each_node_mask(n, nodemsk) { - unsigned int ncpus, v, vecs_to_assign, vecs_per_node; + node_vectors = kcalloc(nr_node_ids, + sizeof(struct node_vectors), + GFP_KERNEL); + if (!node_vectors) + return -ENOMEM; - /* Spread the vectors per node */ - vecs_per_node = (numvecs - (curvec - firstvec)) / nodes; + /* allocate vector number for each node */ + alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask, + nodemsk, nmsk, node_vectors); - /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + for (i = 0; i < nr_node_ids; i++) { + unsigned int ncpus, v; + struct node_vectors *nv = &node_vectors[i]; + + if (nv->nvectors == UINT_MAX) + continue; - /* Calculate the number of cpus per vector */ + /* Get the cpus on this node which are in the mask */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); ncpus = cpumask_weight(nmsk); - vecs_to_assign = min(vecs_per_node, ncpus); + if (!ncpus) 
+ continue; + + WARN_ON_ONCE(nv->nvectors > ncpus); /* Account for rounding errors */ - extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign); + extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors); - for (v = 0; curvec < last_affv && v < vecs_to_assign; - curvec++, v++) { - cpus_per_vec = ncpus / vecs_to_assign; + /* Spread allocated vectors on CPUs of the current node */ + for (v = 0; v < nv->nvectors; v++, curvec++) { + cpus_per_vec = ncpus / nv->nvectors; /* Account for extra vectors to compensate rounding errors */ if (extra_vecs) { cpus_per_vec++; --extra_vecs; } + + /* + * wrapping has to be considered given 'startvec' + * may start anywhere + */ + if (curvec >= last_affv) + curvec = firstvec; irq_spread_init_one(&masks[curvec].mask, nmsk, cpus_per_vec); } - - done += v; - if (done >= numvecs) - break; - if (curvec >= last_affv) - curvec = firstvec; - --nodes; + done += nv->nvectors; } + kfree(node_vectors); return done; } @@ -174,7 +339,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, unsigned int firstvec, struct irq_affinity_desc *masks) { - unsigned int curvec = startvec, nr_present, nr_others; + unsigned int curvec = startvec, nr_present = 0, nr_others = 0; cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; int ret = -ENOMEM; @@ -189,15 +354,17 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, if (!node_to_cpumask) goto fail_npresmsk; - ret = 0; /* Stabilize the cpumasks */ get_online_cpus(); build_node_to_cpumask(node_to_cpumask); /* Spread on present CPUs starting from affd->pre_vectors */ - nr_present = __irq_build_affinity_masks(curvec, numvecs, - firstvec, node_to_cpumask, - cpu_present_mask, nmsk, masks); + ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, + node_to_cpumask, cpu_present_mask, + nmsk, masks); + if (ret < 0) + goto fail_build_affinity; + nr_present = ret; /* * Spread on non present CPUs starting from the next vector to be @@ -210,12 +377,16 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, else curvec = firstvec + nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - nr_others = __irq_build_affinity_masks(curvec, numvecs, - firstvec, node_to_cpumask, - npresmsk, nmsk, masks); + ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, + node_to_cpumask, npresmsk, nmsk, + masks); + if (ret >= 0) + nr_others = ret; + + fail_build_affinity: put_online_cpus(); - if (nr_present < numvecs) + if (ret >= 0) WARN_ON(nr_present + nr_others < numvecs); free_node_to_cpumask(node_to_cpumask); @@ -225,7 +396,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, fail_nmsk: free_cpumask_var(nmsk); - return ret; + return ret < 0 ? ret : 0; } static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) @@ -251,11 +422,9 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) * Determine the number of vectors which need interrupt affinities * assigned. If the pre/post request exhausts the available vectors * then nothing to do here except for invoking the calc_sets() - * callback so the device driver can adjust to the situation. If there - * is only a single vector, then managing the queue is pointless as - * well. + * callback so the device driver can adjust to the situation. 
*/ - if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors) + if (nvecs > affd->pre_vectors + affd->post_vectors) affvecs = nvecs - affd->pre_vectors - affd->post_vectors; else affvecs = 0; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9484e88dabc2..9be995fc3c5a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -295,6 +295,18 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc) } } +static void irq_sysfs_del(struct irq_desc *desc) +{ + /* + * If irq_sysfs_init() has not yet been invoked (early boot), then + * irq_kobj_base is NULL and the descriptor was never added. + * kobject_del() complains about a object with no parent, so make + * it conditional. + */ + if (irq_kobj_base) + kobject_del(&desc->kobj); +} + static int __init irq_sysfs_init(void) { struct irq_desc *desc; @@ -325,6 +337,7 @@ static struct kobj_type irq_kobj_type = { }; static void irq_sysfs_add(int irq, struct irq_desc *desc) {} +static void irq_sysfs_del(struct irq_desc *desc) {} #endif /* CONFIG_SYSFS */ @@ -438,7 +451,7 @@ static void free_desc(unsigned int irq) * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ - kobject_del(&desc->kobj); + irq_sysfs_del(desc); delete_irq_desc(irq); /* diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3078d0e48bba..132672b74e4b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -31,7 +31,7 @@ struct irqchip_fwid { struct fwnode_handle fwnode; unsigned int type; char *name; - void *data; + phys_addr_t *pa; }; #ifdef CONFIG_GENERIC_IRQ_DEBUGFS @@ -62,7 +62,8 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); * domain struct. */ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, - const char *name, void *data) + const char *name, + phys_addr_t *pa) { struct irqchip_fwid *fwid; char *n; @@ -77,7 +78,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, n = kasprintf(GFP_KERNEL, "%s-%d", name, id); break; default: - n = kasprintf(GFP_KERNEL, "irqchip@%p", data); + n = kasprintf(GFP_KERNEL, "irqchip@%pa", pa); break; } @@ -89,7 +90,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, fwid->type = type; fwid->name = n; - fwid->data = data; + fwid->pa = pa; fwid->fwnode.ops = &irqchip_fwnode_ops; return &fwid->fwnode; } @@ -148,6 +149,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, switch (fwid->type) { case IRQCHIP_FWNODE_NAMED: case IRQCHIP_FWNODE_NAMED_ID: + domain->fwnode = fwnode; domain->name = kstrdup(fwid->name, GFP_KERNEL); if (!domain->name) { kfree(domain); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e8f7f179bf77..1753486b440c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -23,7 +23,7 @@ #include "internals.h" -#ifdef CONFIG_IRQ_FORCED_THREADING +#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) __read_mostly bool force_irqthreads; EXPORT_SYMBOL_GPL(force_irqthreads); @@ -1255,8 +1255,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) * the thread dies to avoid that the interrupt code * references an already freed task_struct. */ - get_task_struct(t); - new->thread = t; + new->thread = get_task_struct(t); /* * Tell the thread to set its affinity. 
This is * important for shared interrupt handlers as we do diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index da9addb8d655..cfc4f088a0e7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -100,10 +100,6 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) return 0; } -#ifndef is_affinity_mask_valid -#define is_affinity_mask_valid(val) 1 -#endif - int no_irq_affinity; static int irq_affinity_proc_show(struct seq_file *m, void *v) { @@ -136,11 +132,6 @@ static ssize_t write_irq_affinity(int type, struct file *file, if (err) goto free_cpumask; - if (!is_affinity_mask_valid(new_value)) { - err = -EINVAL; - goto free_cpumask; - } - /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least @@ -232,11 +223,6 @@ static ssize_t default_affinity_write(struct file *file, if (err) goto out; - if (!is_affinity_mask_valid(new_value)) { - err = -EINVAL; - goto out; - } - /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 95414ad3506a..98c04ca5fa43 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -36,6 +36,8 @@ static void resend_irqs(unsigned long arg) irq = find_first_bit(irqs_resend, nr_irqs); clear_bit(irq, irqs_resend); desc = irq_to_desc(irq); + if (!desc) + continue; local_irq_disable(); desc->handle_irq(desc); local_irq_enable(); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index df3008419a1d..cdb3ffab128b 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -407,7 +407,9 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init) return false; if (!kernel_text_address(jump_entry_code(entry))) { - WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); + WARN_ONCE(!jump_entry_is_init(entry), + "can't patch jump_label at %pS", + (void *)jump_entry_code(entry)); return false; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 95a260f9214b..136ce049c4ad 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -263,8 +263,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, { char namebuf[KSYM_NAME_LEN]; - if (is_ksym_addr(addr)) - return !!get_symbol_pos(addr, symbolsize, offset); + if (is_ksym_addr(addr)) { + get_symbol_pos(addr, symbolsize, offset); + return 1; + } return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c new file mode 100644 index 000000000000..d3689632e8b9 --- /dev/null +++ b/kernel/kexec_elf.c @@ -0,0 +1,430 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Load ELF vmlinux file for the kexec_file_load syscall. + * + * Copyright (C) 2004 Adam Litke (agl@us.ibm.com) + * Copyright (C) 2004 IBM Corp. + * Copyright (C) 2005 R Sharada (sharada@in.ibm.com) + * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com) + * Copyright (C) 2016 IBM Corporation + * + * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c. + * Heavily modified for the kernel by + * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>. 
+ */ + +#define pr_fmt(fmt) "kexec_elf: " fmt + +#include <linux/elf.h> +#include <linux/kexec.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> + +static inline bool elf_is_elf_file(const struct elfhdr *ehdr) +{ + return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0; +} + +static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value) +{ + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) + value = le64_to_cpu(value); + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) + value = be64_to_cpu(value); + + return value; +} + +static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value) +{ + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) + value = le32_to_cpu(value); + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) + value = be32_to_cpu(value); + + return value; +} + +static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value) +{ + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) + value = le16_to_cpu(value); + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) + value = be16_to_cpu(value); + + return value; +} + +/** + * elf_is_ehdr_sane - check that it is safe to use the ELF header + * @buf_len: size of the buffer in which the ELF file is loaded. + */ +static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len) +{ + if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) { + pr_debug("Bad program header size.\n"); + return false; + } else if (ehdr->e_shnum > 0 && + ehdr->e_shentsize != sizeof(struct elf_shdr)) { + pr_debug("Bad section header size.\n"); + return false; + } else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT || + ehdr->e_version != EV_CURRENT) { + pr_debug("Unknown ELF version.\n"); + return false; + } + + if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) { + size_t phdr_size; + + /* + * e_phnum is at most 65535 so calculating the size of the + * program header cannot overflow. + */ + phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum; + + /* Sanity check the program header table location. */ + if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) { + pr_debug("Program headers at invalid location.\n"); + return false; + } else if (ehdr->e_phoff + phdr_size > buf_len) { + pr_debug("Program headers truncated.\n"); + return false; + } + } + + if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) { + size_t shdr_size; + + /* + * e_shnum is at most 65536 so calculating + * the size of the section header cannot overflow. + */ + shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum; + + /* Sanity check the section header table location. 
*/ + if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) { + pr_debug("Section headers at invalid location.\n"); + return false; + } else if (ehdr->e_shoff + shdr_size > buf_len) { + pr_debug("Section headers truncated.\n"); + return false; + } + } + + return true; +} + +static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr) +{ + struct elfhdr *buf_ehdr; + + if (len < sizeof(*buf_ehdr)) { + pr_debug("Buffer is too small to hold ELF header.\n"); + return -ENOEXEC; + } + + memset(ehdr, 0, sizeof(*ehdr)); + memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident)); + if (!elf_is_elf_file(ehdr)) { + pr_debug("No ELF header magic.\n"); + return -ENOEXEC; + } + + if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) { + pr_debug("Not a supported ELF class.\n"); + return -ENOEXEC; + } else if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB && + ehdr->e_ident[EI_DATA] != ELFDATA2MSB) { + pr_debug("Not a supported ELF data format.\n"); + return -ENOEXEC; + } + + buf_ehdr = (struct elfhdr *) buf; + if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) { + pr_debug("Bad ELF header size.\n"); + return -ENOEXEC; + } + + ehdr->e_type = elf16_to_cpu(ehdr, buf_ehdr->e_type); + ehdr->e_machine = elf16_to_cpu(ehdr, buf_ehdr->e_machine); + ehdr->e_version = elf32_to_cpu(ehdr, buf_ehdr->e_version); + ehdr->e_flags = elf32_to_cpu(ehdr, buf_ehdr->e_flags); + ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize); + ehdr->e_phnum = elf16_to_cpu(ehdr, buf_ehdr->e_phnum); + ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize); + ehdr->e_shnum = elf16_to_cpu(ehdr, buf_ehdr->e_shnum); + ehdr->e_shstrndx = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx); + + switch (ehdr->e_ident[EI_CLASS]) { + case ELFCLASS64: + ehdr->e_entry = elf64_to_cpu(ehdr, buf_ehdr->e_entry); + ehdr->e_phoff = elf64_to_cpu(ehdr, buf_ehdr->e_phoff); + ehdr->e_shoff = elf64_to_cpu(ehdr, buf_ehdr->e_shoff); + break; + + case ELFCLASS32: + ehdr->e_entry = elf32_to_cpu(ehdr, buf_ehdr->e_entry); + ehdr->e_phoff = elf32_to_cpu(ehdr, buf_ehdr->e_phoff); + ehdr->e_shoff = elf32_to_cpu(ehdr, buf_ehdr->e_shoff); + break; + + default: + pr_debug("Unknown ELF class.\n"); + return -EINVAL; + } + + return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC; +} + +/** + * elf_is_phdr_sane - check that it is safe to use the program header + * @buf_len: size of the buffer in which the ELF file is loaded. + */ +static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len) +{ + + if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) { + pr_debug("ELF segment location wraps around.\n"); + return false; + } else if (phdr->p_offset + phdr->p_filesz > buf_len) { + pr_debug("ELF segment not in file.\n"); + return false; + } else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) { + pr_debug("ELF segment address wraps around.\n"); + return false; + } + + return true; +} + +static int elf_read_phdr(const char *buf, size_t len, + struct kexec_elf_info *elf_info, + int idx) +{ + /* Override the const in proghdrs, we are the ones doing the loading. 
*/ + struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx]; + const struct elfhdr *ehdr = elf_info->ehdr; + const char *pbuf; + struct elf_phdr *buf_phdr; + + pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr)); + buf_phdr = (struct elf_phdr *) pbuf; + + phdr->p_type = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type); + phdr->p_flags = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags); + + switch (ehdr->e_ident[EI_CLASS]) { + case ELFCLASS64: + phdr->p_offset = elf64_to_cpu(ehdr, buf_phdr->p_offset); + phdr->p_paddr = elf64_to_cpu(ehdr, buf_phdr->p_paddr); + phdr->p_vaddr = elf64_to_cpu(ehdr, buf_phdr->p_vaddr); + phdr->p_filesz = elf64_to_cpu(ehdr, buf_phdr->p_filesz); + phdr->p_memsz = elf64_to_cpu(ehdr, buf_phdr->p_memsz); + phdr->p_align = elf64_to_cpu(ehdr, buf_phdr->p_align); + break; + + case ELFCLASS32: + phdr->p_offset = elf32_to_cpu(ehdr, buf_phdr->p_offset); + phdr->p_paddr = elf32_to_cpu(ehdr, buf_phdr->p_paddr); + phdr->p_vaddr = elf32_to_cpu(ehdr, buf_phdr->p_vaddr); + phdr->p_filesz = elf32_to_cpu(ehdr, buf_phdr->p_filesz); + phdr->p_memsz = elf32_to_cpu(ehdr, buf_phdr->p_memsz); + phdr->p_align = elf32_to_cpu(ehdr, buf_phdr->p_align); + break; + + default: + pr_debug("Unknown ELF class.\n"); + return -EINVAL; + } + + return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC; +} + +/** + * elf_read_phdrs - read the program headers from the buffer + * + * This function assumes that the program header table was checked for sanity. + * Use elf_is_ehdr_sane() if it wasn't. + */ +static int elf_read_phdrs(const char *buf, size_t len, + struct kexec_elf_info *elf_info) +{ + size_t phdr_size, i; + const struct elfhdr *ehdr = elf_info->ehdr; + + /* + * e_phnum is at most 65535 so calculating the size of the + * program header cannot overflow. + */ + phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum; + + elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL); + if (!elf_info->proghdrs) + return -ENOMEM; + + for (i = 0; i < ehdr->e_phnum; i++) { + int ret; + + ret = elf_read_phdr(buf, len, elf_info, i); + if (ret) { + kfree(elf_info->proghdrs); + elf_info->proghdrs = NULL; + return ret; + } + } + + return 0; +} + +/** + * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info + * @buf: Buffer to read ELF file from. + * @len: Size of @buf. + * @ehdr: Pointer to existing struct which will be populated. + * @elf_info: Pointer to existing struct which will be populated. + * + * This function allows reading ELF files with different byte order than + * the kernel, byte-swapping the fields as needed. + * + * Return: + * On success returns 0, and the caller should call + * kexec_free_elf_info(elf_info) to free the memory allocated for the section + * and program headers. 
+ */ +static int elf_read_from_buffer(const char *buf, size_t len, + struct elfhdr *ehdr, + struct kexec_elf_info *elf_info) +{ + int ret; + + ret = elf_read_ehdr(buf, len, ehdr); + if (ret) + return ret; + + elf_info->buffer = buf; + elf_info->ehdr = ehdr; + if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) { + ret = elf_read_phdrs(buf, len, elf_info); + if (ret) + return ret; + } + return 0; +} + +/** + * kexec_free_elf_info - free memory allocated by elf_read_from_buffer + */ +void kexec_free_elf_info(struct kexec_elf_info *elf_info) +{ + kfree(elf_info->proghdrs); + memset(elf_info, 0, sizeof(*elf_info)); +} +/** + * kexec_build_elf_info - read ELF executable and check that we can use it + */ +int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr, + struct kexec_elf_info *elf_info) +{ + int i; + int ret; + + ret = elf_read_from_buffer(buf, len, ehdr, elf_info); + if (ret) + return ret; + + /* Big endian vmlinux has type ET_DYN. */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) { + pr_err("Not an ELF executable.\n"); + goto error; + } else if (!elf_info->proghdrs) { + pr_err("No ELF program header.\n"); + goto error; + } + + for (i = 0; i < ehdr->e_phnum; i++) { + /* + * Kexec does not support loading interpreters. + * In addition this check keeps us from attempting + * to kexec ordinay executables. + */ + if (elf_info->proghdrs[i].p_type == PT_INTERP) { + pr_err("Requires an ELF interpreter.\n"); + goto error; + } + } + + return 0; +error: + kexec_free_elf_info(elf_info); + return -ENOEXEC; +} + + +int kexec_elf_probe(const char *buf, unsigned long len) +{ + struct elfhdr ehdr; + struct kexec_elf_info elf_info; + int ret; + + ret = kexec_build_elf_info(buf, len, &ehdr, &elf_info); + if (ret) + return ret; + + kexec_free_elf_info(&elf_info); + + return elf_check_arch(&ehdr) ? 0 : -ENOEXEC; +} + +/** + * kexec_elf_load - load ELF executable image + * @lowest_load_addr: On return, will be the address where the first PT_LOAD + * section will be loaded in memory. + * + * Return: + * 0 on success, negative value on failure. + */ +int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, + struct kexec_elf_info *elf_info, + struct kexec_buf *kbuf, + unsigned long *lowest_load_addr) +{ + unsigned long lowest_addr = UINT_MAX; + int ret; + size_t i; + + /* Read in the PT_LOAD segments. */ + for (i = 0; i < ehdr->e_phnum; i++) { + unsigned long load_addr; + size_t size; + const struct elf_phdr *phdr; + + phdr = &elf_info->proghdrs[i]; + if (phdr->p_type != PT_LOAD) + continue; + + size = phdr->p_filesz; + if (size > phdr->p_memsz) + size = phdr->p_memsz; + + kbuf->buffer = (void *) elf_info->buffer + phdr->p_offset; + kbuf->bufsz = size; + kbuf->memsz = phdr->p_memsz; + kbuf->buf_align = phdr->p_align; + kbuf->buf_min = phdr->p_paddr; + kbuf->mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(kbuf); + if (ret) + goto out; + load_addr = kbuf->mem; + + if (load_addr < lowest_addr) + lowest_addr = load_addr; + } + + *lowest_load_addr = lowest_addr; + ret = 0; + out: + return ret; +} diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9873fc627d61..1b66ccbb744a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -470,6 +470,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); */ static void do_optimize_kprobes(void) { + lockdep_assert_held(&text_mutex); /* * The optimization/unoptimization refers online_cpus via * stop_machine() and cpu-hotplug modifies online_cpus. 
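The kexec_elf helpers introduced in kernel/kexec_elf.c above (kexec_elf_probe(), kexec_build_elf_info(), kexec_elf_load(), kexec_free_elf_info()) are meant to be driven by an architecture's kexec_file image loader rather than called directly. A minimal, hedged sketch of such a caller is shown below; the function name my_arch_elf64_load() and the buf_min/buf_max placement limits are illustrative assumptions and not part of this series:

#include <linux/err.h>
#include <linux/kexec.h>

static void *my_arch_elf64_load(struct kimage *image, char *kernel_buf,
				unsigned long kernel_len, char *initrd,
				unsigned long initrd_len, char *cmdline,
				unsigned long cmdline_len)
{
	struct elfhdr ehdr;
	struct kexec_elf_info elf_info;
	struct kexec_buf kbuf = { .image = image, .buf_min = 0,
				  .buf_max = ULONG_MAX, .top_down = false };
	unsigned long kernel_load_addr;
	int ret;

	/* Parse and sanity-check the ELF headers, byte-swapping as needed. */
	ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info);
	if (ret)
		return ERR_PTR(ret);

	/* Place each PT_LOAD segment and record the lowest load address. */
	ret = kexec_elf_load(image, &ehdr, &elf_info, &kbuf,
			     &kernel_load_addr);

	kexec_free_elf_info(&elf_info);
	return ret ? ERR_PTR(ret) : NULL;
}

In a real port this callback would additionally set up the purgatory, initrd and command line before returning, but the parse/load/free sequence around the new helpers is the part this patch provides.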
@@ -487,9 +488,7 @@ static void do_optimize_kprobes(void) list_empty(&optimizing_list)) return; - mutex_lock(&text_mutex); arch_optimize_kprobes(&optimizing_list); - mutex_unlock(&text_mutex); } /* @@ -500,6 +499,7 @@ static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; + lockdep_assert_held(&text_mutex); /* See comment in do_optimize_kprobes() */ lockdep_assert_cpus_held(); @@ -507,7 +507,6 @@ static void do_unoptimize_kprobes(void) if (list_empty(&unoptimizing_list)) return; - mutex_lock(&text_mutex); arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ list_for_each_entry_safe(op, tmp, &freeing_list, list) { @@ -524,7 +523,6 @@ static void do_unoptimize_kprobes(void) } else list_del_init(&op->list); } - mutex_unlock(&text_mutex); } /* Reclaim all kprobes on the free_list */ @@ -556,6 +554,7 @@ static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); cpus_read_lock(); + mutex_lock(&text_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -583,6 +582,7 @@ static void kprobe_optimizer(struct work_struct *work) do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); + mutex_unlock(&text_mutex); cpus_read_unlock(); mutex_unlock(&kprobe_mutex); @@ -1514,7 +1514,8 @@ static int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || - jump_label_text_reserved(p->addr, p->addr)) { + jump_label_text_reserved(p->addr, p->addr) || + find_bug((unsigned long)p->addr)) { ret = -EINVAL; goto out; } @@ -1906,7 +1907,7 @@ int register_kretprobe(struct kretprobe *rp) /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); #else rp->maxactive = num_possible_cpus(); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 341f52117f88..233459c03b5a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -448,34 +448,102 @@ static void print_lockdep_off(const char *bug_msg) unsigned long nr_stack_trace_entries; -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING +/** + * struct lock_trace - single stack backtrace + * @hash_entry: Entry in a stack_trace_hash[] list. + * @hash: jhash() of @entries. + * @nr_entries: Number of entries in @entries. + * @entries: Actual stack backtrace. + */ +struct lock_trace { + struct hlist_node hash_entry; + u32 hash; + u32 nr_entries; + unsigned long entries[0] __aligned(sizeof(unsigned long)); +}; +#define LOCK_TRACE_SIZE_IN_LONGS \ + (sizeof(struct lock_trace) / sizeof(unsigned long)) /* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. + * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock. 
*/ static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE]; + +static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2) +{ + return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries && + memcmp(t1->entries, t2->entries, + t1->nr_entries * sizeof(t1->entries[0])) == 0; +} -static int save_trace(struct lock_trace *trace) +static struct lock_trace *save_trace(void) { - unsigned long *entries = stack_trace + nr_stack_trace_entries; + struct lock_trace *trace, *t2; + struct hlist_head *hash_head; + u32 hash; unsigned int max_entries; - trace->offset = nr_stack_trace_entries; - max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->nr_entries = stack_trace_save(entries, max_entries, 3); - nr_stack_trace_entries += trace->nr_entries; + BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); + BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - + LOCK_TRACE_SIZE_IN_LONGS; + trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); + + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES - + LOCK_TRACE_SIZE_IN_LONGS - 1) { if (!debug_locks_off_graph_unlock()) - return 0; + return NULL; print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); - return 0; + return NULL; } - return 1; + hash = jhash(trace->entries, trace->nr_entries * + sizeof(trace->entries[0]), 0); + trace->hash = hash; + hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1)); + hlist_for_each_entry(t2, hash_head, hash_entry) { + if (traces_identical(trace, t2)) + return t2; + } + nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries; + hlist_add_head(&trace->hash_entry, hash_head); + + return trace; +} + +/* Return the number of stack traces in the stack_trace[] array. */ +u64 lockdep_stack_trace_count(void) +{ + struct lock_trace *trace; + u64 c = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) { + hlist_for_each_entry(trace, &stack_trace_hash[i], hash_entry) { + c++; + } + } + + return c; +} + +/* Return the number of stack hash chains that have at least one stack trace. 
*/ +u64 lockdep_stack_hash_count(void) +{ + u64 c = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) + if (!hlist_empty(&stack_trace_hash[i])) + c++; + + return c; } #endif @@ -491,7 +559,7 @@ unsigned int max_lockdep_depth; DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING /* * Locking printouts: */ @@ -511,7 +579,7 @@ static const char *usage_str[] = }; #endif -const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +const char *__get_key_name(const struct lockdep_subclass_key *key, char *str) { return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); } @@ -620,7 +688,7 @@ static void print_lock(struct held_lock *hlock) return; } - printk(KERN_CONT "%p", hlock->instance); + printk(KERN_CONT "%px", hlock->instance); print_lock_name(lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } @@ -1235,7 +1303,7 @@ static struct lock_list *alloc_list_entry(void) static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, unsigned long ip, int distance, - struct lock_trace *trace) + const struct lock_trace *trace) { struct lock_list *entry; /* @@ -1249,7 +1317,7 @@ static int add_lock_to_list(struct lock_class *this, entry->class = this; entry->links_to = links_to; entry->distance = distance; - entry->trace = *trace; + entry->trace = trace; /* * Both allocation and removal are done under the graph lock; but * iteration is under RCU-sched; see look_up_lock_class() and @@ -1470,11 +1538,10 @@ static inline int __bfs_backwards(struct lock_list *src_entry, } -static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +static void print_lock_trace(const struct lock_trace *trace, + unsigned int spaces) { - unsigned long *entries = stack_trace + trace->offset; - - stack_trace_print(entries, trace->nr_entries, spaces); + stack_trace_print(trace->entries, trace->nr_entries, spaces); } /* @@ -1489,7 +1556,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); - print_lock_trace(&target->trace, 6); + print_lock_trace(target->trace, 6); } static void @@ -1592,7 +1659,8 @@ static noinline void print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; - if (!save_trace(&this->trace)) + this->trace = save_trace(); + if (!this->trace) return; depth = get_lock_depth(target); @@ -1715,7 +1783,7 @@ check_path(struct lock_class *target, struct lock_list *src_entry, */ static noinline int check_noncircular(struct held_lock *src, struct held_lock *target, - struct lock_trace *trace) + struct lock_trace **const trace) { int ret; struct lock_list *uninitialized_var(target_entry); @@ -1729,13 +1797,13 @@ check_noncircular(struct held_lock *src, struct held_lock *target, ret = check_path(hlock_class(target), &src_entry, &target_entry); if (unlikely(!ret)) { - if (!trace->nr_entries) { + if (!*trace) { /* * If save_trace fails here, the printing might * trigger a WARN but because of the !nr_entries it * should not do bad things. 
*/ - save_trace(trace); + *trace = save_trace(); } print_circular_bug(&src_entry, target_entry, src, target); @@ -1859,7 +1927,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(KERN_CONT " at:\n"); - print_lock_trace(class->usage_traces + bit, len); + print_lock_trace(class->usage_traces[bit], len); } } printk("%*s }\n", depth, ""); @@ -1884,7 +1952,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, do { print_lock_class_header(entry->class, depth); printk("%*s ... acquired at:\n", depth, ""); - print_lock_trace(&entry->trace, 2); + print_lock_trace(entry->trace, 2); printk("\n"); if (depth == 0 && (entry != root)) { @@ -1995,14 +2063,14 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock_name(backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); - print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); + print_lock_trace(backwards_entry->class->usage_traces[bit1], 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); - print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); + print_lock_trace(forwards_entry->class->usage_traces[bit2], 1); pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -2011,13 +2079,15 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); - if (!save_trace(&prev_root->trace)) + prev_root->trace = save_trace(); + if (!prev_root->trace) return; print_shortest_lock_dependencies(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); - if (!save_trace(&next_root->trace)) + next_root->trace = save_trace(); + if (!next_root->trace) return; print_shortest_lock_dependencies(forwards_entry, next_root); @@ -2369,7 +2439,8 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct lock_trace *trace) + struct held_lock *next, int distance, + struct lock_trace **const trace) { struct lock_list *entry; int ret; @@ -2444,8 +2515,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return ret; #endif - if (!trace->nr_entries && !save_trace(trace)) - return 0; + if (!*trace) { + *trace = save_trace(); + if (!*trace) + return 0; + } /* * Ok, all validations passed, add the new lock @@ -2453,14 +2527,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, trace); + next->acquire_ip, distance, *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, - next->acquire_ip, distance, trace); + next->acquire_ip, distance, *trace); if (!ret) return 0; @@ -2476,7 +2550,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { - struct lock_trace trace = { .nr_entries = 0 }; + struct lock_trace *trace = NULL; int depth = curr->lockdep_depth; struct held_lock *hlock; @@ -2969,7 +3043,7 @@ static void 
check_chain_key(struct task_struct *curr) #endif } -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); @@ -3015,7 +3089,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); - print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); + print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1); print_irqtrace_events(curr); pr_warn("\nother info that might help us debug this:\n"); @@ -3096,7 +3170,8 @@ print_irq_inversion_bug(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); - if (!save_trace(&root->trace)) + root->trace = save_trace(); + if (!root->trace) return; print_shortest_lock_dependencies(other, root); @@ -3580,7 +3655,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, hlock_class(this)->usage_mask |= new_mask; - if (!save_trace(hlock_class(this)->usage_traces + new_bit)) + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) return 0; switch (new_bit) { @@ -3608,7 +3683,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } -#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ +#else /* CONFIG_PROVE_LOCKING */ static inline int mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) @@ -3627,7 +3702,7 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } -#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ +#endif /* CONFIG_PROVE_LOCKING */ /* * Initialize a lock instance's lock-class mapping info: @@ -4321,8 +4396,7 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie */ static void check_flags(unsigned long flags) { -#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \ - defined(CONFIG_TRACE_IRQFLAGS) +#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) if (!debug_locks) return; @@ -5158,6 +5232,12 @@ void __init lockdep_init(void) ) / 1024 ); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + printk(" memory used for stack traces: %zu kB\n", + (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024 + ); +#endif + printk(" per task-struct memory footprint: %zu bytes\n", sizeof(((struct task_struct *)NULL)->held_locks)); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index cc83568d5012..18d85aebbb57 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -92,6 +92,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_ENTRIES 16384UL #define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_STACK_TRACE_ENTRIES 262144UL +#define STACK_TRACE_HASH_SIZE 8192 #else #define MAX_LOCKDEP_ENTRIES 32768UL @@ -102,6 +103,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = * addresses. Protected by the hash_lock. 
*/ #define MAX_STACK_TRACE_ENTRIES 524288UL +#define STACK_TRACE_HASH_SIZE 16384 #endif #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) @@ -116,7 +118,8 @@ extern struct lock_chain lock_chains[]; extern void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]); -extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); +extern const char *__get_key_name(const struct lockdep_subclass_key *key, + char *str); struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); @@ -137,6 +140,10 @@ extern unsigned int max_bfs_queue_depth; #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#ifdef CONFIG_TRACE_IRQFLAGS +u64 lockdep_stack_trace_count(void); +u64 lockdep_stack_hash_count(void); +#endif #else static inline unsigned long lockdep_count_forward_deps(struct lock_class *class) diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 65b6a1600c8f..dadb7b7fba37 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -200,7 +200,6 @@ static void lockdep_stats_debug_show(struct seq_file *m) static int lockdep_stats_show(struct seq_file *m, void *v) { - struct lock_class *class; unsigned long nr_unused = 0, nr_uncategorized = 0, nr_irq_safe = 0, nr_irq_unsafe = 0, nr_softirq_safe = 0, nr_softirq_unsafe = 0, @@ -211,6 +210,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) sum_forward_deps = 0; #ifdef CONFIG_PROVE_LOCKING + struct lock_class *class; + list_for_each_entry(class, &all_lock_classes, lock_entry) { if (class->usage_mask == 0) @@ -284,6 +285,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_process_chains); seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + seq_printf(m, " number of stack traces: %llu\n", + lockdep_stack_trace_count()); + seq_printf(m, " number of stack hash chains: %llu\n", + lockdep_stack_hash_count()); +#endif seq_printf(m, " combined max dependencies: %11u\n", (nr_hardirq_chains + 1) * (nr_softirq_chains + 1) * @@ -398,7 +405,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) static void seq_stats(struct seq_file *m, struct lock_stat_data *data) { - struct lockdep_subclass_key *ckey; + const struct lockdep_subclass_key *ckey; struct lock_class_stats *stats; struct lock_class *class; const char *cname; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index edd1c082dbf5..468a9b8422e3 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -65,11 +65,37 @@ EXPORT_SYMBOL(__mutex_init); #define MUTEX_FLAGS 0x07 +/* + * Internal helper function; C doesn't allow us to hide it :/ + * + * DO NOT USE (outside of mutex code). 
+ */ +static inline struct task_struct *__mutex_owner(struct mutex *lock) +{ + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); +} + static inline struct task_struct *__owner_task(unsigned long owner) { return (struct task_struct *)(owner & ~MUTEX_FLAGS); } +bool mutex_is_locked(struct mutex *lock) +{ + return __mutex_owner(lock) != NULL; +} +EXPORT_SYMBOL(mutex_is_locked); + +__must_check enum mutex_trylock_recursive_enum +mutex_trylock_recursive(struct mutex *lock) +{ + if (unlikely(__mutex_owner(lock) == current)) + return MUTEX_TRYLOCK_RECURSIVE; + + return mutex_trylock(lock); +} +EXPORT_SYMBOL(mutex_trylock_recursive); + static inline unsigned long __owner_flags(unsigned long owner) { return owner & MUTEX_FLAGS; @@ -908,6 +934,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, might_sleep(); +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(lock->magic != lock); +#endif + ww = container_of(lock, struct ww_mutex, base); if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) @@ -1379,8 +1409,13 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock); + bool locked; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(lock->magic != lock); +#endif + locked = __mutex_trylock(lock); if (locked) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index fa83d36e30c6..2874bf556162 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -628,8 +628,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* [10] Grab the next task, i.e. owner of @lock */ - task = rt_mutex_owner(lock); - get_task_struct(task); + task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); /* @@ -709,8 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* [10] Grab the next task, i.e. the owner of @lock */ - task = rt_mutex_owner(lock); - get_task_struct(task); + task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..eef04551eae7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -105,8 +105,9 @@ #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ if (!debug_locks_silent && \ - WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ #c, atomic_long_read(&(sem)->count), \ + (unsigned long) sem->magic, \ atomic_long_read(&(sem)->owner), (long)current, \ list_empty(&(sem)->wait_list) ? "" : "not ")) \ debug_locks_off(); \ @@ -330,6 +331,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif +#ifdef CONFIG_DEBUG_RWSEMS + sem->magic = sem; +#endif atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); @@ -666,7 +670,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, preempt_disable(); rcu_read_lock(); owner = rwsem_owner_flags(sem, &flags); - if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner))) + /* + * Don't check the read-owner as the entry may be stale. 
+ */ + if ((flags & nonspinnable) || + (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) ret = false; rcu_read_unlock(); preempt_enable(); @@ -720,11 +728,12 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) rcu_read_lock(); for (;;) { - if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) { - state = OWNER_NONSPINNABLE; - break; - } - + /* + * When a waiting writer set the handoff flag, it may spin + * on the owner as well. Once that writer acquires the lock, + * we can spin on it. So we don't need to quit even when the + * handoff bit is set. + */ new = rwsem_owner_flags(sem, &new_flags); if ((new != owner) || (new_flags != flags)) { state = rwsem_owner_state(new, new_flags, nonspinnable); @@ -970,6 +979,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, { return false; } + +static inline int +rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +{ + return 0; +} +#define OWNER_NULL 1 #endif /* @@ -1000,6 +1016,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) atomic_long_add(-RWSEM_READER_BIAS, &sem->count); adjustment = 0; if (rwsem_optimistic_spin(sem, false)) { + /* rwsem_optimistic_spin() implies ACQUIRE on success */ /* * Wake up other readers in the wait list if the front * waiter is a reader. @@ -1014,6 +1031,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) } return sem; } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { + /* rwsem_reader_phase_trylock() implies ACQUIRE on success */ return sem; } @@ -1032,6 +1050,8 @@ queue: */ if (adjustment && !(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + /* Provide lock ACQUIRE */ + smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); @@ -1065,15 +1085,18 @@ queue: wake_up_q(&wake_q); /* wait to be given the lock */ - while (true) { + for (;;) { set_current_state(state); - if (!waiter.task) + if (!smp_load_acquire(&waiter.task)) { + /* Matches rwsem_mark_wake()'s smp_store_release(). */ break; + } if (signal_pending_state(state, current)) { raw_spin_lock_irq(&sem->wait_lock); if (waiter.task) goto out_nolock; raw_spin_unlock_irq(&sem->wait_lock); + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */ break; } schedule(); @@ -1083,6 +1106,7 @@ queue: __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); return sem; + out_nolock: list_del(&waiter.list); if (list_empty(&sem->wait_list)) { @@ -1123,8 +1147,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) /* do optimistic spinning and steal lock if possible */ if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && - rwsem_optimistic_spin(sem, true)) + rwsem_optimistic_spin(sem, true)) { + /* rwsem_optimistic_spin() implies ACQUIRE on success */ return sem; + } /* * Disable reader optimistic spinning for this rwsem after @@ -1184,12 +1210,26 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) wait: /* wait until we successfully acquire the lock */ set_current_state(state); - while (true) { - if (rwsem_try_write_lock(sem, wstate)) + for (;;) { + if (rwsem_try_write_lock(sem, wstate)) { + /* rwsem_try_write_lock() implies ACQUIRE on success */ break; + } raw_spin_unlock_irq(&sem->wait_lock); + /* + * After setting the handoff bit and failing to acquire + * the lock, attempt to spin on owner to accelerate lock + * transfer. 
If the previous owner is a on-cpu writer and it + * has just released the lock, OWNER_NULL will be returned. + * In this case, we attempt to acquire the lock again + * without sleeping. + */ + if ((wstate == WRITER_HANDOFF) && + (rwsem_spin_on_owner(sem, 0) == OWNER_NULL)) + goto trylock_again; + /* Block until there are no active lockers. */ for (;;) { if (signal_pending_state(state, current)) @@ -1224,7 +1264,7 @@ wait: break; } } - +trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); @@ -1322,11 +1362,14 @@ static inline int __down_read_killable(struct rw_semaphore *sem) static inline int __down_read_trylock(struct rw_semaphore *sem) { + long tmp; + + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + /* * Optimize for the case when the rwsem is not locked at all. */ - long tmp = RWSEM_UNLOCKED_VALUE; - + tmp = RWSEM_UNLOCKED_VALUE; do { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, tmp + RWSEM_READER_BIAS)) { @@ -1367,8 +1410,11 @@ static inline int __down_write_killable(struct rw_semaphore *sem) static inline int __down_write_trylock(struct rw_semaphore *sem) { - long tmp = RWSEM_UNLOCKED_VALUE; + long tmp; + + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + tmp = RWSEM_UNLOCKED_VALUE; if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); @@ -1384,7 +1430,9 @@ inline void __up_read(struct rw_semaphore *sem) { long tmp; + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); @@ -1402,12 +1450,14 @@ static inline void __up_write(struct rw_semaphore *sem) { long tmp; + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); /* * sem->owner may differ from current if the ownership is transferred * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. */ DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); + rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) diff --git a/kernel/memremap.c b/kernel/memremap.c deleted file mode 100644 index 6ee03a816d67..000000000000 --- a/kernel/memremap.c +++ /dev/null @@ -1,405 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 2015 Intel Corporation. All rights reserved. 
*/ -#include <linux/device.h> -#include <linux/io.h> -#include <linux/kasan.h> -#include <linux/memory_hotplug.h> -#include <linux/mm.h> -#include <linux/pfn_t.h> -#include <linux/swap.h> -#include <linux/swapops.h> -#include <linux/types.h> -#include <linux/wait_bit.h> -#include <linux/xarray.h> - -static DEFINE_XARRAY(pgmap_array); -#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) -#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) - -#ifdef CONFIG_DEV_PAGEMAP_OPS -DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL(devmap_managed_key); -static atomic_t devmap_managed_enable; - -static void devmap_managed_enable_put(void *data) -{ - if (atomic_dec_and_test(&devmap_managed_enable)) - static_branch_disable(&devmap_managed_key); -} - -static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) -{ - if (!pgmap->ops || !pgmap->ops->page_free) { - WARN(1, "Missing page_free method\n"); - return -EINVAL; - } - - if (atomic_inc_return(&devmap_managed_enable) == 1) - static_branch_enable(&devmap_managed_key); - return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL); -} -#else -static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) -{ - return -EINVAL; -} -#endif /* CONFIG_DEV_PAGEMAP_OPS */ - -static void pgmap_array_delete(struct resource *res) -{ - xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), - NULL, GFP_KERNEL); - synchronize_rcu(); -} - -static unsigned long pfn_first(struct dev_pagemap *pgmap) -{ - return PHYS_PFN(pgmap->res.start) + - vmem_altmap_offset(pgmap_altmap(pgmap)); -} - -static unsigned long pfn_end(struct dev_pagemap *pgmap) -{ - const struct resource *res = &pgmap->res; - - return (res->start + resource_size(res)) >> PAGE_SHIFT; -} - -static unsigned long pfn_next(unsigned long pfn) -{ - if (pfn % 1024 == 0) - cond_resched(); - return pfn + 1; -} - -#define for_each_device_pfn(pfn, map) \ - for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) - -static void dev_pagemap_kill(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->kill) - pgmap->ops->kill(pgmap); - else - percpu_ref_kill(pgmap->ref); -} - -static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->cleanup) { - pgmap->ops->cleanup(pgmap); - } else { - wait_for_completion(&pgmap->done); - percpu_ref_exit(pgmap->ref); - } -} - -static void devm_memremap_pages_release(void *data) -{ - struct dev_pagemap *pgmap = data; - struct device *dev = pgmap->dev; - struct resource *res = &pgmap->res; - unsigned long pfn; - int nid; - - dev_pagemap_kill(pgmap); - for_each_device_pfn(pfn, pgmap) - put_page(pfn_to_page(pfn)); - dev_pagemap_cleanup(pgmap); - - /* pages are dead and unused, undo the arch mapping */ - nid = page_to_nid(pfn_to_page(PHYS_PFN(res->start))); - - mem_hotplug_begin(); - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - pfn = PHYS_PFN(res->start); - __remove_pages(page_zone(pfn_to_page(pfn)), pfn, - PHYS_PFN(resource_size(res)), NULL); - } else { - arch_remove_memory(nid, res->start, resource_size(res), - pgmap_altmap(pgmap)); - kasan_remove_zero_shadow(__va(res->start), resource_size(res)); - } - mem_hotplug_done(); - - untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); - pgmap_array_delete(res); - dev_WARN_ONCE(dev, pgmap->altmap.alloc, - "%s: failed to free all reserved pages\n", __func__); -} - -static void dev_pagemap_percpu_release(struct percpu_ref *ref) -{ - struct dev_pagemap *pgmap = - container_of(ref, struct dev_pagemap, 
internal_ref); - - complete(&pgmap->done); -} - -/** - * devm_memremap_pages - remap and provide memmap backing for the given resource - * @dev: hosting device for @res - * @pgmap: pointer to a struct dev_pagemap - * - * Notes: - * 1/ At a minimum the res and type members of @pgmap must be initialized - * by the caller before passing it to this function - * - * 2/ The altmap field may optionally be initialized, in which case - * PGMAP_ALTMAP_VALID must be set in pgmap->flags. - * - * 3/ The ref field may optionally be provided, in which pgmap->ref must be - * 'live' on entry and will be killed and reaped at - * devm_memremap_pages_release() time, or if this routine fails. - * - * 4/ res is expected to be a host memory range that could feasibly be - * treated as a "System RAM" range, i.e. not a device mmio range, but - * this is not enforced. - */ -void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -{ - struct resource *res = &pgmap->res; - struct dev_pagemap *conflict_pgmap; - struct mhp_restrictions restrictions = { - /* - * We do not want any optional features only our own memmap - */ - .altmap = pgmap_altmap(pgmap), - }; - pgprot_t pgprot = PAGE_KERNEL; - int error, nid, is_ram; - bool need_devmap_managed = true; - - switch (pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { - WARN(1, "Device private memory not supported\n"); - return ERR_PTR(-EINVAL); - } - if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { - WARN(1, "Missing migrate_to_ram method\n"); - return ERR_PTR(-EINVAL); - } - break; - case MEMORY_DEVICE_FS_DAX: - if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || - IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { - WARN(1, "File system DAX not supported\n"); - return ERR_PTR(-EINVAL); - } - break; - case MEMORY_DEVICE_DEVDAX: - case MEMORY_DEVICE_PCI_P2PDMA: - need_devmap_managed = false; - break; - default: - WARN(1, "Invalid pgmap type %d\n", pgmap->type); - break; - } - - if (!pgmap->ref) { - if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) - return ERR_PTR(-EINVAL); - - init_completion(&pgmap->done); - error = percpu_ref_init(&pgmap->internal_ref, - dev_pagemap_percpu_release, 0, GFP_KERNEL); - if (error) - return ERR_PTR(error); - pgmap->ref = &pgmap->internal_ref; - } else { - if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); - } - } - - if (need_devmap_managed) { - error = devmap_managed_enable_get(dev, pgmap); - if (error) - return ERR_PTR(error); - } - - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); - if (conflict_pgmap) { - dev_WARN(dev, "Conflicting mapping in same section\n"); - put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; - } - - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); - if (conflict_pgmap) { - dev_WARN(dev, "Conflicting mapping in same section\n"); - put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; - } - - is_ram = region_intersects(res->start, resource_size(res), - IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - - if (is_ram != REGION_DISJOINT) { - WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__, - is_ram == REGION_MIXED ? 
"mixed" : "ram", res); - error = -ENXIO; - goto err_array; - } - - pgmap->dev = dev; - - error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), - PHYS_PFN(res->end), pgmap, GFP_KERNEL)); - if (error) - goto err_array; - - nid = dev_to_node(dev); - if (nid < 0) - nid = numa_mem_id(); - - error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0, - resource_size(res)); - if (error) - goto err_pfn_remap; - - mem_hotplug_begin(); - - /* - * For device private memory we call add_pages() as we only need to - * allocate and initialize struct page for the device memory. More- - * over the device memory is un-accessible thus we do not want to - * create a linear mapping for the memory like arch_add_memory() - * would do. - * - * For all other device memory types, which are accessible by - * the CPU, we do want the linear mapping and thus use - * arch_add_memory(). - */ - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - error = add_pages(nid, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), &restrictions); - } else { - error = kasan_add_zero_shadow(__va(res->start), resource_size(res)); - if (error) { - mem_hotplug_done(); - goto err_kasan; - } - - error = arch_add_memory(nid, res->start, resource_size(res), - &restrictions); - } - - if (!error) { - struct zone *zone; - - zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; - move_pfn_range_to_zone(zone, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), restrictions.altmap); - } - - mem_hotplug_done(); - if (error) - goto err_add_memory; - - /* - * Initialization of the pages has been deferred until now in order - * to allow us to do the work while not holding the hotplug lock. - */ - memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], - PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), pgmap); - percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); - - error = devm_add_action_or_reset(dev, devm_memremap_pages_release, - pgmap); - if (error) - return ERR_PTR(error); - - return __va(res->start); - - err_add_memory: - kasan_remove_zero_shadow(__va(res->start), resource_size(res)); - err_kasan: - untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); - err_pfn_remap: - pgmap_array_delete(res); - err_array: - dev_pagemap_kill(pgmap); - dev_pagemap_cleanup(pgmap); - return ERR_PTR(error); -} -EXPORT_SYMBOL_GPL(devm_memremap_pages); - -void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) -{ - devm_release_action(dev, devm_memremap_pages_release, pgmap); -} -EXPORT_SYMBOL_GPL(devm_memunmap_pages); - -unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) -{ - /* number of pfns from base where pfn_to_page() is valid */ - if (altmap) - return altmap->reserve + altmap->free; - return 0; -} - -void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) -{ - altmap->alloc -= nr_pfns; -} - -/** - * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn - * @pfn: page frame number to lookup page_map - * @pgmap: optional known pgmap that already has a reference - * - * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap - * is non-NULL but does not cover @pfn the reference to it will be released. - */ -struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap) -{ - resource_size_t phys = PFN_PHYS(pfn); - - /* - * In the cached case we're already holding a live reference. 
- */ - if (pgmap) { - if (phys >= pgmap->res.start && phys <= pgmap->res.end) - return pgmap; - put_dev_pagemap(pgmap); - } - - /* fall back to slow path lookup */ - rcu_read_lock(); - pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); - if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) - pgmap = NULL; - rcu_read_unlock(); - - return pgmap; -} -EXPORT_SYMBOL_GPL(get_dev_pagemap); - -#ifdef CONFIG_DEV_PAGEMAP_OPS -void __put_devmap_managed_page(struct page *page) -{ - int count = page_ref_dec_return(page); - - /* - * If refcount is 1 then page is freed and refcount is stable as nobody - * holds a reference on the page. - */ - if (count == 1) { - /* Clear Active bit in case of parallel mark_page_accessed */ - __ClearPageActive(page); - __ClearPageWaiters(page); - - mem_cgroup_uncharge(page); - - page->pgmap->ops->page_free(page); - } else if (!count) - __put_page(page); -} -EXPORT_SYMBOL(__put_devmap_managed_page); -#endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/kernel/module.c b/kernel/module.c index 5933395af9a0..9ee93421269c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -65,9 +65,9 @@ /* * Modules' sections will be aligned on page boundaries * to ensure complete separation of code and data, but - * only when CONFIG_STRICT_MODULE_RWX=y + * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y */ -#ifdef CONFIG_STRICT_MODULE_RWX +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX # define debug_align(X) ALIGN(X, PAGE_SIZE) #else # define debug_align(X) (X) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 480edf328b51..7644eda17d62 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -7,7 +7,7 @@ menu "RCU Subsystem" config TREE_RCU bool - default y if !PREEMPT && SMP + default y if !PREEMPTION && SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -16,7 +16,7 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPT + default y if PREEMPTION help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or @@ -28,7 +28,7 @@ config PREEMPT_RCU config TINY_RCU bool - default y if !PREEMPT && !SMP + default y if !PREEMPTION && !SMP help This option selects the RCU implementation that is designed for UP systems from which real-time response @@ -70,7 +70,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU - def_bool PREEMPT + def_bool PREEMPTION select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 5ec3ea4028e2..4aa02eee8f6c 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -8,6 +8,17 @@ menu "RCU Debugging" config PROVE_RCU def_bool PROVE_LOCKING +config PROVE_RCU_LIST + bool "RCU list lockdep debugging" + depends on PROVE_RCU && RCU_EXPERT + default n + help + Enable RCU lockdep checking for list usages. By default it is + turned off since there are several list RCU users that still + need to be converted to pass a lockdep expression. To prevent + false-positive splats, we keep it default disabled but once all + users are converted, we can remove this config option. 
+ config TORTURE_TEST tristate default n diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 5290b01de534..8fd4f82c9b3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -227,6 +227,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) #ifdef CONFIG_RCU_STALL_COMMON +extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9bd5f6023c21..495c58ce1640 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -24,6 +24,49 @@ void rcu_cblist_init(struct rcu_cblist *rclp) } /* + * Enqueue an rcu_head structure onto the specified callback list. + * This function assumes that the callback is non-lazy because it + * is intended for use by no-CBs CPUs, which do not distinguish + * between lazy and non-lazy RCU callbacks. + */ +void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp) +{ + *rclp->tail = rhp; + rclp->tail = &rhp->next; + WRITE_ONCE(rclp->len, rclp->len + 1); +} + +/* + * Flush the second rcu_cblist structure onto the first one, obliterating + * any contents of the first. If rhp is non-NULL, enqueue it as the sole + * element of the second rcu_cblist structure, but ensuring that the second + * rcu_cblist structure, if initially non-empty, always appears non-empty + * throughout the process. If rdp is NULL, the second rcu_cblist structure + * is instead initialized to empty. + */ +void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, + struct rcu_cblist *srclp, + struct rcu_head *rhp) +{ + drclp->head = srclp->head; + if (drclp->head) + drclp->tail = srclp->tail; + else + drclp->tail = &drclp->head; + drclp->len = srclp->len; + drclp->len_lazy = srclp->len_lazy; + if (!rhp) { + rcu_cblist_init(srclp); + } else { + rhp->next = NULL; + srclp->head = rhp; + srclp->tail = &rhp->next; + WRITE_ONCE(srclp->len, 1); + srclp->len_lazy = 0; + } +} + +/* * Dequeue the oldest rcu_head structure from the specified callback * list. This function assumes that the callback is non-lazy, but * the caller can later invoke rcu_cblist_dequeued_lazy() if it @@ -44,6 +87,67 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) return rhp; } +/* Set the length of an rcu_segcblist structure. */ +void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) +{ +#ifdef CONFIG_RCU_NOCB_CPU + atomic_long_set(&rsclp->len, v); +#else + WRITE_ONCE(rsclp->len, v); +#endif +} + +/* + * Increase the numeric length of an rcu_segcblist structure by the + * specified amount, which can be negative. This can cause the ->len + * field to disagree with the actual number of callbacks on the structure. + * This increase is fully ordered with respect to the callers accesses + * both before and after. + */ +void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) +{ +#ifdef CONFIG_RCU_NOCB_CPU + smp_mb__before_atomic(); /* Up to the caller! */ + atomic_long_add(v, &rsclp->len); + smp_mb__after_atomic(); /* Up to the caller! */ +#else + smp_mb(); /* Up to the caller! */ + WRITE_ONCE(rsclp->len, rsclp->len + v); + smp_mb(); /* Up to the caller! */ +#endif +} + +/* + * Increase the numeric length of an rcu_segcblist structure by one. + * This can cause the ->len field to disagree with the actual number of + * callbacks on the structure. This increase is fully ordered with respect + * to the callers accesses both before and after. 
+ */ +void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp) +{ + rcu_segcblist_add_len(rsclp, 1); +} + +/* + * Exchange the numeric length of the specified rcu_segcblist structure + * with the specified value. This can cause the ->len field to disagree + * with the actual number of callbacks on the structure. This exchange is + * fully ordered with respect to the callers accesses both before and after. + */ +long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) +{ +#ifdef CONFIG_RCU_NOCB_CPU + return atomic_long_xchg(&rsclp->len, v); +#else + long ret = rsclp->len; + + smp_mb(); /* Up to the caller! */ + WRITE_ONCE(rsclp->len, v); + smp_mb(); /* Up to the caller! */ + return ret; +#endif +} + /* * Initialize an rcu_segcblist structure. */ @@ -56,8 +160,9 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) rsclp->head = NULL; for (i = 0; i < RCU_CBLIST_NSEGS; i++) rsclp->tails[i] = &rsclp->head; - rsclp->len = 0; + rcu_segcblist_set_len(rsclp, 0); rsclp->len_lazy = 0; + rsclp->enabled = 1; } /* @@ -69,7 +174,16 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); - rsclp->tails[RCU_NEXT_TAIL] = NULL; + rsclp->enabled = 0; +} + +/* + * Mark the specified rcu_segcblist structure as offloaded. This + * structure must be empty. + */ +void rcu_segcblist_offload(struct rcu_segcblist *rsclp) +{ + rsclp->offloaded = 1; } /* @@ -118,6 +232,18 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) } /* + * Return false if there are no CBs awaiting grace periods, otherwise, + * return true and store the nearest waited-upon grace period into *lp. + */ +bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp) +{ + if (!rcu_segcblist_pend_cbs(rsclp)) + return false; + *lp = rsclp->gp_seq[RCU_WAIT_TAIL]; + return true; +} + +/* * Enqueue the specified callback onto the specified rcu_segcblist * structure, updating accounting as needed. Note that the ->len * field may be accessed locklessly, hence the WRITE_ONCE(). @@ -129,13 +255,13 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp, bool lazy) { - WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ + rcu_segcblist_inc_len(rsclp); if (lazy) rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is enqueued. */ rhp->next = NULL; - *rsclp->tails[RCU_NEXT_TAIL] = rhp; - rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; + WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); + WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next); } /* @@ -155,7 +281,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, if (rcu_segcblist_n_cbs(rsclp) == 0) return false; - WRITE_ONCE(rsclp->len, rsclp->len + 1); + rcu_segcblist_inc_len(rsclp); if (lazy) rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is entrained. 
*/ @@ -163,9 +289,9 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) if (rsclp->tails[i] != rsclp->tails[i - 1]) break; - *rsclp->tails[i] = rhp; + WRITE_ONCE(*rsclp->tails[i], rhp); for (; i <= RCU_NEXT_TAIL; i++) - rsclp->tails[i] = &rhp->next; + WRITE_ONCE(rsclp->tails[i], &rhp->next); return true; } @@ -182,9 +308,8 @@ void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { rclp->len_lazy += rsclp->len_lazy; - rclp->len += rsclp->len; rsclp->len_lazy = 0; - WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ + rclp->len = rcu_segcblist_xchg_len(rsclp, 0); } /* @@ -200,12 +325,12 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_ready_cbs(rsclp)) return; /* Nothing to do. */ *rclp->tail = rsclp->head; - rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; + WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]); + WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); rclp->tail = rsclp->tails[RCU_DONE_TAIL]; for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) - rsclp->tails[i] = &rsclp->head; + WRITE_ONCE(rsclp->tails[i], &rsclp->head); } /* @@ -224,9 +349,9 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, return; /* Nothing to do. */ *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; + WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; + WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]); } /* @@ -237,8 +362,7 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { rsclp->len_lazy += rclp->len_lazy; - /* ->len sampled locklessly. */ - WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rcu_segcblist_add_len(rsclp, rclp->len); rclp->len_lazy = 0; rclp->len = 0; } @@ -255,10 +379,10 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, if (!rclp->head) return; /* No callbacks to move. */ *rclp->tail = rsclp->head; - rsclp->head = rclp->head; + WRITE_ONCE(rsclp->head, rclp->head); for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) if (&rsclp->head == rsclp->tails[i]) - rsclp->tails[i] = rclp->tail; + WRITE_ONCE(rsclp->tails[i], rclp->tail); else break; rclp->head = NULL; @@ -274,8 +398,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, { if (!rclp->head) return; /* Nothing to do. */ - *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; - rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); + WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); rclp->head = NULL; rclp->tail = &rclp->head; } @@ -299,7 +423,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) break; - rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]); } /* If no callbacks moved, nothing more need be done. */ @@ -308,7 +432,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) /* Clean up tail pointers that might have been misordered above. 
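Many of the hunks above convert plain assignments to ->head and ->tails[] into WRITE_ONCE() so that lockless samplers never observe a torn or compiler-cached pointer; READ_ONCE() is the matching load side, as in the reworked rcu_segcblist_restempty(). In portable C11 the closest analogue is an atomic access of the pointer; the sketch below also uses release/acquire so a reader that sees the pointer sees the pointed-to data, ordering the kernel code gets from its locks and barriers instead:

#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *next;
	int payload;
};

static _Atomic(struct node *) list_head;

/* Writer: initialize the node fully, then publish the pointer in one go. */
static void publish(struct node *n, int payload)
{
	n->payload = payload;
	n->next = NULL;
	atomic_store_explicit(&list_head, n, memory_order_release);
}

/* Lockless reader: one untorn load of the current head (or NULL). */
static int peek_payload(void)
{
	struct node *n = atomic_load_explicit(&list_head, memory_order_acquire);

	return n ? n->payload : -1;
}

int main(void)
{
	static struct node n;

	publish(&n, 42);
	return peek_payload() == 42 ? 0 : 1;
}

The essential property being bought here is only tearing and re-reading: a single machine-word store or load that the compiler may neither split nor repeat.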
*/ for (j = RCU_WAIT_TAIL; j < i; j++) - rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]); /* * Callbacks moved, so clean up the misordered ->tails[] pointers @@ -319,7 +443,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) break; /* No more callbacks. */ - rsclp->tails[j] = rsclp->tails[i]; + WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]); rsclp->gp_seq[j] = rsclp->gp_seq[i]; } } @@ -384,7 +508,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * structure other than in the RCU_NEXT_TAIL segment. */ for (; i < RCU_NEXT_TAIL; i++) { - rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]); rsclp->gp_seq[i] = seq; } return true; diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 71b64648464e..815c2fdd3fcc 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -9,6 +9,12 @@ #include <linux/rcu_segcblist.h> +/* Return number of callbacks in the specified callback list. */ +static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) +{ + return READ_ONCE(rclp->len); +} + /* * Account for the fact that a previously dequeued callback turned out * to be marked as lazy. @@ -19,6 +25,10 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) } void rcu_cblist_init(struct rcu_cblist *rclp); +void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); +void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, + struct rcu_cblist *srclp, + struct rcu_head *rhp); struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); /* @@ -36,13 +46,17 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); */ static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) { - return !rsclp->head; + return !READ_ONCE(rsclp->head); } /* Return number of callbacks in segmented callback list. */ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) { +#ifdef CONFIG_RCU_NOCB_CPU + return atomic_long_read(&rsclp->len); +#else return READ_ONCE(rsclp->len); +#endif } /* Return number of lazy callbacks in segmented callback list. */ @@ -54,16 +68,22 @@ static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) /* Return number of lazy callbacks in segmented callback list. */ static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) { - return rsclp->len - rsclp->len_lazy; + return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy; } /* * Is the specified rcu_segcblist enabled, for example, not corresponding - * to an offline or callback-offloaded CPU? + * to an offline CPU? */ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) { - return !!rsclp->tails[RCU_NEXT_TAIL]; + return rsclp->enabled; +} + +/* Is the specified rcu_segcblist offloaded? */ +static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) +{ + return rsclp->offloaded; } /* @@ -73,36 +93,18 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) */ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) { - return !*rsclp->tails[seg]; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. 
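rcu_segcblist_advance() and rcu_segcblist_accelerate() above order grace-period numbers with ULONG_CMP_LT(), which keeps working when the sequence counter wraps. The kernel has its own macro for this; the version below is written out as a stand-alone illustration of the technique:

#include <assert.h>
#include <limits.h>

/* True if a is "before" b, modulo wraparound of unsigned long. */
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long near_wrap = ULONG_MAX - 1;

	assert(ULONG_CMP_LT(1UL, 2UL));
	assert(!ULONG_CMP_LT(2UL, 1UL));
	/* Still correct across the wrap: ULONG_MAX - 1 is "before" 1. */
	assert(ULONG_CMP_LT(near_wrap, near_wrap + 3));
	return 0;
}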
- */ -static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) -{ - return rsclp->head; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); - return rsclp->tails[RCU_NEXT_TAIL]; + return !READ_ONCE(*READ_ONCE(rsclp->tails[seg])); } +void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); +void rcu_segcblist_offload(struct rcu_segcblist *rsclp); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); +bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp, bool lazy); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 7a6890b23c5f..5a879d073c1c 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)"); static int nrealreaders; static int nrealwriters; @@ -375,6 +375,14 @@ rcu_perf_writer(void *arg) if (holdoff) schedule_timeout_uninterruptible(holdoff * HZ); + /* + * Wait until rcu_end_inkernel_boot() is called for normal GP tests + * so that RCU is not always expedited for normal GP tests. + * The system_state test is approximate, but works well in practice. + */ + while (!gp_exp && system_state != SYSTEM_RUNNING) + schedule_timeout_uninterruptible(1); + t = ktime_get_mono_fast_ns(); if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fce4e7e6f502..3c9feca1eab1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -161,6 +161,7 @@ static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; /* did rcu_barrier test succeed? */ static struct list_head rcu_torture_removed; +static unsigned long shutdown_jiffies; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -228,6 +229,15 @@ static u64 notrace rcu_trace_clock_local(void) } #endif /* #else #ifdef CONFIG_RCU_TRACE */ +/* + * Stop aggressive CPU-hog tests a bit before the end of the test in order + * to avoid interfering with test shutdown. + */ +static bool shutdown_time_arrived(void) +{ + return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ); +} + static unsigned long boost_starttime; /* jiffies of next boost test start. */ static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ @@ -1713,12 +1723,14 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) } // Give the scheduler a chance, even on nohz_full CPUs. 
-static void rcu_torture_fwd_prog_cond_resched(void) +static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) { if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { - if (need_resched()) + // Real call_rcu() floods hit userspace, so emulate that. + if (need_resched() || (iter & 0xfff)) schedule(); } else { + // No userspace emulation: CB invocation throttles call_rcu() cond_resched(); } } @@ -1746,7 +1758,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) spin_unlock_irqrestore(&rcu_fwd_lock, flags); kfree(rfcp); freed++; - rcu_torture_fwd_prog_cond_resched(); + rcu_torture_fwd_prog_cond_resched(freed); } return freed; } @@ -1785,15 +1797,17 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) WRITE_ONCE(rcu_fwd_startat, jiffies); stopat = rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { idx = cur_ops->readlock(); udelay(10); cur_ops->readunlock(idx); if (!fwd_progress_need_resched || need_resched()) - rcu_torture_fwd_prog_cond_resched(); + rcu_torture_fwd_prog_cond_resched(1); } (*tested_tries)++; if (!time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { (*tested)++; cver = READ_ONCE(rcu_torture_current_version) - cver; @@ -1852,6 +1866,7 @@ static void rcu_torture_fwd_prog_cr(void) gps = cur_ops->get_gp_seq(); rcu_launder_gp_seq_start = gps; while (time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { rfcp = READ_ONCE(rcu_fwd_cb_head); rfcpn = NULL; @@ -1875,7 +1890,7 @@ static void rcu_torture_fwd_prog_cr(void) rfcp->rfc_gps = 0; } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); - rcu_torture_fwd_prog_cond_resched(); + rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); } stoppedat = jiffies; n_launders_cb_snap = READ_ONCE(n_launders_cb); @@ -1884,7 +1899,8 @@ static void rcu_torture_fwd_prog_cr(void) cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. 
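The reworked rcu_torture_fwd_prog_cond_resched() above decides on the iteration count: iter & 0xfff is a cheap power-of-two-mask test that is zero only once every 4096 passes, so the preemptible nohz_full case ends up calling schedule() on nearly every iteration to emulate a user-space call_rcu() flood. The same counter-mask idiom in a free-standing form, with sched_yield() standing in for schedule():

#include <sched.h>
#include <stdio.h>

int main(void)
{
	unsigned long iter;

	for (iter = 0; iter < 65536; iter++) {
		/* ... one small unit of work per pass ... */
		if (iter & 0xfff)
			sched_yield();	/* All but every 4096th pass. */
		else
			printf("checkpoint at iteration %lu\n", iter);
	}
	return 0;
}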
*/ (void)rcu_torture_fwd_prog_cbfree(); - if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { + if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && + !shutdown_time_arrived()) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", __func__, @@ -2160,6 +2176,7 @@ rcu_torture_cleanup(void) return; } + show_rcu_gp_kthreads(); rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); torture_stop_kthread(rcu_torture_stall, stall_task); @@ -2465,6 +2482,7 @@ rcu_torture_init(void) goto unwind; rcutor_hp = firsterr; } + shutdown_jiffies = jiffies + shutdown_secs * HZ; firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) goto unwind; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index cf0e886314f2..5dffade2d7cd 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1279,8 +1279,9 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) c0 = l0 - u0; c1 = l1 - u1; - pr_cont(" %d(%ld,%ld %1p)", - cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); + pr_cont(" %d(%ld,%ld %c)", + cpu, c0, c1, + "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); s0 += c0; s1 += c1; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a14e5fbbea46..81105141b6a8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -56,6 +56,7 @@ #include <linux/smpboot.h> #include <linux/jiffies.h> #include <linux/sched/isolation.h> +#include <linux/sched/clock.h> #include "../time/tick-internal.h" #include "tree.h" @@ -210,9 +211,9 @@ static long rcu_get_n_cbs_cpu(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */ + if (rcu_segcblist_is_enabled(&rdp->cblist)) return rcu_segcblist_n_cbs(&rdp->cblist); - return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */ + return 0; } void rcu_softirq_qs(void) @@ -416,6 +417,12 @@ module_param(qlowmark, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; +static int rcu_divisor = 7; +module_param(rcu_divisor, int, 0644); + +/* Force an exit from rcu_do_batch() after 3 milliseconds. */ +static long rcu_resched_ns = 3 * NSEC_PER_MSEC; +module_param(rcu_resched_ns, long, 0644); /* * How long the grace period must be before we start recruiting @@ -1251,6 +1258,7 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) unsigned long gp_seq_req; bool ret = false; + rcu_lockdep_assert_cblist_protected(rdp); raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ @@ -1292,7 +1300,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, unsigned long c; bool needwake; - lockdep_assert_irqs_disabled(); + rcu_lockdep_assert_cblist_protected(rdp); c = rcu_seq_snap(&rcu_state.gp_seq); if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { /* Old request still live, so mark recent callbacks. */ @@ -1318,6 +1326,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, */ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) { + rcu_lockdep_assert_cblist_protected(rdp); raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. 
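The srcu_torture_stats_print() hunk above replaces the printed head pointer with "C."[rcu_segcblist_empty(&sdp->srcu_cblist)], indexing a two-character string literal with a 0/1 condition: 'C' when callbacks are present, '.' when the list is empty. A tiny demonstration of the idiom:

#include <stdio.h>

int main(void)
{
	int empty;

	for (empty = 0; empty <= 1; empty++)
		/* Index 0 -> 'C' (has callbacks), index 1 -> '.' (empty). */
		printf("empty=%d prints '%c'\n", empty, "C."[empty]);
	return 0;
}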
*/ @@ -1335,6 +1344,21 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) } /* + * Move and classify callbacks, but only if doing so won't require + * that the RCU grace-period kthread be awakened. + */ +static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, + struct rcu_data *rdp) +{ + rcu_lockdep_assert_cblist_protected(rdp); + if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || + !raw_spin_trylock_rcu_node(rnp)) + return; + WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); + raw_spin_unlock_rcu_node(rnp); +} + +/* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node * structure corresponding to the current CPU, and must have irqs disabled. @@ -1342,8 +1366,10 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) */ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { - bool ret; + bool ret = false; bool need_gp; + const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_is_offloaded(&rdp->cblist); raw_lockdep_assert_held_rcu_node(rnp); @@ -1353,10 +1379,12 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || unlikely(READ_ONCE(rdp->gpwrap))) { - ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */ + if (!offloaded) + ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); } else { - ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */ + if (!offloaded) + ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */ } /* Now handle the beginnings of any new-to-this-CPU grace periods. */ @@ -1657,6 +1685,7 @@ static void rcu_gp_cleanup(void) unsigned long gp_duration; bool needgp = false; unsigned long new_gp_seq; + bool offloaded; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(); struct swait_queue_head *sq; @@ -1722,7 +1751,9 @@ static void rcu_gp_cleanup(void) needgp = true; } /* Advance CBs to reduce false positives below. */ - if (!rcu_accelerate_cbs(rnp, rdp) && needgp) { + offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_is_offloaded(&rdp->cblist); + if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); rcu_state.gp_req_activity = jiffies; trace_rcu_grace_period(rcu_state.name, @@ -1881,7 +1912,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1916,7 +1947,9 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needwake; + bool needwake = false; + const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_node *rnp; rnp = rdp->mynode; @@ -1943,7 +1976,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. 
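Several functions above now compute const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rcu_segcblist_is_offloaded(&rdp->cblist) once and branch on it. Because IS_ENABLED() folds to a compile-time 0 or 1, this lets the compiler drop the offloaded branches in !CONFIG_RCU_NOCB_CPU builds while the source still reads as a single runtime test. A user-space approximation of the pattern, with a plain 0/1 macro standing in for the Kconfig machinery (FEATURE_X_ENABLED and runtime_offloaded() are invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

/* Compile-time knob; stands in for a Kconfig option in this sketch. */
#define FEATURE_X_ENABLED	0

static bool runtime_offloaded(void)
{
	return true;	/* Imagine a per-CPU flag lookup here. */
}

static void do_work(void)
{
	/* The constant 0/1 folds in, so the dead branch is compiled out. */
	const bool offloaded = FEATURE_X_ENABLED && runtime_offloaded();

	if (offloaded)
		printf("offloaded path\n");
	else
		printf("default path\n");
}

int main(void)
{
	do_work();
	return 0;
}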
*/ - needwake = rcu_accelerate_cbs(rnp, rdp); + if (!offloaded) + needwake = rcu_accelerate_cbs(rnp, rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ @@ -2077,9 +2111,12 @@ int rcutree_dead_cpu(unsigned int cpu) static void rcu_do_batch(struct rcu_data *rdp) { unsigned long flags; + const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count; + long pending, tlimit = 0; /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { @@ -2099,13 +2136,19 @@ static void rcu_do_batch(struct rcu_data *rdp) * callback counts, as rcu_barrier() needs to be conservative. */ local_irq_save(flags); + rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); - bl = rdp->blimit; + pending = rcu_segcblist_n_cbs(&rdp->cblist); + bl = max(rdp->blimit, pending >> rcu_divisor); + if (unlikely(bl > 100)) + tlimit = local_clock() + rcu_resched_ns; trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); - local_irq_restore(flags); + if (offloaded) + rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); + rcu_nocb_unlock_irqrestore(rdp, flags); /* Invoke callbacks. */ rhp = rcu_cblist_dequeue(&rcl); @@ -2117,13 +2160,29 @@ static void rcu_do_batch(struct rcu_data *rdp) * Stop only if limit reached and CPU has something to do. * Note: The rcl structure counts down from zero. */ - if (-rcl.len >= bl && + if (-rcl.len >= bl && !offloaded && (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; + if (unlikely(tlimit)) { + /* only call local_clock() every 32 callbacks */ + if (likely((-rcl.len & 31) || local_clock() < tlimit)) + continue; + /* Exceeded the time limit, so leave. */ + break; + } + if (offloaded) { + WARN_ON_ONCE(in_serving_softirq()); + local_bh_enable(); + lockdep_assert_irqs_enabled(); + cond_resched_tasks_rcu_qs(); + lockdep_assert_irqs_enabled(); + local_bh_disable(); + } } local_irq_save(flags); + rcu_nocb_lock(rdp); count = -rcl.len; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2149,12 +2208,14 @@ static void rcu_do_batch(struct rcu_data *rdp) * The following usually indicates a double call_rcu(). To track * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. */ - WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); + WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist)); + WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + count != 0 && rcu_segcblist_empty(&rdp->cblist)); - local_irq_restore(flags); + rcu_nocb_unlock_irqrestore(rdp, flags); /* Re-invoke RCU core processing if there are callbacks remaining. 
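rcu_do_batch() above now bounds callback invocation twice over: the batch limit scales with the backlog (max(blimit, pending >> rcu_divisor)), and large batches additionally get a wall-clock deadline of rcu_resched_ns, with the comparatively expensive clock read taken only once every 32 callbacks. A stand-alone sketch of that loop shape, with clock_gettime() standing in for local_clock() and the numbers invented for the example:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

int main(void)
{
	const long pending = 100000;		/* Pretend callback backlog. */
	const long blimit = 10;			/* Floor, like rdp->blimit. */
	const int divisor = 7;			/* Like the rcu_divisor knob. */
	const uint64_t budget_ns = 3000000;	/* Like rcu_resched_ns (3 ms). */
	long bl = pending >> divisor;
	uint64_t tlimit = 0;
	long done = 0;

	if (bl < blimit)
		bl = blimit;		/* bl = max(blimit, pending >> divisor) */
	if (bl > 100)			/* Arm the deadline only for big batches. */
		tlimit = now_ns() + budget_ns;

	while (done < pending) {
		/* ... invoke one callback here ... */
		done++;
		if (done >= bl)
			break;		/* Hit the count limit. */
		if (tlimit && !(done & 31) && now_ns() >= tlimit)
			break;		/* Deadline hit; clock sampled every 32 CBs. */
	}
	printf("invoked %ld of %ld pending callbacks\n", done, pending);
	return 0;
}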
*/ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) + if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_core(); } @@ -2205,7 +2266,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT) || + if (!IS_ENABLED(CONFIG_PREEMPTION) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they @@ -2280,6 +2341,8 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_is_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2299,7 +2362,7 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist)) { + rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) { local_irq_save(flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); @@ -2309,7 +2372,7 @@ static __latent_entropy void rcu_core(void) rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (rcu_segcblist_ready_cbs(&rdp->cblist) && + if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) rcu_do_batch(rdp); @@ -2489,10 +2552,11 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) +__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) { unsigned long flags; struct rcu_data *rdp; + bool was_alldone; /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); @@ -2514,28 +2578,18 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ - if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { - int offline; - - if (cpu != -1) - rdp = per_cpu_ptr(&rcu_data, cpu); - if (likely(rdp->mynode)) { - /* Post-boot, so this should be for a no-CBs CPU. */ - offline = !__call_rcu_nocb(rdp, head, lazy, flags); - WARN_ON_ONCE(offline); - /* Offline CPU, _call_rcu() illegal, leak callback. */ - local_irq_restore(flags); - return; - } - /* - * Very early boot, before rcu_init(). Initialize if needed - * and then drop through to queue the callback. - */ - WARN_ON_ONCE(cpu != -1); + if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) { + // This can trigger due to call_rcu() from offline CPU: + WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE); WARN_ON_ONCE(!rcu_is_watching()); + // Very early boot, before rcu_init(). Initialize if needed + // and then drop through to queue the callback. if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); } + if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) + return; // Enqueued onto ->nocb_bypass, so just leave. + /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ rcu_segcblist_enqueue(&rdp->cblist, head, lazy); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, @@ -2548,8 +2602,13 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. 
*/ - __call_rcu_core(rdp, head, flags); - local_irq_restore(flags); + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { + __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ + } else { + __call_rcu_core(rdp, head, flags); + local_irq_restore(flags); + } } /** @@ -2589,7 +2648,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, -1, 0); + __call_rcu(head, func, 0); } EXPORT_SYMBOL_GPL(call_rcu); @@ -2602,7 +2661,7 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, -1, 1); + __call_rcu(head, func, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -2622,7 +2681,7 @@ static int rcu_blocking_is_gp(void) { int ret; - if (IS_ENABLED(CONFIG_PREEMPT)) + if (IS_ENABLED(CONFIG_PREEMPTION)) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); @@ -2735,6 +2794,10 @@ static int rcu_pending(void) /* Check for CPU stalls, if enabled. */ check_cpu_stall(rdp); + /* Does this CPU need a deferred NOCB wakeup? */ + if (rcu_nocb_need_deferred_wakeup(rdp)) + return 1; + /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ if (rcu_nohz_full_cpu()) return 0; @@ -2750,6 +2813,8 @@ static int rcu_pending(void) /* Has RCU gone idle with this CPU needing another grace period? */ if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist) && + (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) || + !rcu_segcblist_is_offloaded(&rdp->cblist)) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; @@ -2758,10 +2823,6 @@ static int rcu_pending(void) unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; - /* Does this CPU need a deferred NOCB wakeup? */ - if (rcu_nocb_need_deferred_wakeup(rdp)) - return 1; - /* nothing to do */ return 0; } @@ -2801,6 +2862,8 @@ static void rcu_barrier_func(void *unused) rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); + rcu_nocb_lock(rdp); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { atomic_inc(&rcu_state.barrier_cpu_count); } else { @@ -2808,6 +2871,7 @@ static void rcu_barrier_func(void *unused) rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence); } + rcu_nocb_unlock(rdp); } /** @@ -2858,22 +2922,11 @@ void rcu_barrier(void) * corresponding CPU's preceding callbacks have been invoked. 
*/ for_each_possible_cpu(cpu) { - if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) - continue; rdp = per_cpu_ptr(&rcu_data, cpu); - if (rcu_is_nocb_cpu(cpu)) { - if (!rcu_nocb_cpu_needs_barrier(cpu)) { - rcu_barrier_trace(TPS("OfflineNoCB"), cpu, - rcu_state.barrier_sequence); - } else { - rcu_barrier_trace(TPS("OnlineNoCB"), cpu, - rcu_state.barrier_sequence); - smp_mb__before_atomic(); - atomic_inc(&rcu_state.barrier_cpu_count); - __call_rcu(&rdp->barrier_head, - rcu_barrier_callback, cpu, 0); - } - } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { + if (!cpu_online(cpu) && + !rcu_segcblist_is_offloaded(&rdp->cblist)) + continue; + if (rcu_segcblist_n_cbs(&rdp->cblist)) { rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); @@ -2958,7 +3011,8 @@ rcu_boot_init_percpu_data(int cpu) * Initializes a CPU's per-CPU RCU data. Note that only one online or * offline event can be happening at a given time. Note also that we can * accept some slop in the rsp->gp_seq access due to the fact that this - * CPU cannot possibly have any RCU callbacks in flight yet. + * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet. + * And any offloaded callbacks are being numbered elsewhere. */ int rcutree_prepare_cpu(unsigned int cpu) { @@ -2972,7 +3026,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->blimit = blimit; if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ - !init_nocb_callback_list(rdp)) + !rcu_segcblist_is_offloaded(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); @@ -3151,29 +3205,38 @@ void rcutree_migrate_callbacks(int cpu) { unsigned long flags; struct rcu_data *my_rdp; + struct rcu_node *my_rnp; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp_root = rcu_get_root(); bool needwake; - if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) + if (rcu_segcblist_is_offloaded(&rdp->cblist) || + rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ local_irq_save(flags); my_rdp = this_cpu_ptr(&rcu_data); - if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { - local_irq_restore(flags); - return; - } - raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + my_rnp = my_rdp->mynode; + rcu_nocb_lock(my_rdp); /* irqs already disabled. */ + WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies)); + raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */ /* Leverage recent GPs and set GP for new callbacks. */ - needwake = rcu_advance_cbs(rnp_root, rdp) || - rcu_advance_cbs(rnp_root, my_rdp); + needwake = rcu_advance_cbs(my_rnp, rdp) || + rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); + needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); + rcu_segcblist_disable(&rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); - raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) { + raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ + __call_rcu_nocb_wake(my_rdp, true, flags); + } else { + rcu_nocb_unlock(my_rdp); /* irqs remain disabled. 
*/ + raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags); + } if (needwake) rcu_gp_kthread_wake(); + lockdep_assert_irqs_enabled(); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", @@ -3234,13 +3297,13 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) return 0; - rnp = rcu_get_root(); - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rcu_state.gp_kthread = t; if (kthread_prio) { sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } + rnp = rcu_get_root(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_state.gp_kthread = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); rcu_spawn_nocb_kthreads(); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7acaf3a62d39..c612f306fe89 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -194,29 +194,38 @@ struct rcu_data { /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU - struct rcu_head *nocb_head; /* CBs waiting for kthread. */ - struct rcu_head **nocb_tail; - atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ - atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ - struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ - struct rcu_head **nocb_follower_tail; - struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ - struct task_struct *nocb_kthread; + struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ + struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ + atomic_t nocb_lock_contended; /* Contention experienced. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ - - /* The following fields are used by the leader, hence own cacheline. */ - struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; - /* CBs waiting for GP. */ - struct rcu_head **nocb_gp_tail; - bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ - struct rcu_data *nocb_next_follower; - /* Next follower in wakeup chain. */ - - /* The following fields are used by the follower, hence new cachline. */ - struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; - /* Leader CPU takes GP-end wakeups. */ + unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */ + + /* The following fields are used by call_rcu, hence own cacheline. */ + raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp; + struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */ + unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */ + unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */ + int nocb_nobypass_count; /* # ->cblist enqueues at ^^^ time. */ + + /* The following fields are used by GP kthread, hence own cacheline. */ + raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp; + struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */ + u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */ + u8 nocb_gp_bypass; /* Found a bypass on last scan? */ + u8 nocb_gp_gp; /* GP to wait for on last scan? */ + unsigned long nocb_gp_seq; /* If so, ->gp_seq to wait for. */ + unsigned long nocb_gp_loops; /* # passes through wait code. 
*/ + struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ + bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ + struct task_struct *nocb_cb_kthread; + struct rcu_data *nocb_next_cb_rdp; + /* Next rcu_data in wakeup chain. */ + + /* The following fields are used by CB kthread, hence new cacheline. */ + struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; + /* GP rdp takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 6) RCU priority boosting. */ @@ -419,25 +428,39 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); static void zero_cpu_stall_ticks(struct rcu_data *rdp); -static bool rcu_nocb_cpu_needs_barrier(int cpu); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy, unsigned long flags); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, - struct rcu_data *rdp, - unsigned long flags); +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j); +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags); +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); static void __init rcu_spawn_nocb_kthreads(void); +static void show_rcu_nocb_state(struct rcu_data *rdp); +static void rcu_nocb_lock(struct rcu_data *rdp); +static void rcu_nocb_unlock(struct rcu_data *rdp); +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags); +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ -static bool init_nocb_callback_list(struct rcu_data *rdp); -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp); +#define rcu_nocb_lock_irqsave(rdp, flags) \ +do { \ + if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \ + local_irq_save(flags); \ + else \ + raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \ +} while (0) +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags) +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ + static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index af7e7b9c86af..d632cd019597 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -781,7 +781,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * other hand, if the CPU is not in an RCU read-side critical section, * the IPI handler reports the quiescent state immediately. * - * Although this is a greate improvement over previous expedited + * Although this is a great improvement over previous expedited * implementations, it is still unfriendly to real-time workloads, so is * thus not recommended for any sort of common-case code. 
In fact, if * you are using synchronize_rcu_expedited() in a loop, please restructure @@ -792,6 +792,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { + bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); struct rcu_exp_work rew; struct rcu_node *rnp; unsigned long s; @@ -817,7 +818,7 @@ void synchronize_rcu_expedited(void) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ - if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { + if (unlikely(boottime)) { /* Direct call during scheduler init and early_initcalls(). */ rcu_exp_sel_wait_wake(s); } else { @@ -835,5 +836,8 @@ void synchronize_rcu_expedited(void) /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); + + if (likely(!boottime)) + destroy_work_on_stack(&rew.rew_work); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index acb225023ed1..2defc7fe74c3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -288,7 +288,6 @@ void rcu_note_context_switch(bool preempt) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp; - barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); @@ -314,15 +313,6 @@ void rcu_note_context_switch(bool preempt) ? rnp->gp_seq : rcu_seq_snap(&rnp->gp_seq)); rcu_preempt_ctxt_queue(rnp, rdp); - } else if (t->rcu_read_lock_nesting < 0 && - t->rcu_read_unlock_special.s) { - - /* - * Complete exit from RCU read-side critical section on - * behalf of preempted instance of __rcu_read_unlock(). - */ - rcu_read_unlock_special(t); - rcu_preempt_deferred_qs(t); } else { rcu_preempt_deferred_qs(t); } @@ -340,7 +330,6 @@ void rcu_note_context_switch(bool preempt) if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); trace_rcu_utilization(TPS("End context switch")); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -626,22 +615,18 @@ static void rcu_read_unlock_special(struct task_struct *t) (rdp->grpmask & rnp->expmask) || tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. - if ((exp || in_irq()) && irqs_were_disabled && use_softirq && - (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { + if (irqs_were_disabled && use_softirq && + (in_interrupt() || + (exp && !t->rcu_read_unlock_special.b.deferred_qs))) { // Using softirq, safe to awaken, and we get // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); - } else if (exp && irqs_were_disabled && !use_softirq && - !t->rcu_read_unlock_special.b.deferred_qs) { - // Safe to awaken and we get no help from enabling - // irqs, unlike bh/preempt. - invoke_rcu_core(); } else { // Enabling BH or preempt does reschedule, so... // Also if no expediting or NO_HZ_FULL, slow is OK. set_tsk_need_resched(current); set_preempt_need_resched(); - if (IS_ENABLED(CONFIG_IRQ_WORK) && + if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && !rdp->defer_qs_iw_pending && exp) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. @@ -828,11 +813,6 @@ static void rcu_qs(void) * dyntick-idle quiescent state visible to other CPUs, which will in * some cases serve for expedited as well as normal grace periods. 
* Either way, register a lightweight quiescent state. - * - * The barrier() calls are redundant in the common case when this is - * called externally, but just in case this is called from within this - * file. - * */ void rcu_all_qs(void) { @@ -847,14 +827,12 @@ void rcu_all_qs(void) return; } this_cpu_write(rcu_data.rcu_urgent_qs, false); - barrier(); /* Avoid RCU read-side critical sections leaking down. */ if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); } rcu_qs(); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ preempt_enable(); } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -864,7 +842,6 @@ EXPORT_SYMBOL_GPL(rcu_all_qs); */ void rcu_note_context_switch(bool preempt) { - barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_qs(); /* Load rcu_urgent_qs before other flags. */ @@ -877,7 +854,6 @@ void rcu_note_context_switch(bool preempt) rcu_tasks_qs(current); out: trace_rcu_utilization(TPS("End context switch")); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -1134,7 +1110,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * already exist. We only create this kthread for preemptible RCU. * Returns zero if all is well, a negated errno otherwise. */ -static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) +static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { int rnp_index = rnp - rcu_get_root(); unsigned long flags; @@ -1142,25 +1118,27 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct task_struct *t; if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) - return 0; + return; if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) - return 0; + return; rcu_state.boost = 1; + if (rnp->boost_kthread_task != NULL) - return 0; + return; + t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); + if (WARN_ON_ONCE(IS_ERR(t))) + return; + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - return 0; } /* @@ -1201,7 +1179,7 @@ static void __init rcu_spawn_boost_kthreads(void) struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) - (void)rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_one_boost_kthread(rnp); } static void rcu_prepare_kthreads(int cpu) @@ -1211,7 +1189,7 @@ static void rcu_prepare_kthreads(int cpu) /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ if (rcu_scheduler_fully_active) - (void)rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_one_boost_kthread(rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1248,10 +1226,10 @@ static void rcu_prepare_kthreads(int cpu) #if !defined(CONFIG_RCU_FAST_NO_HZ) /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. + * Check to see if any future non-offloaded RCU-related work will need + * to be done by the current CPU, even if none need be done immediately, + * returning 1 if so. This function is part of the RCU implementation; + * it is -not- an exported member of the RCU API. 
* * Because we not have RCU_FAST_NO_HZ, just check whether or not this * CPU has RCU callbacks queued. @@ -1259,7 +1237,8 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist); + return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && + !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist); } /* @@ -1360,8 +1339,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) lockdep_assert_irqs_disabled(); - /* If no callbacks, RCU doesn't need the CPU. */ - if (rcu_segcblist_empty(&rdp->cblist)) { + /* If no non-offloaded callbacks, RCU doesn't need the CPU. */ + if (rcu_segcblist_empty(&rdp->cblist) || + rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) { *nextevt = KTIME_MAX; return 0; } @@ -1404,7 +1384,7 @@ static void rcu_prepare_for_idle(void) int tne; lockdep_assert_irqs_disabled(); - if (rcu_is_nocb_cpu(smp_processor_id())) + if (rcu_segcblist_is_offloaded(&rdp->cblist)) return; /* Handle nohz enablement switches conservatively. */ @@ -1453,8 +1433,10 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + lockdep_assert_irqs_disabled(); - if (rcu_is_nocb_cpu(smp_processor_id())) + if (rcu_segcblist_is_offloaded(&rdp->cblist)) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1469,10 +1451,10 @@ static void rcu_cleanup_after_idle(void) * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads * created that pull the callbacks from the corresponding CPU, wait for * a grace period to elapse, and invoke the callbacks. These kthreads - * are organized into leaders, which manage incoming callbacks, wait for - * grace periods, and awaken followers, and the followers, which only - * invoke callbacks. Each leader is its own follower. The no-CBs CPUs - * do a wake_up() on their kthread when they insert a callback into any + * are organized into GP kthreads, which manage incoming callbacks, wait for + * grace periods, and awaken CB kthreads, and the CB kthreads, which only + * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs + * do a wake_up() on their GP kthread when they insert a callback into any * empty list, unless the rcu_nocb_poll boot parameter has been specified, * in which case each kthread actively polls its CPU. (Which isn't so great * for energy efficiency, but which does reduce RCU's overhead on that CPU.) @@ -1515,6 +1497,116 @@ static int __init parse_rcu_nocb_poll(char *arg) early_param("rcu_nocb_poll", parse_rcu_nocb_poll); /* + * Don't bother bypassing ->cblist if the call_rcu() rate is low. + * After all, the main point of bypassing is to avoid lock contention + * on ->nocb_lock, which only can happen at high call_rcu() rates. + */ +int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; +module_param(nocb_nobypass_lim_per_jiffy, int, 0); + +/* + * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the + * lock isn't immediately available, increment ->nocb_lock_contended to + * flag the contention. + */ +static void rcu_nocb_bypass_lock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (raw_spin_trylock(&rdp->nocb_bypass_lock)) + return; + atomic_inc(&rdp->nocb_lock_contended); + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); + smp_mb__after_atomic(); /* atomic_inc() before lock. */ + raw_spin_lock(&rdp->nocb_bypass_lock); + smp_mb__before_atomic(); /* atomic_dec() after lock. 
*/ + atomic_dec(&rdp->nocb_lock_contended); +} + +/* + * Spinwait until the specified rcu_data structure's ->nocb_lock is + * not contended. Please note that this is extremely special-purpose, + * relying on the fact that at most two kthreads and one CPU contend for + * this lock, and also that the two kthreads are guaranteed to have frequent + * grace-period-duration time intervals between successive acquisitions + * of the lock. This allows us to use an extremely simple throttling + * mechanism, and further to apply it only to the CPU doing floods of + * call_rcu() invocations. Don't try this at home! + */ +static void rcu_nocb_wait_contended(struct rcu_data *rdp) +{ + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); + while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) + cpu_relax(); +} + +/* + * Conditionally acquire the specified rcu_data structure's + * ->nocb_bypass_lock. + */ +static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + return raw_spin_trylock(&rdp->nocb_bypass_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_bypass_lock. + */ +static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_bypass_lock); +} + +/* + * Acquire the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + return; + raw_spin_lock(&rdp->nocb_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ + if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_lock); + } +} + +/* + * Release the specified rcu_data structure's ->nocb_lock and restore + * interrupts, but only if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + } else { + local_irq_restore(flags); + } +} + +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (rcu_segcblist_is_offloaded(&rdp->cblist) && + cpu_online(rdp->cpu)) + lockdep_assert_held(&rdp->nocb_lock); +} + +/* * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. */ @@ -1543,440 +1635,514 @@ bool rcu_is_nocb_cpu(int cpu) } /* - * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock + * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock * and this function releases it. 
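rcu_nocb_bypass_lock() above takes a trylock first and only records contention, via ->nocb_lock_contended, when it has to fall back to the blocking acquisition; rcu_nocb_wait_contended() then spins until that counter drains before the flooding CPU piles on again. The same shape in user space, with a pthread mutex and sched_yield() standing in for the raw spinlock and cpu_relax(), and the kernel's explicit barriers omitted:

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>

static pthread_mutex_t bypass_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int bypass_contended;

/* Take the lock, flagging the (expected-rare) contended case. */
static void bypass_lock_acquire(void)
{
	if (pthread_mutex_trylock(&bypass_lock) == 0)
		return;				/* Fast path: no contention. */
	atomic_fetch_add(&bypass_contended, 1);
	pthread_mutex_lock(&bypass_lock);
	atomic_fetch_sub(&bypass_contended, 1);
}

static void bypass_lock_release(void)
{
	pthread_mutex_unlock(&bypass_lock);
}

/* Back off until nobody is recorded as fighting over the lock. */
static void wait_until_uncontended(void)
{
	while (atomic_load(&bypass_contended))
		sched_yield();
}

int main(void)
{
	wait_until_uncontended();
	bypass_lock_acquire();
	/* ... flood-time work on the bypass list would go here ... */
	bypass_lock_release();
	return 0;
}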
*/ -static void __wake_nocb_leader(struct rcu_data *rdp, bool force, - unsigned long flags) +static void wake_nocb_gp(struct rcu_data *rdp, bool force, + unsigned long flags) __releases(rdp->nocb_lock) { - struct rcu_data *rdp_leader = rdp->nocb_leader; + bool needwake = false; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; lockdep_assert_held(&rdp->nocb_lock); - if (!READ_ONCE(rdp_leader->nocb_kthread)) { - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("AlreadyAwake")); + rcu_nocb_unlock_irqrestore(rdp, flags); return; } - if (rdp_leader->nocb_leader_sleep || force) { - /* Prior smp_mb__after_atomic() orders against prior enqueue. */ - WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); - del_timer(&rdp->nocb_timer); - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */ - swake_up_one(&rdp_leader->nocb_wq); - } else { - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + del_timer(&rdp->nocb_timer); + rcu_nocb_unlock_irqrestore(rdp, flags); + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { + WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); + needwake = true; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); } + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + if (needwake) + wake_up_process(rdp_gp->nocb_gp_kthread); } /* - * Kick the leader kthread for this NOCB group, but caller has not - * acquired locks. + * Arrange to wake the GP kthread for this NOCB group at some future + * time when it is safe to do so. */ -static void wake_nocb_leader(struct rcu_data *rdp, bool force) +static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, + const char *reason) { - unsigned long flags; + if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) + mod_timer(&rdp->nocb_timer, jiffies + 1); + if (rdp->nocb_defer_wakeup < waketype) + WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + struct rcu_cblist rcl; - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - __wake_nocb_leader(rdp, force, flags); + WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist)); + rcu_lockdep_assert_cblist_protected(rdp); + lockdep_assert_held(&rdp->nocb_bypass_lock); + if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { + raw_spin_unlock(&rdp->nocb_bypass_lock); + return false; + } + /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ + if (rhp) + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); + WRITE_ONCE(rdp->nocb_bypass_first, j); + rcu_nocb_bypass_unlock(rdp); + return true; } /* - * Arrange to wake the leader kthread for this NOCB group at some - * future time when it is safe to do so. + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. 
+ * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. */ -static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, - const char *reason) +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) { - unsigned long flags; + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + return true; + rcu_lockdep_assert_cblist_protected(rdp); + rcu_nocb_bypass_lock(rdp); + return rcu_nocb_do_flush_bypass(rdp, rhp, j); +} - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) - mod_timer(&rdp->nocb_timer, jiffies + 1); - WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +/* + * If the ->nocb_bypass_lock is immediately available, flush the + * ->nocb_bypass queue into ->cblist. + */ +static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) +{ + rcu_lockdep_assert_cblist_protected(rdp); + if (!rcu_segcblist_is_offloaded(&rdp->cblist) || + !rcu_nocb_bypass_trylock(rdp)) + return; + WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); } -/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */ -static bool rcu_nocb_cpu_needs_barrier(int cpu) +/* + * See whether it is appropriate to use the ->nocb_bypass list in order + * to control contention on ->nocb_lock. A limited number of direct + * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass + * is non-empty, further callbacks must be placed into ->nocb_bypass, + * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch + * back to direct use of ->cblist. However, ->nocb_bypass should not be + * used if ->cblist is empty, because otherwise callbacks can be stranded + * on ->nocb_bypass because we cannot count on the current CPU ever again + * invoking call_rcu(). The general rule is that if ->nocb_bypass is + * non-empty, the corresponding no-CBs grace-period kthread must not be + * in an indefinite sleep state. + * + * Finally, it is not permitted to use the bypass during early boot, + * as doing so would confuse the auto-initialization code. Besides + * which, there is no point in worrying about lock contention while + * there is only one CPU in operation. + */ +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags) { - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - unsigned long ret; -#ifdef CONFIG_PROVE_RCU - struct rcu_head *rhp; -#endif /* #ifdef CONFIG_PROVE_RCU */ + unsigned long c; + unsigned long cur_gp_seq; + unsigned long j = jiffies; + long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - /* - * Check count of all no-CBs callbacks awaiting invocation. - * There needs to be a barrier before this function is called, - * but associated with a prior determination that no more - * callbacks would be posted. In the worst case, the first - * barrier in rcu_barrier() suffices (but the caller cannot - * necessarily rely on this, not a substitute for the caller - * getting the concurrency design right!). There must also be a - * barrier between the following load and posting of a callback - * (if a callback is in fact needed). This is associated with an - * atomic_inc() in the caller. 
- */ - ret = rcu_get_n_cbs_nocb_cpu(rdp); - -#ifdef CONFIG_PROVE_RCU - rhp = READ_ONCE(rdp->nocb_head); - if (!rhp) - rhp = READ_ONCE(rdp->nocb_gp_head); - if (!rhp) - rhp = READ_ONCE(rdp->nocb_follower_head); - - /* Having no rcuo kthread but CBs after scheduler starts is bad! */ - if (!READ_ONCE(rdp->nocb_kthread) && rhp && - rcu_scheduler_fully_active) { - /* RCU callback enqueued before CPU first came online??? */ - pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", - cpu, rhp->func); - WARN_ON_ONCE(1); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; /* Not offloaded, no bypassing. */ + } + lockdep_assert_irqs_disabled(); + + // Don't use ->nocb_bypass during early boot. + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { + rcu_nocb_lock(rdp); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // If we have advanced to a new jiffy, reset counts to allow + // moving back from ->nocb_bypass to ->cblist. + if (j == rdp->nocb_nobypass_last) { + c = rdp->nocb_nobypass_count + 1; + } else { + WRITE_ONCE(rdp->nocb_nobypass_last, j); + c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; + if (ULONG_CMP_LT(rdp->nocb_nobypass_count, + nocb_nobypass_lim_per_jiffy)) + c = 0; + else if (c > nocb_nobypass_lim_per_jiffy) + c = nocb_nobypass_lim_per_jiffy; + } + WRITE_ONCE(rdp->nocb_nobypass_count, c); + + // If there hasn't yet been all that many ->cblist enqueues + // this jiffy, tell the caller to enqueue onto ->cblist. But flush + // ->nocb_bypass first. + if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { + rcu_nocb_lock(rdp); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + + // If ->nocb_bypass has been used too long or is too full, + // flush ->nocb_bypass to ->cblist. + if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || + ncbs >= qhimark) { + rcu_nocb_lock(rdp); + if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + if (j != rdp->nocb_gp_adv_time && + rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { + rcu_advance_cbs_nowake(rdp->mynode, rdp); + rdp->nocb_gp_adv_time = j; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + return true; // Callback already enqueued. } -#endif /* #ifdef CONFIG_PROVE_RCU */ - return !!ret; + // We need to use the bypass. + rcu_nocb_wait_contended(rdp); + rcu_nocb_bypass_lock(rdp); + ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); + if (!ncbs) { + WRITE_ONCE(rdp->nocb_bypass_first, j); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); + } + rcu_nocb_bypass_unlock(rdp); + smp_mb(); /* Order enqueue before wake. */ + if (ncbs) { + local_irq_restore(flags); + } else { + // No-CBs GP kthread might be indefinitely asleep, if so, wake. + rcu_nocb_lock(rdp); // Rare during call_rcu() flood. 
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstBQwake")); + __call_rcu_nocb_wake(rdp, true, flags); + } else { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstBQnoWake")); + rcu_nocb_unlock_irqrestore(rdp, flags); + } + } + return true; // Callback already enqueued. } /* - * Enqueue the specified string of rcu_head structures onto the specified - * CPU's no-CBs lists. The CPU is specified by rdp, the head of the - * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy - * counts are supplied by rhcount and rhcount_lazy. + * Awaken the no-CBs grace-period kthead if needed, either due to it + * legitimately being asleep or due to overload conditions. * * If warranted, also wake up the kthread servicing this CPUs queues. */ -static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, - struct rcu_head *rhp, - struct rcu_head **rhtp, - int rhcount, int rhcount_lazy, - unsigned long flags) +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, + unsigned long flags) + __releases(rdp->nocb_lock) { - int len; - struct rcu_head **old_rhpp; + unsigned long cur_gp_seq; + unsigned long j; + long len; struct task_struct *t; - /* Enqueue the callback on the nocb list and update counts. */ - atomic_long_add(rhcount, &rdp->nocb_q_count); - /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ - old_rhpp = xchg(&rdp->nocb_tail, rhtp); - WRITE_ONCE(*old_rhpp, rhp); - atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); - smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ - - /* If we are not being polled and there is a kthread, awaken it ... */ - t = READ_ONCE(rdp->nocb_kthread); + // If we are being polled or there is no kthread, just leave. + t = READ_ONCE(rdp->nocb_gp_kthread); if (rcu_nocb_poll || !t) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); + rcu_nocb_unlock_irqrestore(rdp, flags); return; } - len = rcu_get_n_cbs_nocb_cpu(rdp); - if (old_rhpp == &rdp->nocb_head) { + // Need to actually to a wakeup. + len = rcu_segcblist_n_cbs(&rdp->cblist); + if (was_alldone) { + rdp->qlen_last_fqs_check = len; if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ - wake_nocb_leader(rdp, false); + wake_nocb_gp(rdp, false, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { - wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, - TPS("WakeEmptyIsDeferred")); + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeEmptyIsDeferred")); + rcu_nocb_unlock_irqrestore(rdp, flags); } - rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { /* ... or if many callbacks queued. */ - if (!irqs_disabled_flags(flags)) { - wake_nocb_leader(rdp, true); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeOvf")); - } else { - wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, - TPS("WakeOvfIsDeferred")); + rdp->qlen_last_fqs_check = len; + j = jiffies; + if (j != rdp->nocb_gp_adv_time && + rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { + rcu_advance_cbs_nowake(rdp->mynode, rdp); + rdp->nocb_gp_adv_time = j; } - rdp->qlen_last_fqs_check = LONG_MAX / 2; + smp_mb(); /* Enqueue before timer_pending(). 
*/ + if ((rdp->nocb_cb_sleep || + !rcu_segcblist_ready_cbs(&rdp->cblist)) && + !timer_pending(&rdp->nocb_bypass_timer)) + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, + TPS("WakeOvfIsDeferred")); + rcu_nocb_unlock_irqrestore(rdp, flags); } else { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + rcu_nocb_unlock_irqrestore(rdp, flags); } return; } -/* - * This is a helper for __call_rcu(), which invokes this when the normal - * callback queue is inoperable. If this is not a no-CBs CPU, this - * function returns failure back to __call_rcu(), which can complain - * appropriately. - * - * Otherwise, this function queues the callback where the corresponding - * "rcuo" kthread can find it. - */ -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy, unsigned long flags) +/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */ +static void do_nocb_bypass_wakeup_timer(struct timer_list *t) { + unsigned long flags; + struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer); - if (!rcu_is_nocb_cpu(rdp->cpu)) - return false; - __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); - if (__is_kfree_rcu_offset((unsigned long)rhp->func)) - trace_rcu_kfree_callback(rcu_state.name, rhp, - (unsigned long)rhp->func, - -atomic_long_read(&rdp->nocb_q_count_lazy), - -rcu_get_n_cbs_nocb_cpu(rdp)); - else - trace_rcu_callback(rcu_state.name, rhp, - -atomic_long_read(&rdp->nocb_q_count_lazy), - -rcu_get_n_cbs_nocb_cpu(rdp)); - - /* - * If called from an extended quiescent state with interrupts - * disabled, invoke the RCU core in order to allow the idle-entry - * deferred-wakeup check to function. - */ - if (irqs_disabled_flags(flags) && - !rcu_is_watching() && - cpu_online(smp_processor_id())) - invoke_rcu_core(); - - return true; -} - -/* - * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is - * not a no-CBs CPU. - */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, - struct rcu_data *rdp, - unsigned long flags) -{ - lockdep_assert_irqs_disabled(); - if (!rcu_is_nocb_cpu(smp_processor_id())) - return false; /* Not NOCBs CPU, caller must migrate CBs. */ - __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), - rcu_segcblist_tail(&rdp->cblist), - rcu_segcblist_n_cbs(&rdp->cblist), - rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); - rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_disable(&rdp->cblist); - return true; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); + rcu_nocb_lock_irqsave(rdp, flags); + smp_mb__after_spinlock(); /* Timer expire before wakeup. */ + __call_rcu_nocb_wake(rdp, true, flags); } /* - * If necessary, kick off a new grace period, and either way wait - * for a subsequent grace period to complete. + * No-CBs GP kthreads come here to wait for additional callbacks to show up + * or for grace periods to end. */ -static void rcu_nocb_wait_gp(struct rcu_data *rdp) +static void nocb_gp_wait(struct rcu_data *my_rdp) { - unsigned long c; - bool d; + bool bypass = false; + long bypass_ncbs; + int __maybe_unused cpu = my_rdp->cpu; + unsigned long cur_gp_seq; unsigned long flags; + bool gotcbs; + unsigned long j = jiffies; + bool needwait_gp = false; // This prevents actual uninitialized use. bool needwake; - struct rcu_node *rnp = rdp->mynode; + bool needwake_gp; + struct rcu_data *rdp; + struct rcu_node *rnp; + unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. 
- local_irq_save(flags); - c = rcu_seq_snap(&rcu_state.gp_seq); - if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { - local_irq_restore(flags); - } else { - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_start_this_gp(rnp, rdp, c); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (needwake) + /* + * Each pass through the following loop checks for CBs and for the + * nearest grace period (if any) to wait for next. The CB kthreads + * and the global grace-period kthread are awakened if needed. + */ + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); + rcu_nocb_lock_irqsave(rdp, flags); + bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + if (bypass_ncbs && + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || + bypass_ncbs > 2 * qhimark)) { + // Bypass full or old, so flush it. + (void)rcu_nocb_try_flush_bypass(rdp, j); + bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + continue; /* No callbacks here, try next. */ + } + if (bypass_ncbs) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("Bypass")); + bypass = true; + } + rnp = rdp->mynode; + if (bypass) { // Avoid race with first bypass CB. + WRITE_ONCE(my_rdp->nocb_defer_wakeup, + RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + } + // Advance callbacks if helpful and low contention. + needwake_gp = false; + if (!rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL) || + (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { + raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ + needwake_gp = rcu_advance_cbs(rnp, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ + } + // Need to wait on some grace period? + WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL)); + if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { + if (!needwait_gp || + ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) + wait_gp_seq = cur_gp_seq; + needwait_gp = true; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("NeedWaitGP")); + } + if (rcu_segcblist_ready_cbs(&rdp->cblist)) { + needwake = rdp->nocb_cb_sleep; + WRITE_ONCE(rdp->nocb_cb_sleep, false); + smp_mb(); /* CB invocation -after- GP end. */ + } else { + needwake = false; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake) { + swake_up_one(&rdp->nocb_cb_wq); + gotcbs = true; + } + if (needwake_gp) rcu_gp_kthread_wake(); } - /* - * Wait for the grace period. Do so interruptibly to avoid messing - * up the load average. - */ - trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); - for (;;) { + my_rdp->nocb_gp_bypass = bypass; + my_rdp->nocb_gp_gp = needwait_gp; + my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; + if (bypass && !rcu_nocb_poll) { + // At least one child with non-empty ->nocb_bypass, so set + // timer in order to avoid stranding its callbacks. + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + mod_timer(&my_rdp->nocb_bypass_timer, j + 2); + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } + if (rcu_nocb_poll) { + /* Polling, so trace if first poll in the series. */ + if (gotcbs) + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); + schedule_timeout_interruptible(1); + } else if (!needwait_gp) { + /* Wait for callbacks to appear. 
*/ + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); + swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); + } else { + rnp = my_rdp->mynode; + trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); swait_event_interruptible_exclusive( - rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], - (d = rcu_seq_done(&rnp->gp_seq, c))); - if (likely(d)) - break; - WARN_ON(signal_pending(current)); - trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); + rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1], + rcu_seq_done(&rnp->gp_seq, wait_gp_seq) || + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); } - trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); - smp_mb(); /* Ensure that CB invocation happens after GP end. */ + if (!rcu_nocb_poll) { + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + if (bypass) + del_timer(&my_rdp->nocb_bypass_timer); + WRITE_ONCE(my_rdp->nocb_gp_sleep, true); + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } + my_rdp->nocb_gp_seq = -1; + WARN_ON(signal_pending(current)); } /* - * Leaders come here to wait for additional callbacks to show up. - * This function does not return until callbacks appear. + * No-CBs grace-period-wait kthread. There is one of these per group + * of CPUs, but only once at least one CPU in that group has come online + * at least once since boot. This kthread checks for newly posted + * callbacks from any of the CPUs it is responsible for, waits for a + * grace period, then awakens all of the rcu_nocb_cb_kthread() instances + * that then have callback-invocation work to do. */ -static void nocb_leader_wait(struct rcu_data *my_rdp) +static int rcu_nocb_gp_kthread(void *arg) { - bool firsttime = true; - unsigned long flags; - bool gotcbs; - struct rcu_data *rdp; - struct rcu_head **tail; - -wait_again: - - /* Wait for callbacks to appear. */ - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep")); - swait_event_interruptible_exclusive(my_rdp->nocb_wq, - !READ_ONCE(my_rdp->nocb_leader_sleep)); - raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); - my_rdp->nocb_leader_sleep = true; - WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); - raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); - } else if (firsttime) { - firsttime = false; /* Don't drown trace log with "Poll"! */ - trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll")); - } - - /* - * Each pass through the following loop checks a follower for CBs. - * We are our own first follower. Any CBs found are moved to - * nocb_gp_head, where they await a grace period. - */ - gotcbs = false; - smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); - if (!rdp->nocb_gp_head) - continue; /* No CBs here, try next follower. */ - - /* Move callbacks to wait-for-GP list, which is empty. */ - WRITE_ONCE(rdp->nocb_head, NULL); - rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - gotcbs = true; - } - - /* No callbacks? Sleep a bit if polling, and go retry. 
*/ - if (unlikely(!gotcbs)) { - WARN_ON(signal_pending(current)); - if (rcu_nocb_poll) { - schedule_timeout_interruptible(1); - } else { - trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, - TPS("WokeEmpty")); - } - goto wait_again; - } + struct rcu_data *rdp = arg; - /* Wait for one grace period. */ - rcu_nocb_wait_gp(my_rdp); - - /* Each pass through the following loop wakes a follower, if needed. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - if (!rcu_nocb_poll && - READ_ONCE(rdp->nocb_head) && - READ_ONCE(my_rdp->nocb_leader_sleep)) { - raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); - my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ - raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); - } - if (!rdp->nocb_gp_head) - continue; /* No CBs, so no need to wake follower. */ - - /* Append callbacks to follower's "done" list. */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - tail = rdp->nocb_follower_tail; - rdp->nocb_follower_tail = rdp->nocb_gp_tail; - *tail = rdp->nocb_gp_head; - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { - /* List was empty, so wake up the follower. */ - swake_up_one(&rdp->nocb_wq); - } + for (;;) { + WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1); + nocb_gp_wait(rdp); + cond_resched_tasks_rcu_qs(); } - - /* If we (the leader) don't have CBs, go wait some more. */ - if (!my_rdp->nocb_follower_head) - goto wait_again; + return 0; } /* - * Followers come here to wait for additional callbacks to show up. - * This function does not return until callbacks appear. + * Invoke any ready callbacks from the corresponding no-CBs CPU, + * then, if there are no more, wait for more to appear. */ -static void nocb_follower_wait(struct rcu_data *rdp) +static void nocb_cb_wait(struct rcu_data *rdp) { - for (;;) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); - swait_event_interruptible_exclusive(rdp->nocb_wq, - READ_ONCE(rdp->nocb_follower_head)); - if (smp_load_acquire(&rdp->nocb_follower_head)) { - /* ^^^ Ensure CB invocation follows _head test. */ - return; - } - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + unsigned long cur_gp_seq; + unsigned long flags; + bool needwake_gp = false; + struct rcu_node *rnp = rdp->mynode; + + local_irq_save(flags); + rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + local_bh_disable(); + rcu_do_batch(rdp); + local_bh_enable(); + lockdep_assert_irqs_enabled(); + rcu_nocb_lock_irqsave(rdp, flags); + if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && + raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ + needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + } + if (rcu_segcblist_ready_cbs(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_gp) + rcu_gp_kthread_wake(); + return; + } + + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); + WRITE_ONCE(rdp->nocb_cb_sleep, true); + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_gp) + rcu_gp_kthread_wake(); + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + !READ_ONCE(rdp->nocb_cb_sleep)); + if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */ + /* ^^^ Ensure CB invocation follows _sleep test. 
*/ + return; } + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); } /* - * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes - * callbacks queued by the corresponding no-CBs CPU, however, there is - * an optional leader-follower relationship so that the grace-period - * kthreads don't have to do quite so many wakeups. + * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke + * nocb_cb_wait() to do the dirty work. */ -static int rcu_nocb_kthread(void *arg) +static int rcu_nocb_cb_kthread(void *arg) { - int c, cl; - unsigned long flags; - struct rcu_head *list; - struct rcu_head *next; - struct rcu_head **tail; struct rcu_data *rdp = arg; - /* Each pass through this loop invokes one batch of callbacks */ + // Each pass through this loop does one callback batch, and, + // if there are no more ready callbacks, waits for them. for (;;) { - /* Wait for callbacks. */ - if (rdp->nocb_leader == rdp) - nocb_leader_wait(rdp); - else - nocb_follower_wait(rdp); - - /* Pull the ready-to-invoke callbacks onto local list. */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - list = rdp->nocb_follower_head; - rdp->nocb_follower_head = NULL; - tail = rdp->nocb_follower_tail; - rdp->nocb_follower_tail = &rdp->nocb_follower_head; - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - if (WARN_ON_ONCE(!list)) - continue; - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); - - /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rcu_state.name, - atomic_long_read(&rdp->nocb_q_count_lazy), - rcu_get_n_cbs_nocb_cpu(rdp), -1); - c = cl = 0; - while (list) { - next = list->next; - /* Wait for enqueuing to complete, if needed. */ - while (next == NULL && &list->next != tail) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WaitQueue")); - schedule_timeout_interruptible(1); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WokeQueue")); - next = list->next; - } - debug_rcu_head_unqueue(list); - local_bh_disable(); - if (__rcu_reclaim(rcu_state.name, list)) - cl++; - c++; - local_bh_enable(); - cond_resched_tasks_rcu_qs(); - list = next; - } - trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1); - smp_mb__before_atomic(); /* _add after CB invocation. 
*/ - atomic_long_add(-c, &rdp->nocb_q_count); - atomic_long_add(-cl, &rdp->nocb_q_count_lazy); + nocb_cb_wait(rdp); + cond_resched_tasks_rcu_qs(); } return 0; } @@ -1993,14 +2159,14 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) unsigned long flags; int ndw; - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + rcu_nocb_lock_irqsave(rdp, flags); if (!rcu_nocb_need_deferred_wakeup(rdp)) { - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + rcu_nocb_unlock_irqrestore(rdp, flags); return; } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); } @@ -2027,6 +2193,7 @@ void __init rcu_init_nohz(void) { int cpu; bool need_rcu_nocb_mask = false; + struct rcu_data *rdp; #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) @@ -2060,67 +2227,63 @@ void __init rcu_init_nohz(void) if (rcu_nocb_poll) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - for_each_cpu(cpu, rcu_nocb_mask) - init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu)); + for_each_cpu(cpu, rcu_nocb_mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rcu_segcblist_empty(&rdp->cblist)) + rcu_segcblist_init(&rdp->cblist); + rcu_segcblist_offload(&rdp->cblist); + } rcu_organize_nocb_kthreads(); } /* Initialize per-rcu_data variables for no-CBs CPUs. */ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { - rdp->nocb_tail = &rdp->nocb_head; - init_swait_queue_head(&rdp->nocb_wq); - rdp->nocb_follower_tail = &rdp->nocb_follower_head; + init_swait_queue_head(&rdp->nocb_cb_wq); + init_swait_queue_head(&rdp->nocb_gp_wq); raw_spin_lock_init(&rdp->nocb_lock); + raw_spin_lock_init(&rdp->nocb_bypass_lock); + raw_spin_lock_init(&rdp->nocb_gp_lock); timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); + timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0); + rcu_cblist_init(&rdp->nocb_bypass); } /* * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread, spawn it. If the CPUs are brought online out of order, - * this can require re-organizing the leader-follower relationships. + * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread + * for this CPU's group has not yet been created, spawn it as well. */ static void rcu_spawn_one_nocb_kthread(int cpu) { - struct rcu_data *rdp; - struct rcu_data *rdp_last; - struct rcu_data *rdp_old_leader; - struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp_gp; struct task_struct *t; /* * If this isn't a no-CBs CPU or if it already has an rcuo kthread, * then nothing to do. */ - if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) + if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) return; - /* If we didn't spawn the leader first, reorganize! 
*/ - rdp_old_leader = rdp_spawn->nocb_leader; - if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { - rdp_last = NULL; - rdp = rdp_old_leader; - do { - rdp->nocb_leader = rdp_spawn; - if (rdp_last && rdp != rdp_spawn) - rdp_last->nocb_next_follower = rdp; - if (rdp == rdp_spawn) { - rdp = rdp->nocb_next_follower; - } else { - rdp_last = rdp; - rdp = rdp->nocb_next_follower; - rdp_last->nocb_next_follower = NULL; - } - } while (rdp); - rdp_spawn->nocb_next_follower = rdp_old_leader; + /* If we didn't spawn the GP kthread first, reorganize! */ + rdp_gp = rdp->nocb_gp_rdp; + if (!rdp_gp->nocb_gp_kthread) { + t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, + "rcuog/%d", rdp_gp->cpu); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) + return; + WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); } /* Spawn the kthread for this CPU. */ - t = kthread_run(rcu_nocb_kthread, rdp_spawn, + t = kthread_run(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) return; - WRITE_ONCE(rdp_spawn->nocb_kthread, t); + WRITE_ONCE(rdp->nocb_cb_kthread, t); + WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); } /* @@ -2147,27 +2310,28 @@ static void __init rcu_spawn_nocb_kthreads(void) rcu_spawn_cpu_nocb_kthread(cpu); } -/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ -static int rcu_nocb_leader_stride = -1; -module_param(rcu_nocb_leader_stride, int, 0444); +/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ +static int rcu_nocb_gp_stride = -1; +module_param(rcu_nocb_gp_stride, int, 0444); /* - * Initialize leader-follower relationships for all no-CBs CPU. + * Initialize GP-CB relationships for all no-CBs CPU. */ static void __init rcu_organize_nocb_kthreads(void) { int cpu; - int ls = rcu_nocb_leader_stride; - int nl = 0; /* Next leader. */ + bool firsttime = true; + int ls = rcu_nocb_gp_stride; + int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; - struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ + struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ struct rcu_data *rdp_prev = NULL; if (!cpumask_available(rcu_nocb_mask)) return; if (ls == -1) { - ls = int_sqrt(nr_cpu_ids); - rcu_nocb_leader_stride = ls; + ls = nr_cpu_ids / int_sqrt(nr_cpu_ids); + rcu_nocb_gp_stride = ls; } /* @@ -2178,39 +2342,24 @@ static void __init rcu_organize_nocb_kthreads(void) for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu >= nl) { - /* New leader, set up for followers & next leader. */ + /* New GP kthread, set up for CBs & next GP. */ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; - rdp->nocb_leader = rdp; - rdp_leader = rdp; + rdp->nocb_gp_rdp = rdp; + rdp_gp = rdp; + if (!firsttime && dump_tree) + pr_cont("\n"); + firsttime = false; + pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu); } else { - /* Another follower, link to previous leader. */ - rdp->nocb_leader = rdp_leader; - rdp_prev->nocb_next_follower = rdp; + /* Another CB kthread, link to previous GP kthread. 
*/ + rdp->nocb_gp_rdp = rdp_gp; + rdp_prev->nocb_next_cb_rdp = rdp; + pr_alert(" %d", cpu); } rdp_prev = rdp; } } -/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ -static bool init_nocb_callback_list(struct rcu_data *rdp) -{ - if (!rcu_is_nocb_cpu(rdp->cpu)) - return false; - - /* If there are early-boot callbacks, move them to nocb lists. */ - if (!rcu_segcblist_empty(&rdp->cblist)) { - rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); - rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); - atomic_long_set(&rdp->nocb_q_count, - rcu_segcblist_n_cbs(&rdp->cblist)); - atomic_long_set(&rdp->nocb_q_count_lazy, - rcu_segcblist_n_lazy_cbs(&rdp->cblist)); - rcu_segcblist_init(&rdp->cblist); - } - rcu_segcblist_disable(&rdp->cblist); - return true; -} - /* * Bind the current task to the offloaded CPUs. If there are no offloaded * CPUs, leave the task unbound. Splat if the bind attempt fails. @@ -2223,20 +2372,101 @@ void rcu_bind_current_to_nocb(void) EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); /* - * Return the number of RCU callbacks still queued from the specified - * CPU, which must be a nocbs CPU. + * Dump out nocb grace-period kthread state for the specified rcu_data + * structure. */ -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) +static void show_rcu_nocb_gp_state(struct rcu_data *rdp) { - return atomic_long_read(&rdp->nocb_q_count); + struct rcu_node *rnp = rdp->mynode; + + pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n", + rdp->cpu, + "kK"[!!rdp->nocb_gp_kthread], + "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], + "dD"[!!rdp->nocb_defer_wakeup], + "tT"[timer_pending(&rdp->nocb_timer)], + "bB"[timer_pending(&rdp->nocb_bypass_timer)], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[swait_active(&rdp->nocb_gp_wq)], + ".W"[swait_active(&rnp->nocb_gp_wq[0])], + ".W"[swait_active(&rnp->nocb_gp_wq[1])], + ".B"[!!rdp->nocb_gp_bypass], + ".G"[!!rdp->nocb_gp_gp], + (long)rdp->nocb_gp_seq, + rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops)); +} + +/* Dump out nocb kthread state for the specified rcu_data structure. */ +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ + struct rcu_segcblist *rsclp = &rdp->cblist; + bool waslocked; + bool wastimer; + bool wassleep; + + if (rdp->nocb_gp_rdp == rdp) + show_rcu_nocb_gp_state(rdp); + + pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n", + rdp->cpu, rdp->nocb_gp_rdp->cpu, + "kK"[!!rdp->nocb_cb_kthread], + "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], + "cC"[!!atomic_read(&rdp->nocb_lock_contended)], + "lL"[raw_spin_is_locked(&rdp->nocb_lock)], + "sS"[!!rdp->nocb_cb_sleep], + ".W"[swait_active(&rdp->nocb_cb_wq)], + jiffies - rdp->nocb_bypass_first, + jiffies - rdp->nocb_nobypass_last, + rdp->nocb_nobypass_count, + ".D"[rcu_segcblist_ready_cbs(rsclp)], + ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)], + ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)], + ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)], + ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + rcu_segcblist_n_cbs(&rdp->cblist)); + + /* It is OK for GP kthreads to have GP state. */ + if (rdp->nocb_gp_rdp == rdp) + return; + + waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); + wastimer = timer_pending(&rdp->nocb_timer); + wassleep = swait_active(&rdp->nocb_gp_wq); + if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep && + !waslocked && !wastimer && !wassleep) + return; /* Nothing untowards. */ + + pr_info(" !!! 
%c%c%c%c %c\n", + "lL"[waslocked], + "dD"[!!rdp->nocb_defer_wakeup], + "tT"[wastimer], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[wassleep]); } #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static bool rcu_nocb_cpu_needs_barrier(int cpu) +/* No ->nocb_lock to acquire. */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ +} + +/* No ->nocb_lock to release. */ +static void rcu_nocb_unlock(struct rcu_data *rdp) { - WARN_ON_ONCE(1); /* Should be dead code. */ - return false; +} + +/* No ->nocb_lock to release. */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + local_irq_restore(flags); +} + +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); } static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) @@ -2252,19 +2482,24 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) { } -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy, unsigned long flags) +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) { - return false; + return true; } -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, - struct rcu_data *rdp, - unsigned long flags) +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags) { return false; } +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags) +{ + WARN_ON_ONCE(1); /* Should be dead code! */ +} + static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } @@ -2286,14 +2521,8 @@ static void __init rcu_spawn_nocb_kthreads(void) { } -static bool init_nocb_callback_list(struct rcu_data *rdp) -{ - return false; -} - -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) +static void show_rcu_nocb_state(struct rcu_data *rdp) { - return 0; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 065183391f75..c0b8c458d8a6 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp) // // Printing RCU CPU stall warnings -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * Dump detailed information for all tasks blocking the current RCU @@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPTION */ /* * Because preemptible RCU does not exist, we never have to check for @@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) { return 0; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPTION */ /* * Dump stacks of all tasks running on stalled CPUs. First try using @@ -527,6 +527,8 @@ static void check_cpu_stall(struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(); + if (rcu_cpu_stall_ftrace_dump) + rcu_ftrace_dump(DUMP_ALL); } else if (rcu_gp_in_progress() && ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && @@ -534,6 +536,8 @@ static void check_cpu_stall(struct rcu_data *rdp) /* They had a few time units to dump stack, so complain. 
*/ print_other_cpu_stall(gs2); + if (rcu_cpu_stall_ftrace_dump) + rcu_ftrace_dump(DUMP_ALL); } } @@ -585,6 +589,11 @@ void show_rcu_gp_kthreads(void) cpu, (long)rdp->gp_seq_needed); } } + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rcu_segcblist_is_offloaded(&rdp->cblist)) + show_rcu_nocb_state(rdp); + } /* sched_show_task(rcu_state.gp_kthread); */ } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 61df2bf08563..1861103662db 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -61,9 +61,15 @@ module_param(rcu_normal_after_boot, int, 0); #ifdef CONFIG_DEBUG_LOCK_ALLOC /** - * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? + * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? + * @ret: Best guess answer if lockdep cannot be relied on * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * Returns true if lockdep must be ignored, in which case *ret contains + * the best guess described below. Otherwise returns false, in which + * case *ret tells the caller nothing and the caller should instead + * consult lockdep. + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling @@ -75,35 +81,45 @@ module_param(rcu_normal_after_boot, int, 0); * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. * - * Note that if the CPU is in the idle loop from an RCU point of - * view (ie: that we are in the section between rcu_idle_enter() and - * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU - * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs - * that are in such a section, considering these as in extended quiescent - * state, so such a CPU is effectively never in an RCU read-side critical - * section regardless of what RCU primitives it invokes. This state of - * affairs is required --- we need to keep an RCU-free window in idle - * where the CPU may possibly enter into low power mode. This way we can - * notice an extended quiescent state to other CPUs that started a grace - * period. Otherwise we would delay any grace period as long as we run in - * the idle task. + * Note that if the CPU is in the idle loop from an RCU point of view (ie: + * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) + * then rcu_read_lock_held() sets *ret to false even if the CPU did an + * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are + * in such a section, considering these as in extended quiescent state, + * so such a CPU is effectively never in an RCU read-side critical section + * regardless of what RCU primitives it invokes. This state of affairs is + * required --- we need to keep an RCU-free window in idle where the CPU may + * possibly enter into low power mode. This way we can notice an extended + * quiescent state to other CPUs that started a grace period. Otherwise + * we would delay any grace period as long as we run in the idle task. * - * Similarly, we avoid claiming an SRCU read lock held if the current + * Similarly, we avoid claiming an RCU read lock held if the current * CPU is offline. 
*/ +static bool rcu_read_lock_held_common(bool *ret) +{ + if (!debug_lockdep_rcu_enabled()) { + *ret = 1; + return true; + } + if (!rcu_is_watching()) { + *ret = 0; + return true; + } + if (!rcu_lockdep_current_cpu_online()) { + *ret = 0; + return true; + } + return false; +} + int rcu_read_lock_sched_held(void) { - int lockdep_opinion = 0; + bool ret; - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; - if (debug_locks) - lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || !preemptible(); + if (rcu_read_lock_held_common(&ret)) + return ret; + return lock_is_held(&rcu_sched_lock_map) || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif @@ -136,8 +152,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); */ bool rcu_gp_is_expedited(void) { - return rcu_expedited || atomic_read(&rcu_expedited_nesting) || - rcu_scheduler_active == RCU_SCHEDULER_INIT; + return rcu_expedited || atomic_read(&rcu_expedited_nesting); } EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); @@ -261,12 +276,10 @@ NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); */ int rcu_read_lock_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return lock_is_held(&rcu_lock_map); } EXPORT_SYMBOL_GPL(rcu_read_lock_held); @@ -288,16 +301,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); */ int rcu_read_lock_bh_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); +int rcu_read_lock_any_held(void) +{ + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; + if (lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_sched_lock_map)) + return 1; + return !preemptible(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_any_held); + #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** @@ -437,6 +462,8 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); #endif #ifdef CONFIG_RCU_STALL_COMMON +int rcu_cpu_stall_ftrace_dump __read_mostly; +module_param(rcu_cpu_stall_ftrace_dump, int, 0644); int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..06961b997ed6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load) } #ifdef CONFIG_UCLAMP_TASK +/* + * Serializes updates of utilization clamp values + * + * The (slow-path) user-space triggers utilization clamp value updates which + * can require updates on (fast-path) scheduler's data structures used to + * support enqueue/dequeue operations. + * While the per-CPU rq lock protects fast-path update operations, user-space + * requests are serialized using a mutex to reduce the risk of conflicting + * updates or API abuses. 
+ */ +static DEFINE_MUTEX(uclamp_mutex); + /* Max allowed minimum utilization */ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; @@ -798,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); } -static inline unsigned int uclamp_none(int clamp_id) +static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -814,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se, } static inline unsigned int -uclamp_idle_value(struct rq *rq, unsigned int clamp_id, +uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { /* @@ -830,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id, return uclamp_none(UCLAMP_MIN); } -static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, +static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { /* Reset max-clamp retention only on idle exit */ @@ -841,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, } static inline -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, - unsigned int clamp_value) +enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, + unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; int bucket_id = UCLAMP_BUCKETS - 1; @@ -861,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static inline struct uclamp_se +uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) +{ + struct uclamp_se uc_req = p->uclamp_req[clamp_id]; +#ifdef CONFIG_UCLAMP_TASK_GROUP + struct uclamp_se uc_max; + + /* + * Tasks in autogroups or root task group will be + * restricted by system defaults. + */ + if (task_group_is_autogroup(task_group(p))) + return uc_req; + if (task_group(p) == &root_task_group) + return uc_req; + + uc_max = task_group(p)->uclamp[clamp_id]; + if (uc_req.value > uc_max.value || !uc_req.user_defined) + return uc_max; +#endif + + return uc_req; +} + /* * The effective clamp bucket index of a task depends on, by increasing * priority: * - the task specific clamp value, when explicitly requested from userspace + * - the task group effective clamp value, for tasks not either in the root + * group or in an autogroup * - the system default clamp value, defined by the sysadmin */ static inline struct uclamp_se -uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) +uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) { - struct uclamp_se uc_req = p->uclamp_req[clamp_id]; + struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id); struct uclamp_se uc_max = uclamp_default[clamp_id]; /* System default restrictions always apply */ @@ -880,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) return uc_req; } -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_eff; @@ -904,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) * for each bucket when all its RUNNABLE tasks require the same clamp. 
*/ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, - unsigned int clamp_id) + enum uclamp_id clamp_id) { struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -942,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, * enforce the expected state and warn. */ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, - unsigned int clamp_id) + enum uclamp_id clamp_id) { struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -981,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -996,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1005,15 +1043,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void +uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) +{ + struct rq_flags rf; + struct rq *rq; + + /* + * Lock the task and the rq where the task is (or was) queued. + * + * We might lock the (previous) rq of a !RUNNABLE task, but that's the + * price to pay to safely serialize util_{min,max} updates with + * enqueues, dequeues and migration operations. + * This is the same locking schema used by __set_cpus_allowed_ptr(). + */ + rq = task_rq_lock(p, &rf); + + /* + * Setting the clamp bucket is serialized by task_rq_lock(). + * If the task is not yet RUNNABLE and its task_struct is not + * affecting a valid clamp bucket, the next time it's enqueued, + * it will already see the updated clamp bucket value. 
+ */ + if (!p->uclamp[clamp_id].active) { + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + } + + task_rq_unlock(rq, p, &rf); +} + +static inline void +uclamp_update_active_tasks(struct cgroup_subsys_state *css, + unsigned int clamps) +{ + enum uclamp_id clamp_id; + struct css_task_iter it; + struct task_struct *p; + + css_task_iter_start(css, 0, &it); + while ((p = css_task_iter_next(&it))) { + for_each_clamp_id(clamp_id) { + if ((0x1 << clamp_id) & clamps) + uclamp_update_active(p, clamp_id); + } + } + css_task_iter_end(&it); +} + +#ifdef CONFIG_UCLAMP_TASK_GROUP +static void cpu_util_update_eff(struct cgroup_subsys_state *css); +static void uclamp_update_root_tg(void) +{ + struct task_group *tg = &root_task_group; + + uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], + sysctl_sched_uclamp_util_min, false); + uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], + sysctl_sched_uclamp_util_max, false); + + rcu_read_lock(); + cpu_util_update_eff(&root_task_group.css); + rcu_read_unlock(); +} +#else +static void uclamp_update_root_tg(void) { } +#endif + int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + bool update_root_tg = false; int old_min, old_max; - static DEFINE_MUTEX(mutex); int result; - mutex_lock(&mutex); + mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; @@ -1032,23 +1137,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, if (old_min != sysctl_sched_uclamp_util_min) { uclamp_se_set(&uclamp_default[UCLAMP_MIN], sysctl_sched_uclamp_util_min, false); + update_root_tg = true; } if (old_max != sysctl_sched_uclamp_util_max) { uclamp_se_set(&uclamp_default[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); + update_root_tg = true; } + if (update_root_tg) + uclamp_update_root_tg(); + /* - * Updating all the RUNNABLE task is expensive, keep it simple and do - * just a lazy update at each next enqueue time. + * We update all RUNNABLE tasks only when task groups are in use. + * Otherwise, keep it simple and do just a lazy update at each next + * task enqueue time. 
*/ + goto done; undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; done: - mutex_unlock(&mutex); + mutex_unlock(&uclamp_mutex); return result; } @@ -1075,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { - unsigned int clamp_id; + enum uclamp_id clamp_id; /* * On scheduling class change, reset to default clamps for tasks @@ -1112,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p, static void uclamp_fork(struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1134,9 +1246,11 @@ static void uclamp_fork(struct task_struct *p) static void __init init_uclamp(void) { struct uclamp_se uc_max = {}; - unsigned int clamp_id; + enum uclamp_id clamp_id; int cpu; + mutex_init(&uclamp_mutex); + for_each_possible_cpu(cpu) { memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); cpu_rq(cpu)->uclamp_flags = 0; @@ -1149,8 +1263,13 @@ static void __init init_uclamp(void) /* System defaults allow max clamp values for both indexes */ uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); - for_each_clamp_id(clamp_id) + for_each_clamp_id(clamp_id) { uclamp_default[clamp_id] = uc_max; +#ifdef CONFIG_UCLAMP_TASK_GROUP + root_task_group.uclamp_req[clamp_id] = uc_max; + root_task_group.uclamp[clamp_id] = uc_max; +#endif + } } #else /* CONFIG_UCLAMP_TASK */ @@ -1494,7 +1613,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); } /* @@ -3214,12 +3333,8 @@ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) { - struct mm_struct *mm, *oldmm; - prepare_task_switch(rq, prev, next); - mm = next->mm; - oldmm = prev->active_mm; /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -3228,22 +3343,37 @@ context_switch(struct rq *rq, struct task_struct *prev, arch_start_context_switch(prev); /* - * If mm is non-NULL, we pass through switch_mm(). If mm is - * NULL, we will pass through mmdrop() in finish_task_switch(). - * Both of these contain the full memory barrier required by - * membarrier after storing to rq->curr, before returning to - * user-space. + * kernel -> kernel lazy + transfer active + * user -> kernel lazy + mmgrab() active + * + * kernel -> user switch + mmdrop() active + * user -> user switch */ - if (!mm) { - next->active_mm = oldmm; - mmgrab(oldmm); - enter_lazy_tlb(oldmm, next); - } else - switch_mm_irqs_off(oldmm, mm, next); + if (!next->mm) { // to kernel + enter_lazy_tlb(prev->active_mm, next); + + next->active_mm = prev->active_mm; + if (prev->mm) // from user + mmgrab(prev->active_mm); + else + prev->active_mm = NULL; + } else { // to user + /* + * sys_membarrier() requires an smp_mb() between setting + * rq->curr and returning to userspace. + * + * The below provides this either through switch_mm(), or in + * case 'prev->active_mm == next->mm' through + * finish_task_switch()'s mmdrop(). + */ - if (!prev->mm) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; + switch_mm_irqs_off(prev->active_mm, next->mm, next); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). 
*/ + rq->prev_mm = prev->active_mm; + prev->active_mm = NULL; + } } rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); @@ -3486,8 +3616,36 @@ void scheduler_tick(void) struct tick_work { int cpu; + atomic_t state; struct delayed_work work; }; +/* Values for ->state, see diagram below. */ +#define TICK_SCHED_REMOTE_OFFLINE 0 +#define TICK_SCHED_REMOTE_OFFLINING 1 +#define TICK_SCHED_REMOTE_RUNNING 2 + +/* + * State diagram for ->state: + * + * + * TICK_SCHED_REMOTE_OFFLINE + * | ^ + * | | + * | | sched_tick_remote() + * | | + * | | + * +--TICK_SCHED_REMOTE_OFFLINING + * | ^ + * | | + * sched_tick_start() | | sched_tick_stop() + * | | + * V | + * TICK_SCHED_REMOTE_RUNNING + * + * + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() + * and sched_tick_start() are happy to leave the state in RUNNING. + */ static struct tick_work __percpu *tick_work_cpu; @@ -3500,6 +3658,7 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr; struct rq_flags rf; u64 delta; + int os; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3513,7 +3672,7 @@ static void sched_tick_remote(struct work_struct *work) rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr)) + if (is_idle_task(curr) || cpu_is_offline(cpu)) goto out_unlock; update_rq_clock(rq); @@ -3533,13 +3692,18 @@ out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough - * to keep scheduler internal stats reasonably up to date. + * to keep scheduler internal stats reasonably up to date. But + * first update state to reflect hotplug activity if required. */ - queue_delayed_work(system_unbound_wq, dwork, HZ); + os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); + if (os == TICK_SCHED_REMOTE_RUNNING) + queue_delayed_work(system_unbound_wq, dwork, HZ); } static void sched_tick_start(int cpu) { + int os; struct tick_work *twork; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) @@ -3548,15 +3712,20 @@ static void sched_tick_start(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - twork->cpu = cpu; - INIT_DELAYED_WORK(&twork->work, sched_tick_remote); - queue_delayed_work(system_unbound_wq, &twork->work, HZ); + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); + if (os == TICK_SCHED_REMOTE_OFFLINE) { + twork->cpu = cpu; + INIT_DELAYED_WORK(&twork->work, sched_tick_remote); + queue_delayed_work(system_unbound_wq, &twork->work, HZ); + } } #ifdef CONFIG_HOTPLUG_CPU static void sched_tick_stop(int cpu) { struct tick_work *twork; + int os; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) return; @@ -3564,7 +3733,10 @@ static void sched_tick_stop(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - cancel_delayed_work_sync(&twork->work); + /* There cannot be competing actions, but don't rely on stop-machine. */ + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); + WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); + /* Don't cancel, as this would mess up the state machine. 
*/ } #endif /* CONFIG_HOTPLUG_CPU */ @@ -3572,7 +3744,6 @@ int __init sched_tick_offload_init(void) { tick_work_cpu = alloc_percpu(struct tick_work); BUG_ON(!tick_work_cpu); - return 0; } @@ -3581,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } #endif -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) /* * If the value passed in is equal to the current preempt count @@ -3739,7 +3910,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) - goto again; + goto restart; /* Assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) @@ -3748,14 +3919,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -again: +restart: + /* + * Ensure that we put DL/RT tasks before the pick loop, such that they + * can PULL higher prio tasks when we lower the RQ 'priority'. + */ + prev->sched_class->put_prev_task(rq, prev, rf); + if (!rq->nr_running) + newidle_balance(rq, rf); + for_each_class(class) { - p = class->pick_next_task(rq, prev, rf); - if (p) { - if (unlikely(p == RETRY_TASK)) - goto again; + p = class->pick_next_task(rq, NULL, NULL); + if (p) return p; - } } /* The idle class should always have a runnable task: */ @@ -3782,7 +3958,7 @@ again: * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets * called on the nearest possible occasion: * - * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * - If the kernel is preemptible (CONFIG_PREEMPTION=y): * * - in syscall or exception context, at the next outmost * preempt_enable(). (this might be as soon as the wake_up()'s @@ -3791,7 +3967,7 @@ again: * - in IRQ context, return from interrupt-handler to * preemptible context * - * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) * then at the next: * * - cond_resched() call @@ -3904,7 +4080,7 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state || tsk_is_pi_blocked(tsk)) + if (!tsk->state) return; /* @@ -3920,6 +4096,9 @@ static inline void sched_submit_work(struct task_struct *tsk) preempt_enable_no_resched(); } + if (tsk_is_pi_blocked(tsk)) + return; + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. @@ -4033,7 +4212,7 @@ static void __sched notrace preempt_schedule_common(void) } while (need_resched()); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. 
Kernel preemptions off return from interrupt @@ -4105,7 +4284,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ /* * this is the entry point to schedule() from kernel preemption @@ -4273,7 +4452,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (queued) enqueue_task(rq, p, queue_flag); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4340,7 +4519,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_curr(rq); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); out_unlock: task_rq_unlock(rq, p, &rf); } @@ -4657,6 +4836,9 @@ recheck: return retval; } + if (pi) + cpuset_read_lock(); + /* * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4671,8 +4853,8 @@ recheck: * Changing the policy of the stop threads its a very bad idea: */ if (p == rq->stop) { - task_rq_unlock(rq, p, &rf); - return -EINVAL; + retval = -EINVAL; + goto unlock; } /* @@ -4690,8 +4872,8 @@ recheck: goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &rf); - return 0; + retval = 0; + goto unlock; } change: @@ -4704,8 +4886,8 @@ change: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &rf); - return -EPERM; + retval = -EPERM; + goto unlock; } #endif #ifdef CONFIG_SMP @@ -4720,8 +4902,8 @@ change: */ if (!cpumask_subset(span, p->cpus_ptr) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &rf); - return -EPERM; + retval = -EPERM; + goto unlock; } } #endif @@ -4731,6 +4913,8 @@ change: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); + if (pi) + cpuset_read_unlock(); goto recheck; } @@ -4740,8 +4924,8 @@ change: * is available. */ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &rf); - return -EBUSY; + retval = -EBUSY; + goto unlock; } p->sched_reset_on_fork = reset_on_fork; @@ -4783,7 +4967,7 @@ change: enqueue_task(rq, p, queue_flags); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); @@ -4791,14 +4975,22 @@ change: preempt_disable(); task_rq_unlock(rq, p, &rf); - if (pi) + if (pi) { + cpuset_read_unlock(); rt_mutex_adjust_pi(p); + } /* Run balance callbacks after we've adjusted the PI chain: */ balance_callback(rq); preempt_enable(); return 0; + +unlock: + task_rq_unlock(rq, p, &rf); + if (pi) + cpuset_read_unlock(); + return retval; } static int _sched_setscheduler(struct task_struct *p, int policy, @@ -4882,10 +5074,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); + if (likely(p)) + get_task_struct(p); rcu_read_unlock(); + if (likely(p)) { + retval = sched_setscheduler(p, policy, &lparam); + put_task_struct(p); + } + return retval; } @@ -5102,37 +5299,40 @@ out_unlock: return retval; } -static int sched_read_attr(struct sched_attr __user *uattr, - struct sched_attr *attr, - unsigned int usize) +/* + * Copy the kernel size attribute structure (which might be larger + * than what user-space knows about) to user-space. 
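The do_sched_setscheduler() hunk above takes a reference on the task before dropping the RCU read lock, because sched_setscheduler() can now sleep (the same change adds a cpuset_read_lock() on that path). A toy, single-threaded model of that ordering; task_ex, get_task() and put_task() are invented stand-ins for task_struct refcounting:

    #include <stdio.h>

    struct task_ex {
            int refcount;   /* the kernel uses refcount_t; an int is enough here */
            int pid;
    };

    static void get_task(struct task_ex *p) { p->refcount++; }

    static void put_task(struct task_ex *p)
    {
            if (--p->refcount == 0)
                    printf("task %d freed\n", p->pid);
    }

    static int setscheduler_ex(struct task_ex *p)
    {
            /* may sleep, so it must not run inside the read-side critical section */
            printf("changing policy of pid %d\n", p->pid);
            return 0;
    }

    int main(void)
    {
            struct task_ex task = { .refcount = 1, .pid = 1234 };
            struct task_ex *p = &task;   /* "found" under rcu_read_lock() */
            int ret;

            get_task(p);                 /* pin it before the read lock is dropped */
            /* rcu_read_unlock(): only the reference keeps @p valid from here on */

            ret = setscheduler_ex(p);
            put_task(p);

            printf("ret=%d, refcount back to %d\n", ret, task.refcount);
            return 0;
    }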
+ * + * Note that all cases are valid: user-space buffer can be larger or + * smaller than the kernel-space buffer. The usual case is that both + * have the same size. + */ +static int +sched_attr_copy_to_user(struct sched_attr __user *uattr, + struct sched_attr *kattr, + unsigned int usize) { - int ret; + unsigned int ksize = sizeof(*kattr); if (!access_ok(uattr, usize)) return -EFAULT; /* - * If we're handed a smaller struct than we know of, - * ensure all the unknown bits are 0 - i.e. old - * user-space does not get uncomplete information. + * sched_getattr() ABI forwards and backwards compatibility: + * + * If usize == ksize then we just copy everything to user-space and all is good. + * + * If usize < ksize then we only copy as much as user-space has space for, + * this keeps ABI compatibility as well. We skip the rest. + * + * If usize > ksize then user-space is using a newer version of the ABI, + * which part the kernel doesn't know about. Just ignore it - tooling can + * detect the kernel's knowledge of attributes from the attr->size value + * which is set to ksize in this case. */ - if (usize < sizeof(*attr)) { - unsigned char *addr; - unsigned char *end; - - addr = (void *)attr + usize; - end = (void *)attr + sizeof(*attr); - - for (; addr < end; addr++) { - if (*addr) - return -EFBIG; - } - - attr->size = usize; - } + kattr->size = min(usize, ksize); - ret = copy_to_user(uattr, attr, attr->size); - if (ret) + if (copy_to_user(uattr, kattr, kattr->size)) return -EFAULT; return 0; @@ -5142,20 +5342,18 @@ static int sched_read_attr(struct sched_attr __user *uattr, * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. - * @size: sizeof(attr) for fwd/bwd comp. + * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility. * @flags: for future extension. 
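Seen from user-space, the compatibility rule above means any size >= SCHED_ATTR_SIZE_VER0 may be passed, and attr.size afterwards reports how much the kernel filled in. A small Linux-only usage sketch; sched_attr_ex is a hand-written copy of the VER0 fields (48 bytes), not anything defined by this patch:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    struct sched_attr_ex {          /* VER0 layout */
            uint32_t size;
            uint32_t sched_policy;
            uint64_t sched_flags;
            int32_t  sched_nice;
            uint32_t sched_priority;
            uint64_t sched_runtime;
            uint64_t sched_deadline;
            uint64_t sched_period;
    };

    int main(void)
    {
            struct sched_attr_ex attr;

            memset(&attr, 0, sizeof(attr));
            if (syscall(__NR_sched_getattr, 0, &attr, sizeof(attr), 0) != 0) {
                    perror("sched_getattr");
                    return 1;
            }

            /* attr.size now holds min(usize, ksize) as computed above. */
            printf("policy=%u nice=%d, kernel wrote %u bytes\n",
                   attr.sched_policy, attr.sched_nice, attr.size);
            return 0;
    }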
*/ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size, unsigned int, flags) + unsigned int, usize, unsigned int, flags) { - struct sched_attr attr = { - .size = sizeof(struct sched_attr), - }; + struct sched_attr kattr = { }; struct task_struct *p; int retval; - if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0 || flags) + if (!uattr || pid < 0 || usize > PAGE_SIZE || + usize < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; rcu_read_lock(); @@ -5168,25 +5366,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, if (retval) goto out_unlock; - attr.sched_policy = p->policy; + kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) - attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; if (task_has_dl_policy(p)) - __getparam_dl(p, &attr); + __getparam_dl(p, &kattr); else if (task_has_rt_policy(p)) - attr.sched_priority = p->rt_priority; + kattr.sched_priority = p->rt_priority; else - attr.sched_nice = task_nice(p); + kattr.sched_nice = task_nice(p); #ifdef CONFIG_UCLAMP_TASK - attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; - attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; + kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif rcu_read_unlock(); - retval = sched_read_attr(uattr, &attr, size); - return retval; + return sched_attr_copy_to_user(uattr, &kattr, usize); out_unlock: rcu_read_unlock(); @@ -5416,7 +5613,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION int __sched _cond_resched(void) { if (should_resched(0)) { @@ -5433,7 +5630,7 @@ EXPORT_SYMBOL(_cond_resched); * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). 
*/ @@ -5972,7 +6169,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -6012,21 +6209,22 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +static struct task_struct *__pick_migrate_task(struct rq *rq) { -} + const struct sched_class *class; + struct task_struct *next; -static const struct sched_class fake_sched_class = { - .put_prev_task = put_prev_task_fake, -}; + for_each_class(class) { + next = class->pick_next_task(rq, NULL, NULL); + if (next) { + next->sched_class->put_prev_task(rq, next, NULL); + return next; + } + } -static struct task_struct fake_task = { - /* - * Avoid pull_{rt,dl}_task() - */ - .prio = MAX_PRIO + 1, - .sched_class = &fake_sched_class, -}; + /* The idle class should always have a runnable task */ + BUG(); +} /* * Migrate all tasks from the rq, sleeping tasks will be migrated by @@ -6069,12 +6267,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) if (rq->nr_running == 1) break; - /* - * pick_next_task() assumes pinned rq->lock: - */ - next = pick_next_task(rq, &fake_task, rf); - BUG_ON(!next); - put_prev_task(rq, next); + next = __pick_migrate_task(rq); /* * Rules for changing task_struct::cpus_mask are holding @@ -6371,19 +6564,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); void __init sched_init(void) { - unsigned long alloc_size = 0, ptr; + unsigned long ptr = 0; int i; wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); #endif #ifdef CONFIG_RT_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); #endif - if (alloc_size) { - ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); + if (ptr) { + ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.se = (struct sched_entity **)ptr; @@ -6702,7 +6895,7 @@ struct task_struct *curr_task(int cpu) #ifdef CONFIG_IA64 /** - * set_curr_task - set the current task for a given CPU. + * ia64_set_curr_task - set the current task for a given CPU. * @cpu: the processor in question. * @p: the task pointer to set. 
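__pick_migrate_task() above walks the scheduling classes in priority order and takes the first runnable task, the same shape as the restart path added to pick_next_task(). A toy model of that loop; rq_ex, class_ex and the integer task ids are invented for the sketch:

    #include <stdio.h>
    #include <stddef.h>

    struct rq_ex { int dl_task, rt_task, fair_task; };   /* -1 means nothing runnable */

    struct class_ex {
            const char *name;
            int (*pick)(struct rq_ex *rq);
    };

    static int pick_dl(struct rq_ex *rq)   { return rq->dl_task; }
    static int pick_rt(struct rq_ex *rq)   { return rq->rt_task; }
    static int pick_fair(struct rq_ex *rq) { return rq->fair_task; }
    static int pick_idle(struct rq_ex *rq) { (void)rq; return 0; }  /* always runnable */

    /* Highest-priority class first, like for_each_class(). */
    static const struct class_ex classes[] = {
            { "dl", pick_dl }, { "rt", pick_rt },
            { "fair", pick_fair }, { "idle", pick_idle },
    };

    static int pick_next(struct rq_ex *rq)
    {
            size_t i;

            for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
                    int t = classes[i].pick(rq);

                    if (t >= 0) {
                            printf("picked task %d from the %s class\n",
                                   t, classes[i].name);
                            return t;
                    }
            }
            return -1;   /* unreachable: the idle class always has a task */
    }

    int main(void)
    {
            struct rq_ex rq = { .dl_task = -1, .rt_task = 7, .fair_task = 3 };

            pick_next(&rq);   /* prints: picked task 7 from the rt class */
            return 0;
    }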
* @@ -6727,6 +6920,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); +static inline void alloc_uclamp_sched_group(struct task_group *tg, + struct task_group *parent) +{ +#ifdef CONFIG_UCLAMP_TASK_GROUP + enum uclamp_id clamp_id; + + for_each_clamp_id(clamp_id) { + uclamp_se_set(&tg->uclamp_req[clamp_id], + uclamp_none(clamp_id), false); + tg->uclamp[clamp_id] = parent->uclamp[clamp_id]; + } +#endif +} + static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -6750,6 +6957,8 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + alloc_uclamp_sched_group(tg, parent); + return tg; err: @@ -6853,7 +7062,7 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); if (running) - set_curr_task(rq, tsk); + set_next_task(rq, tsk); task_rq_unlock(rq, tsk, &rf); } @@ -6936,10 +7145,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; -#else - /* We don't support RT-tasks being in separate groups */ - if (task->sched_class != &fair_sched_class) - return -EINVAL; #endif /* * Serialize against wake_up_new_task() such that if its @@ -6970,6 +7175,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) sched_move_task(task); } +#ifdef CONFIG_UCLAMP_TASK_GROUP +static void cpu_util_update_eff(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys_state *top_css = css; + struct uclamp_se *uc_parent = NULL; + struct uclamp_se *uc_se = NULL; + unsigned int eff[UCLAMP_CNT]; + enum uclamp_id clamp_id; + unsigned int clamps; + + css_for_each_descendant_pre(css, top_css) { + uc_parent = css_tg(css)->parent + ? css_tg(css)->parent->uclamp : NULL; + + for_each_clamp_id(clamp_id) { + /* Assume effective clamps matches requested clamps */ + eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value; + /* Cap effective clamps with parent's effective clamps */ + if (uc_parent && + eff[clamp_id] > uc_parent[clamp_id].value) { + eff[clamp_id] = uc_parent[clamp_id].value; + } + } + /* Ensure protection is always capped by limit */ + eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]); + + /* Propagate most restrictive effective clamps */ + clamps = 0x0; + uc_se = css_tg(css)->uclamp; + for_each_clamp_id(clamp_id) { + if (eff[clamp_id] == uc_se[clamp_id].value) + continue; + uc_se[clamp_id].value = eff[clamp_id]; + uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]); + clamps |= (0x1 << clamp_id); + } + if (!clamps) { + css = css_rightmost_descendant(css); + continue; + } + + /* Immediately update descendants RUNNABLE tasks */ + uclamp_update_active_tasks(css, clamps); + } +} + +/* + * Integer 10^N with a given N exponent by casting to integer the literal "1eN" + * C expression. Since there is no way to convert a macro argument (N) into a + * character constant, use two levels of macros. 
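The two-level macro trick described above, together with the percent-to-capacity scaling that capacity_from_percent() applies just below, in a standalone sketch; PCT_SHIFT, PCT_SCALE and CAP_SCALE are local names, with CAP_SCALE = 1024 assumed to match SCHED_CAPACITY_SCALE:

    #include <stdio.h>

    /* Two macro levels so the argument is expanded before the ## pasting. */
    #define _POW10(exp) ((unsigned int)1e##exp)
    #define POW10(exp)  _POW10(exp)

    #define PCT_SHIFT 2
    #define PCT_SCALE (100 * POW10(PCT_SHIFT))   /* "100.00%" == 10000 */
    #define CAP_SCALE 1024                       /* assumed SCHED_CAPACITY_SCALE */

    int main(void)
    {
            unsigned long long pct = 7525;   /* "75.25" parsed as a fixed-point percent */
            unsigned long long util;

            /* Same scaling as capacity_from_percent(), rounded to closest. */
            util = (pct * CAP_SCALE + PCT_SCALE / 2) / PCT_SCALE;

            printf("POW10(PCT_SHIFT) = %u\n", POW10(PCT_SHIFT));   /* 100 */
            printf("%llu.%02llu%% -> util %llu / %d\n",
                   pct / POW10(PCT_SHIFT), pct % POW10(PCT_SHIFT), util, CAP_SCALE);
            return 0;
    }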
+ */ +#define _POW10(exp) ((unsigned int)1e##exp) +#define POW10(exp) _POW10(exp) + +struct uclamp_request { +#define UCLAMP_PERCENT_SHIFT 2 +#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT)) + s64 percent; + u64 util; + int ret; +}; + +static inline struct uclamp_request +capacity_from_percent(char *buf) +{ + struct uclamp_request req = { + .percent = UCLAMP_PERCENT_SCALE, + .util = SCHED_CAPACITY_SCALE, + .ret = 0, + }; + + buf = strim(buf); + if (strcmp(buf, "max")) { + req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT, + &req.percent); + if (req.ret) + return req; + if (req.percent > UCLAMP_PERCENT_SCALE) { + req.ret = -ERANGE; + return req; + } + + req.util = req.percent << SCHED_CAPACITY_SHIFT; + req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE); + } + + return req; +} + +static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, + enum uclamp_id clamp_id) +{ + struct uclamp_request req; + struct task_group *tg; + + req = capacity_from_percent(buf); + if (req.ret) + return req.ret; + + mutex_lock(&uclamp_mutex); + rcu_read_lock(); + + tg = css_tg(of_css(of)); + if (tg->uclamp_req[clamp_id].value != req.util) + uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false); + + /* + * Because of not recoverable conversion rounding we keep track of the + * exact requested value + */ + tg->uclamp_pct[clamp_id] = req.percent; + + /* Update effective clamps to track the most restrictive value */ + cpu_util_update_eff(of_css(of)); + + rcu_read_unlock(); + mutex_unlock(&uclamp_mutex); + + return nbytes; +} + +static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN); +} + +static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX); +} + +static inline void cpu_uclamp_print(struct seq_file *sf, + enum uclamp_id clamp_id) +{ + struct task_group *tg; + u64 util_clamp; + u64 percent; + u32 rem; + + rcu_read_lock(); + tg = css_tg(seq_css(sf)); + util_clamp = tg->uclamp_req[clamp_id].value; + rcu_read_unlock(); + + if (util_clamp == SCHED_CAPACITY_SCALE) { + seq_puts(sf, "max\n"); + return; + } + + percent = tg->uclamp_pct[clamp_id]; + percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem); + seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem); +} + +static int cpu_uclamp_min_show(struct seq_file *sf, void *v) +{ + cpu_uclamp_print(sf, UCLAMP_MIN); + return 0; +} + +static int cpu_uclamp_max_show(struct seq_file *sf, void *v) +{ + cpu_uclamp_print(sf, UCLAMP_MAX); + return 0; +} +#endif /* CONFIG_UCLAMP_TASK_GROUP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -7315,6 +7692,20 @@ static struct cftype cpu_legacy_files[] = { .write_u64 = cpu_rt_period_write_uint, }, #endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + { + .name = "uclamp.min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_min_show, + .write = cpu_uclamp_min_write, + }, + { + .name = "uclamp.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_max_show, + .write = cpu_uclamp_max_write, + }, +#endif { } /* Terminate */ }; @@ -7482,6 +7873,20 @@ static struct cftype cpu_files[] = { .write = cpu_max_write, }, #endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + { + .name = "uclamp.min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = 
cpu_uclamp_min_show, + .write = cpu_uclamp_min_write, + }, + { + .name = "uclamp.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_max_show, + .write = cpu_uclamp_max_write, + }, +#endif { } /* terminate */ }; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 636ca6f88c8e..fdce9cfaca05 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -40,6 +40,7 @@ struct sugov_policy { struct task_struct *thread; bool work_in_progress; + bool limits_changed; bool need_freq_update; }; @@ -89,8 +90,11 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) !cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (unlikely(sg_policy->need_freq_update)) + if (unlikely(sg_policy->limits_changed)) { + sg_policy->limits_changed = false; + sg_policy->need_freq_update = true; return true; + } delta_ns = time - sg_policy->last_freq_update_time; @@ -259,9 +263,9 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, * irq metric. Because IRQ/steal time is hidden from the task clock we * need to scale the task numbers: * - * 1 - irq - * U' = irq + ------- * U - * max + * max - irq + * U' = irq + --------- * U + * max */ util = scale_irq_capacity(util, irq, max); util += irq; @@ -437,7 +441,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) { if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) - sg_policy->need_freq_update = true; + sg_policy->limits_changed = true; } static void sugov_update_single(struct update_util_data *hook, u64 time, @@ -457,7 +461,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; - busy = sugov_cpu_is_busy(sg_cpu); + /* Limits may have changed, don't skip frequency update */ + busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu); util = sugov_get_util(sg_cpu); max = sg_cpu->max; @@ -831,6 +836,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->last_freq_update_time = 0; sg_policy->next_freq = 0; sg_policy->work_in_progress = false; + sg_policy->limits_changed = false; sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; @@ -879,7 +885,7 @@ static void sugov_limits(struct cpufreq_policy *policy) mutex_unlock(&sg_policy->work_lock); } - sg_policy->need_freq_update = true; + sg_policy->limits_changed = true; } struct cpufreq_governor schedutil_gov = { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ef5b9f6b1d42..39dc9f74f289 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) { struct rq *later_rq = NULL; + struct dl_bw *dl_b; later_rq = find_lock_later_rq(p, rq); if (!later_rq) { @@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p double_lock_balance(rq, later_rq); } + if (p->dl.dl_non_contending || p->dl.dl_throttled) { + /* + * Inactive timer is armed (or callback is running, but + * waiting for us to release rq locks). In any case, when it + * will fire (or continue), it will see running_bw of this + * task migrated to later_rq (and correctly handle it). 
+ */ + sub_running_bw(&p->dl, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); + + add_rq_bw(&p->dl, &later_rq->dl); + add_running_bw(&p->dl, &later_rq->dl); + } else { + sub_rq_bw(&p->dl, &rq->dl); + add_rq_bw(&p->dl, &later_rq->dl); + } + + /* + * And we finally need to fixup root_domain(s) bandwidth accounting, + * since p is still hanging out in the old (now moved to default) root + * domain. + */ + dl_b = &rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); + raw_spin_unlock(&dl_b->lock); + + dl_b = &later_rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span)); + raw_spin_unlock(&dl_b->lock); + set_task_cpu(p, later_rq->cpu); double_unlock_balance(later_rq, rq); @@ -1694,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) } #endif -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static void set_next_task_dl(struct rq *rq, struct task_struct *p) { p->se.exec_start = rq_clock_task(rq); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); + + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, p); + + if (rq->curr->sched_class != &dl_sched_class) + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + deadline_queue_push_tasks(rq); } static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, @@ -1720,64 +1761,42 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *p; struct dl_rq *dl_rq; - dl_rq = &rq->dl; + WARN_ON_ONCE(prev || rf); - if (need_pull_dl_task(rq, prev)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we're - * being very careful to re-start the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_dl_task(rq); - rq_repin_lock(rq, rf); - /* - * pull_dl_task() can drop (and re-acquire) rq->lock; this - * means a stop task can slip in, in which case we need to - * re-start task selection. - */ - if (rq->stop && task_on_rq_queued(rq->stop)) - return RETRY_TASK; - } - - /* - * When prev is DL, we may throttle it in put_prev_task(). - * So, we update time before we check for dl_nr_running. - */ - if (prev->sched_class == &dl_sched_class) - update_curr_dl(rq); + dl_rq = &rq->dl; if (unlikely(!dl_rq->dl_nr_running)) return NULL; - put_prev_task(rq, prev); - dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); p = dl_task_of(dl_se); - set_next_task(rq, p); - - if (hrtick_enabled(rq)) - start_hrtick_dl(rq, p); - - deadline_queue_push_tasks(rq); - - if (rq->curr->sched_class != &dl_sched_class) - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + set_next_task_dl(rq, p); return p; } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) { update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); + + if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. 
+ */ + rq_unpin_lock(rq, rf); + pull_dl_task(rq); + rq_repin_lock(rq, rf); + } } /* @@ -1811,11 +1830,6 @@ static void task_fork_dl(struct task_struct *p) */ } -static void set_curr_task_dl(struct rq *rq) -{ - set_next_task(rq, rq->curr); -} - #ifdef CONFIG_SMP /* Only try algorithms three times */ @@ -2088,17 +2102,13 @@ retry: } deactivate_task(rq, next_task, 0); - sub_running_bw(&next_task->dl, &rq->dl); - sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); - add_rq_bw(&next_task->dl, &later_rq->dl); /* * Update the later_rq clock here, because the clock is used * by the cpufreq_update_util() inside __add_running_bw(). */ update_rq_clock(later_rq); - add_running_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); ret = 1; @@ -2186,11 +2196,7 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); - sub_running_bw(&p->dl, &src_rq->dl); - sub_rq_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); - add_rq_bw(&p->dl, &this_rq->dl); - add_running_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -2283,6 +2289,36 @@ void __init init_sched_dl_class(void) GFP_KERNEL, cpu_to_node(i)); } +void dl_add_task_root_domain(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + struct dl_bw *dl_b; + + rq = task_rq_lock(p, &rf); + if (!dl_task(p)) + goto unlock; + + dl_b = &rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + + __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); + + raw_spin_unlock(&dl_b->lock); + +unlock: + task_rq_unlock(rq, p, &rf); +} + +void dl_clear_root_domain(struct root_domain *rd) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + rd->dl_bw.total_bw = 0; + raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); +} + #endif /* CONFIG_SMP */ static void switched_from_dl(struct rq *rq, struct task_struct *p) @@ -2403,6 +2439,7 @@ const struct sched_class dl_sched_class = { .pick_next_task = pick_next_task_dl, .put_prev_task = put_prev_task_dl, + .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, @@ -2413,7 +2450,6 @@ const struct sched_class dl_sched_class = { .task_woken = task_woken_dl, #endif - .set_curr_task = set_curr_task_dl, .task_tick = task_tick_dl, .task_fork = task_fork_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 036be95a87e9..d4bbf68c3161 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu) } /* - * The margin used when comparing utilization with CPU capacity: - * util * margin < capacity * 1024 + * The margin used when comparing utilization with CPU capacity. * * (default: ~20%) */ -static unsigned int capacity_margin = 1280; +#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) + #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -1086,6 +1086,21 @@ struct numa_group { unsigned long faults[0]; }; +/* + * For functions that can be called in multiple contexts that permit reading + * ->numa_group (see struct task_struct for locking rules). 
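deref_task_numa_group() and deref_curr_numa_group(), added just below, exist so that each path reads p->numa_group once under the documented rules and then keeps using the local copy. A user-space analogue with C11 atomics standing in for rcu_dereference(); RCU's deferred freeing is not modelled, and numa_group_ex/task_ex are invented types:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct numa_group_ex { int gid; };
    struct task_ex { _Atomic(struct numa_group_ex *) numa_group; };

    static int task_numa_group_id_ex(struct task_ex *p)
    {
            /* Load the pointer once; it may be cleared concurrently, so only
             * this local copy is dereferenced afterwards. */
            struct numa_group_ex *ng =
                    atomic_load_explicit(&p->numa_group, memory_order_acquire);

            return ng ? ng->gid : 0;
    }

    int main(void)
    {
            struct numa_group_ex *g = malloc(sizeof(*g));
            struct task_ex t;

            if (!g)
                    return 1;
            g->gid = 42;
            atomic_store_explicit(&t.numa_group, g, memory_order_release);
            printf("gid=%d\n", task_numa_group_id_ex(&t));   /* 42 */

            atomic_store_explicit(&t.numa_group, NULL, memory_order_release);
            printf("gid=%d\n", task_numa_group_id_ex(&t));   /* 0 */

            free(g);
            return 0;
    }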
+ */ +static struct numa_group *deref_task_numa_group(struct task_struct *p) +{ + return rcu_dereference_check(p->numa_group, p == current || + (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); +} + +static struct numa_group *deref_curr_numa_group(struct task_struct *p) +{ + return rcu_dereference_protected(p->numa_group, p == current); +} + static inline unsigned long group_faults_priv(struct numa_group *ng); static inline unsigned long group_faults_shared(struct numa_group *ng); @@ -1129,10 +1144,12 @@ static unsigned int task_scan_start(struct task_struct *p) { unsigned long smin = task_scan_min(p); unsigned long period = smin; + struct numa_group *ng; /* Scale the maximum scan period with the amount of shared memory. */ - if (p->numa_group) { - struct numa_group *ng = p->numa_group; + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); + if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); @@ -1140,6 +1157,7 @@ static unsigned int task_scan_start(struct task_struct *p) period *= shared + 1; period /= private + shared + 1; } + rcu_read_unlock(); return max(smin, period); } @@ -1148,13 +1166,14 @@ static unsigned int task_scan_max(struct task_struct *p) { unsigned long smin = task_scan_min(p); unsigned long smax; + struct numa_group *ng; /* Watch for min being lower than max due to floor calculations */ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); /* Scale the maximum scan period with the amount of shared memory. */ - if (p->numa_group) { - struct numa_group *ng = p->numa_group; + ng = deref_curr_numa_group(p); + if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); unsigned long period = smax; @@ -1169,47 +1188,6 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) -{ - int mm_users = 0; - struct mm_struct *mm = p->mm; - - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) { - mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - mm->numa_scan_seq = 0; - } - } - p->node_stamp = 0; - p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->numa_group = NULL; - p->last_task_numa_placement = 0; - p->last_sum_exec_runtime = 0; - - /* New address space, reset the preferred nid */ - if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = NUMA_NO_NODE; - return; - } - - /* - * New thread, keep existing numa_preferred_nid which should be copied - * already by arch_dup_task_struct but stagger when scans start. - */ - if (mm) { - unsigned int delay; - - delay = min_t(unsigned int, task_scan_max(current), - current->numa_scan_period * mm_users * NSEC_PER_MSEC); - delay += 2 * TICK_NSEC; - p->node_stamp = delay; - } -} - static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); @@ -1233,7 +1211,16 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) pid_t task_numa_group_id(struct task_struct *p) { - return p->numa_group ? 
p->numa_group->gid : 0; + struct numa_group *ng; + pid_t gid = 0; + + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); + if (ng) + gid = ng->gid; + rcu_read_unlock(); + + return gid; } /* @@ -1258,11 +1245,13 @@ static inline unsigned long task_faults(struct task_struct *p, int nid) static inline unsigned long group_faults(struct task_struct *p, int nid) { - if (!p->numa_group) + struct numa_group *ng = deref_task_numa_group(p); + + if (!ng) return 0; - return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + - p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; + return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] + + ng->faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) @@ -1400,12 +1389,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid, static inline unsigned long group_weight(struct task_struct *p, int nid, int dist) { + struct numa_group *ng = deref_task_numa_group(p); unsigned long faults, total_faults; - if (!p->numa_group) + if (!ng) return 0; - total_faults = p->numa_group->total_faults; + total_faults = ng->total_faults; if (!total_faults) return 0; @@ -1419,7 +1409,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { - struct numa_group *ng = p->numa_group; + struct numa_group *ng = deref_curr_numa_group(p); int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; @@ -1600,13 +1590,14 @@ static bool load_too_imbalanced(long src_load, long dst_load, static void task_numa_compare(struct task_numa_env *env, long taskimp, long groupimp, bool maymove) { + struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); struct rq *dst_rq = cpu_rq(env->dst_cpu); + long imp = p_ng ? groupimp : taskimp; struct task_struct *cur; long src_load, dst_load; - long load; - long imp = env->p->numa_group ? groupimp : taskimp; - long moveimp = imp; int dist = env->dist; + long moveimp = imp; + long load; if (READ_ONCE(dst_rq->numa_migrate_on)) return; @@ -1645,21 +1636,22 @@ static void task_numa_compare(struct task_numa_env *env, * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ - if (cur->numa_group == env->p->numa_group) { + cur_ng = rcu_dereference(cur->numa_group); + if (cur_ng == p_ng) { imp = taskimp + task_weight(cur, env->src_nid, dist) - task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. */ - if (cur->numa_group) + if (cur_ng) imp -= imp / 16; } else { /* * Compare the group weights. If a task is all by itself * (not part of a group), use the task weight instead. */ - if (cur->numa_group && env->p->numa_group) + if (cur_ng && p_ng) imp += group_weight(cur, env->src_nid, dist) - group_weight(cur, env->dst_nid, dist); else @@ -1757,11 +1749,12 @@ static int task_numa_migrate(struct task_struct *p) .best_imp = 0, .best_cpu = -1, }; + unsigned long taskweight, groupweight; struct sched_domain *sd; + long taskimp, groupimp; + struct numa_group *ng; struct rq *best_rq; - unsigned long taskweight, groupweight; int nid, ret, dist; - long taskimp, groupimp; /* * Pick the lowest SD_NUMA domain, as that would have the smallest @@ -1807,7 +1800,8 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. 
*/ - if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { + ng = deref_curr_numa_group(p); + if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1840,7 +1834,7 @@ static int task_numa_migrate(struct task_struct *p) * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here. */ - if (p->numa_group) { + if (ng) { if (env.best_cpu == -1) nid = env.src_nid; else @@ -2135,6 +2129,7 @@ static void task_numa_placement(struct task_struct *p) unsigned long total_faults; u64 runtime, period; spinlock_t *group_lock = NULL; + struct numa_group *ng; /* * The p->mm->numa_scan_seq field gets updated without @@ -2152,8 +2147,9 @@ static void task_numa_placement(struct task_struct *p) runtime = numa_get_avg_runtime(p, &period); /* If the task is part of a group prevent parallel updates to group stats */ - if (p->numa_group) { - group_lock = &p->numa_group->lock; + ng = deref_curr_numa_group(p); + if (ng) { + group_lock = &ng->lock; spin_lock_irq(group_lock); } @@ -2194,7 +2190,7 @@ static void task_numa_placement(struct task_struct *p) p->numa_faults[cpu_idx] += f_diff; faults += p->numa_faults[mem_idx]; p->total_numa_faults += diff; - if (p->numa_group) { + if (ng) { /* * safe because we can only change our own group * @@ -2202,14 +2198,14 @@ static void task_numa_placement(struct task_struct *p) * nid and priv in a specific region because it * is at the beginning of the numa_faults array. */ - p->numa_group->faults[mem_idx] += diff; - p->numa_group->faults_cpu[mem_idx] += f_diff; - p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[mem_idx]; + ng->faults[mem_idx] += diff; + ng->faults_cpu[mem_idx] += f_diff; + ng->total_faults += diff; + group_faults += ng->faults[mem_idx]; } } - if (!p->numa_group) { + if (!ng) { if (faults > max_faults) { max_faults = faults; max_nid = nid; @@ -2220,8 +2216,8 @@ static void task_numa_placement(struct task_struct *p) } } - if (p->numa_group) { - numa_group_count_active_nodes(p->numa_group); + if (ng) { + numa_group_count_active_nodes(ng); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_nid); } @@ -2255,7 +2251,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, int cpu = cpupid_to_cpu(cpupid); int i; - if (unlikely(!p->numa_group)) { + if (unlikely(!deref_curr_numa_group(p))) { unsigned int size = sizeof(struct numa_group) + 4*nr_node_ids*sizeof(unsigned long); @@ -2291,7 +2287,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!grp) goto no_join; - my_grp = p->numa_group; + my_grp = deref_curr_numa_group(p); if (grp == my_grp) goto no_join; @@ -2353,13 +2349,24 @@ no_join: return; } -void task_numa_free(struct task_struct *p) +/* + * Get rid of NUMA staticstics associated with a task (either current or dead). + * If @final is set, the task is dead and has reached refcount zero, so we can + * safely free all relevant data structures. Otherwise, there might be + * concurrent reads from places like load balancing and procfs, and we should + * reset the data back to default state without freeing ->numa_faults. 
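task_numa_free() below gains a @final argument for exactly the reason spelled out in this comment: concurrent readers may still look at ->numa_faults, so a non-final call only zeroes the statistics. A self-contained sketch of that free-or-reset shape; task_stats and its fields are invented, and the real function also detaches from the numa_group:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct task_stats {
            unsigned long *faults;   /* per-node fault counters */
            size_t nr;
            unsigned long total;
    };

    static void task_stats_free(struct task_stats *s, bool final)
    {
            unsigned long *f = s->faults;

            if (!f)
                    return;

            if (final) {             /* last reference gone: really release it */
                    s->faults = NULL;
                    free(f);
            } else {                 /* readers possible: reset, do not free */
                    s->total = 0;
                    memset(f, 0, s->nr * sizeof(*f));
            }
    }

    int main(void)
    {
            struct task_stats s = { .nr = 4 };

            s.faults = calloc(s.nr, sizeof(*s.faults));
            if (!s.faults)
                    return 1;
            s.faults[1] = 10;
            s.total = 10;

            task_stats_free(&s, false);
            printf("after reset: total=%lu faults[1]=%lu ptr=%p\n",
                   s.total, s.faults[1], (void *)s.faults);

            task_stats_free(&s, true);
            printf("after final: ptr=%p\n", (void *)s.faults);
            return 0;
    }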
+ */ +void task_numa_free(struct task_struct *p, bool final) { - struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults; + /* safe: p either is current or is being freed by current */ + struct numa_group *grp = rcu_dereference_raw(p->numa_group); + unsigned long *numa_faults = p->numa_faults; unsigned long flags; int i; + if (!numa_faults) + return; + if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) @@ -2372,8 +2379,14 @@ void task_numa_free(struct task_struct *p) put_numa_group(grp); } - p->numa_faults = NULL; - kfree(numa_faults); + if (final) { + p->numa_faults = NULL; + kfree(numa_faults); + } else { + p->total_numa_faults = 0; + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + numa_faults[i] = 0; + } } /* @@ -2426,7 +2439,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - ng = p->numa_group; + ng = deref_curr_numa_group(p); if (!priv && !local && ng && ng->active_nodes > 1 && numa_is_active_node(cpu_node, ng) && numa_is_active_node(mem_node, ng)) @@ -2469,7 +2482,7 @@ static void reset_ptenuma_scan(struct task_struct *p) * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). */ -void task_numa_work(struct callback_head *work) +static void task_numa_work(struct callback_head *work) { unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; @@ -2482,7 +2495,7 @@ void task_numa_work(struct callback_head *work) SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); - work->next = work; /* protect against double add */ + work->next = work; /* * Who cares about NUMA placement when they're dying. * @@ -2611,6 +2624,50 @@ out: } } +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ + int mm_users = 0; + struct mm_struct *mm = p->mm; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) { + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + mm->numa_scan_seq = 0; + } + } + p->node_stamp = 0; + p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + /* Protect against double add, see task_tick_numa and task_numa_work */ + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + RCU_INIT_POINTER(p->numa_group, NULL); + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + + init_task_work(&p->numa_work, task_numa_work); + + /* New address space, reset the preferred nid */ + if (!(clone_flags & CLONE_VM)) { + p->numa_preferred_nid = NUMA_NO_NODE; + return; + } + + /* + * New thread, keep existing numa_preferred_nid which should be copied + * already by arch_dup_task_struct but stagger when scans start. + */ + if (mm) { + unsigned int delay; + + delay = min_t(unsigned int, task_scan_max(current), + current->numa_scan_period * mm_users * NSEC_PER_MSEC); + delay += 2 * TICK_NSEC; + p->node_stamp = delay; + } +} + /* * Drive the periodic memory faults.. 
*/ @@ -2639,10 +2696,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; - if (!time_before(jiffies, curr->mm->numa_next_scan)) { - init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + if (!time_before(jiffies, curr->mm->numa_next_scan)) task_work_add(curr, work, true); - } } } @@ -3635,8 +3690,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq, struct rq_flags *rf); - static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -3753,7 +3806,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) static inline int task_fits_capacity(struct task_struct *p, long capacity) { - return capacity * 1024 > task_util_est(p) * capacity_margin; + return fits_capacity(task_util_est(p), capacity); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -4316,8 +4369,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4339,8 +4390,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; - int expires_seq; + u64 amount = 0, min_amount; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4357,65 +4407,23 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - expires_seq = cfs_b->expires_seq; - expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if (cfs_rq->expires_seq != expires_seq) { - cfs_rq->expires_seq = expires_seq; - cfs_rq->runtime_expires = expires; - } return cfs_rq->runtime_remaining > 0; } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. - * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline(cfs_b->expires_seq) has advanced. 
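With the per-rq expiration tracking above removed, distribute_cfs_runtime() (a little further down) simply hands the remaining pool to throttled groups until it runs out; each group needs -runtime_remaining + 1 to become runnable again. A toy run of that loop with made-up numbers:

    #include <stdio.h>

    /* Throttled groups, each with a negative runtime_remaining (ns). */
    static long long remaining_ns[] = { -200, -50, -400 };

    static unsigned long long distribute(unsigned long long pool)
    {
            unsigned int i;

            for (i = 0; i < sizeof(remaining_ns) / sizeof(remaining_ns[0]) && pool; i++) {
                    /* Just enough to make this group runnable again. */
                    unsigned long long want = (unsigned long long)(-remaining_ns[i]) + 1;
                    unsigned long long give = want < pool ? want : pool;

                    remaining_ns[i] += (long long)give;
                    pool -= give;
            }
            return pool;
    }

    int main(void)
    {
            unsigned long long left = distribute(500);

            printf("left=%llu, groups now at %lld %lld %lld\n",
                   left, remaining_ns[0], remaining_ns[1], remaining_ns[2]);
            return 0;
    }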
- */ - if (cfs_rq->expires_seq == cfs_b->expires_seq) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); if (likely(cfs_rq->runtime_remaining > 0)) return; + if (cfs_rq->throttled) + return; /* * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled @@ -4500,7 +4508,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; + long task_delta, idle_task_delta, dequeue = 1; bool empty; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4511,6 +4519,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ @@ -4520,6 +4529,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (dequeue) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; if (qcfs_rq->load.weight) dequeue = 0; @@ -4559,7 +4569,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; int enqueue = 1; - long task_delta; + long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4579,6 +4589,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) return; task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) enqueue = 0; @@ -4587,6 +4598,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (enqueue) enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; if (cfs_rq_throttled(cfs_rq)) break; @@ -4602,8 +4614,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) { struct cfs_rq *cfs_rq; u64 runtime; @@ -4619,13 +4630,15 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, if (!cfs_rq_throttled(cfs_rq)) goto next; + /* By the above check, this should never be true */ + SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + runtime = -cfs_rq->runtime_remaining + 1; if (runtime > remaining) runtime = remaining; remaining -= runtime; cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) @@ -4650,7 +4663,7 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { - u64 runtime, runtime_expires; + u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -4678,8 +4691,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - runtime_expires = 
cfs_b->runtime_expires; - /* * This check is repeated as we are holding onto the new bandwidth while * we unthrottle. This can potentially race with an unthrottled group @@ -4692,8 +4703,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u cfs_b->distribute_running = 1; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->distribute_running = 0; @@ -4775,8 +4785,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) return; raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; /* we are under rq->lock, defer unthrottling using a timer */ @@ -4809,7 +4818,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); unsigned long flags; - u64 expires; /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); @@ -4827,7 +4835,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - expires = cfs_b->runtime_expires; if (runtime) cfs_b->distribute_running = 1; @@ -4836,11 +4843,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock_irqsave(&cfs_b->lock, flags); - if (expires == cfs_b->runtime_expires) - lsub_positive(&cfs_b->runtime, runtime); + lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -4997,8 +5003,6 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_active = 1; overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); - cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } @@ -5176,7 +5180,7 @@ static inline unsigned long cpu_util(int cpu); static inline bool cpu_overutilized(int cpu) { - return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); + return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); } static inline void update_overutilized_status(struct rq *rq) @@ -5200,6 +5204,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int idle_h_nr_running = task_has_idle_policy(p); /* * The code below (indirectly) updates schedutil which looks at @@ -5232,6 +5237,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; flags = ENQUEUE_WAKEUP; } @@ -5239,6 +5245,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; if (cfs_rq_throttled(cfs_rq)) break; @@ -5300,6 +5307,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + int 
idle_h_nr_running = task_has_idle_policy(p); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5314,6 +5322,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -5333,6 +5342,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; if (cfs_rq_throttled(cfs_rq)) break; @@ -5366,6 +5376,15 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ +/* CPU only has SCHED_IDLE tasks enqueued */ +static int sched_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + rq->nr_running); +} + static unsigned long cpu_runnable_load(struct rq *rq) { return cfs_rq_runnable_load_avg(&rq->cfs); @@ -5688,7 +5707,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this unsigned int min_exit_latency = UINT_MAX; u64 latest_idle_timestamp = 0; int least_loaded_cpu = this_cpu; - int shallowest_idle_cpu = -1; + int shallowest_idle_cpu = -1, si_cpu = -1; int i; /* Check if we have any choice: */ @@ -5719,7 +5738,12 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } - } else if (shallowest_idle_cpu == -1) { + } else if (shallowest_idle_cpu == -1 && si_cpu == -1) { + if (sched_idle_cpu(i)) { + si_cpu = i; + continue; + } + load = cpu_runnable_load(cpu_rq(i)); if (load < min_load) { min_load = load; @@ -5728,7 +5752,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } - return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : least_loaded_cpu; + if (shallowest_idle_cpu != -1) + return shallowest_idle_cpu; + if (si_cpu != -1) + return si_cpu; + return least_loaded_cpu; } static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, @@ -5881,7 +5909,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int */ static int select_idle_smt(struct task_struct *p, int target) { - int cpu; + int cpu, si_cpu = -1; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5891,9 +5919,11 @@ static int select_idle_smt(struct task_struct *p, int target) continue; if (available_idle_cpu(cpu)) return cpu; + if (si_cpu == -1 && sched_idle_cpu(cpu)) + si_cpu = cpu; } - return -1; + return si_cpu; } #else /* CONFIG_SCHED_SMT */ @@ -5921,8 +5951,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 avg_cost, avg_idle; u64 time, cost; s64 delta; - int cpu, nr = INT_MAX; int this = smp_processor_id(); + int cpu, nr = INT_MAX, si_cpu = -1; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -5950,11 +5980,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { if (!--nr) - return -1; + return si_cpu; if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; if (available_idle_cpu(cpu)) break; + if (si_cpu == -1 && sched_idle_cpu(cpu)) + si_cpu = cpu; } time = cpu_clock(this) - time; @@ -5973,13 +6005,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) struct sched_domain *sd; int i, recent_used_cpu; - if (available_idle_cpu(target)) + if (available_idle_cpu(target) || sched_idle_cpu(target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev)) + if (prev != target && cpus_share_cache(prev, target) && + (available_idle_cpu(prev) || sched_idle_cpu(prev))) return prev; /* Check a recently used CPU as a potential idle candidate: */ @@ -5987,7 +6020,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - available_idle_cpu(recent_used_cpu) && + (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { /* * Replace recent_used_cpu with prev as it is a potential @@ -6223,69 +6256,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) } /* - * compute_energy(): Estimates the energy that would be consumed if @p was + * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of the * CPUs after the task migration, and uses the Energy Model + * landscape of @pd's CPUs after the task migration, and uses the Energy Model * to compute what would be the energy if we decided to actually migrate that * task. 
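The si_cpu fallbacks added in the hunks above give CPUs that run only SCHED_IDLE tasks a priority between a fully idle CPU and the least loaded one. A toy model of that preference order; cpu_ex and its fields are invented:

    #include <stdio.h>
    #include <stdbool.h>

    struct cpu_ex { bool idle; bool only_sched_idle; unsigned long load; };

    /* Prefer idle, then SCHED_IDLE-only, then least loaded. */
    static int pick_cpu(const struct cpu_ex *cpus, int n)
    {
            int si_cpu = -1, least = 0;
            int i;

            for (i = 0; i < n; i++) {
                    if (cpus[i].idle)
                            return i;
                    if (si_cpu == -1 && cpus[i].only_sched_idle)
                            si_cpu = i;
                    if (cpus[i].load < cpus[least].load)
                            least = i;
            }
            return si_cpu != -1 ? si_cpu : least;
    }

    int main(void)
    {
            struct cpu_ex cpus[] = {
                    { false, false, 700 },
                    { false, true,  300 },   /* runs only SCHED_IDLE work */
                    { false, false, 200 },
            };

            printf("picked CPU %d\n", pick_cpu(cpus, 3));   /* prints 1 */
            return 0;
    }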
*/ static long compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) { - unsigned int max_util, util_cfs, cpu_util, cpu_cap; - unsigned long sum_util, energy = 0; - struct task_struct *tsk; + struct cpumask *pd_mask = perf_domain_span(pd); + unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); + unsigned long max_util = 0, sum_util = 0; int cpu; - for (; pd; pd = pd->next) { - struct cpumask *pd_mask = perf_domain_span(pd); + /* + * The capacity state of CPUs of the current rd can be driven by CPUs + * of another rd if they belong to the same pd. So, account for the + * utilization of these CPUs too by masking pd with cpu_online_mask + * instead of the rd span. + * + * If an entire pd is outside of the current rd, it will not appear in + * its pd list and will not be accounted by compute_energy(). + */ + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); + struct task_struct *tsk = cpu == dst_cpu ? p : NULL; /* - * The energy model mandates all the CPUs of a performance - * domain have the same capacity. + * Busy time computation: utilization clamping is not + * required since the ratio (sum_util / cpu_capacity) + * is already enough to scale the EM reported power + * consumption at the (eventually clamped) cpu_capacity. */ - cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); - max_util = sum_util = 0; + sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + ENERGY_UTIL, NULL); /* - * The capacity state of CPUs of the current rd can be driven by - * CPUs of another rd if they belong to the same performance - * domain. So, account for the utilization of these CPUs too - * by masking pd with cpu_online_mask instead of the rd span. - * - * If an entire performance domain is outside of the current rd, - * it will not appear in its pd list and will not be accounted - * by compute_energy(). + * Performance domain frequency: utilization clamping + * must be considered since it affects the selection + * of the performance domain frequency. + * NOTE: in case RT tasks are running, by default the + * FREQUENCY_UTIL's utilization can be max OPP. */ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - util_cfs = cpu_util_next(cpu, p, dst_cpu); - - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, - ENERGY_UTIL, NULL); - - /* - * Performance domain frequency: utilization clamping - * must be considered since it affects the selection - * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. - */ - tsk = cpu == dst_cpu ? p : NULL; - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, - FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); - } - - energy += em_pd_energy(pd->em_pd, max_util, sum_util); + cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + FREQUENCY_UTIL, tsk); + max_util = max(max_util, cpu_util); } - return energy; + return em_pd_energy(pd->em_pd, max_util, sum_util); } /* @@ -6327,21 +6346,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * other use-cases too. So, until someone finds a better way to solve this, * let's keep things simple by re-using the existing slow path. 
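A rough model of what compute_energy() above feeds into: choose the lowest performance state whose capacity covers max_util, scale its cost by the summed utilization, and let find_energy_efficient_cpu() below compare such estimates as deltas against a base computed without the task. The ps table, SCALE_CPU and all numbers are invented, and the real em_pd_energy() is more involved:

    #include <stdio.h>

    #define SCALE_CPU 1024   /* assumed capacity of the biggest CPU */

    /* Invented performance states: (capacity, cost), lowest first. */
    struct ps { unsigned long cap, cost; };
    static const struct ps table[] = { { 256, 40 }, { 512, 100 }, { 1024, 280 } };

    static unsigned long pd_energy(unsigned long max_util, unsigned long sum_util)
    {
            const struct ps *p = &table[0];
            unsigned int i;

            for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
                    p = &table[i];
                    if (p->cap >= max_util)
                            break;
            }
            return p->cost * sum_util / SCALE_CPU;
    }

    int main(void)
    {
            /* Domain energy without the task, then with it placed on one CPU. */
            unsigned long base = pd_energy(300, 450);
            unsigned long with = pd_energy(400, 550);

            printf("base=%lu with_task=%lu delta=%lu\n", base, with, with - base);
            return 0;
    }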
*/ - static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { - unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; + unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + unsigned long cpu_cap, util, base_energy = 0; int cpu, best_energy_cpu = prev_cpu; - struct perf_domain *head, *pd; - unsigned long cpu_cap, util; struct sched_domain *sd; + struct perf_domain *pd; rcu_read_lock(); pd = rcu_dereference(rd->pd); if (!pd || READ_ONCE(rd->overutilized)) goto fail; - head = pd; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -6358,9 +6375,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) goto unlock; for (; pd; pd = pd->next) { - unsigned long cur_energy, spare_cap, max_spare_cap = 0; + unsigned long cur_delta, spare_cap, max_spare_cap = 0; + unsigned long base_energy_pd; int max_spare_cap_cpu = -1; + /* Compute the 'base' energy of the pd, without @p */ + base_energy_pd = compute_energy(p, -1, pd); + base_energy += base_energy_pd; + for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; @@ -6368,14 +6390,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) /* Skip CPUs that will be overutilized. */ util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); - if (cpu_cap * 1024 < util * capacity_margin) + if (!fits_capacity(util, cpu_cap)) continue; /* Always use prev_cpu as a candidate. */ if (cpu == prev_cpu) { - prev_energy = compute_energy(p, prev_cpu, head); - best_energy = min(best_energy, prev_energy); - continue; + prev_delta = compute_energy(p, prev_cpu, pd); + prev_delta -= base_energy_pd; + best_delta = min(best_delta, prev_delta); } /* @@ -6391,9 +6413,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) /* Evaluate the energy impact of using this CPU. */ if (max_spare_cap_cpu >= 0) { - cur_energy = compute_energy(p, max_spare_cap_cpu, head); - if (cur_energy < best_energy) { - best_energy = cur_energy; + cur_delta = compute_energy(p, max_spare_cap_cpu, pd); + cur_delta -= base_energy_pd; + if (cur_delta < best_delta) { + best_delta = cur_delta; best_energy_cpu = max_spare_cap_cpu; } } @@ -6405,10 +6428,10 @@ unlock: * Pick the best CPU if prev_cpu cannot be used, or if it saves at * least 6% of the energy used by prev_cpu. */ - if (prev_energy == ULONG_MAX) + if (prev_delta == ULONG_MAX) return best_energy_cpu; - if ((prev_energy - best_energy) > (prev_energy >> 4)) + if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) return best_energy_cpu; return prev_cpu; @@ -6742,7 +6765,7 @@ again: goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class != &fair_sched_class) + if (!prev || prev->sched_class != &fair_sched_class) goto simple; /* @@ -6819,8 +6842,8 @@ again: goto done; simple: #endif - - put_prev_task(rq, prev); + if (prev) + put_prev_task(rq, prev); do { se = pick_next_entity(cfs_rq, NULL); @@ -6848,11 +6871,13 @@ done: __maybe_unused; return p; idle: - update_misfit_status(NULL, rq); - new_tasks = idle_balance(rq, rf); + if (!rf) + return NULL; + + new_tasks = newidle_balance(rq, rf); /* - * Because idle_balance() releases (and re-acquires) rq->lock, it is + * Because newidle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. 
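
find_energy_efficient_cpu() above switches from absolute energy estimates to deltas against a per-domain base_energy computed without the task, and the final check keeps prev_cpu unless the best candidate saves more than roughly 1/16 (about 6%) of the total estimate. The self-contained restatement of that final decision below uses made-up numbers purely for illustration.

#include <limits.h>
#include <stdio.h>

/* Same decision as the tail of find_energy_efficient_cpu() above. */
static int pick_target(unsigned long prev_delta, unsigned long best_delta,
		       unsigned long base_energy, int prev_cpu, int best_cpu)
{
	if (prev_delta == ULONG_MAX)	/* prev_cpu was never evaluated */
		return best_cpu;
	/* keep prev_cpu unless the saving exceeds ~6% of the total */
	if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
		return best_cpu;
	return prev_cpu;
}

int main(void)
{
	/* saving of 300 out of ~4600 (> 6%): migrate to CPU 5 */
	printf("-> CPU %d\n", pick_target(1000, 700, 3600, 1, 5));
	/* saving of 40 out of the same total (< 6%): stay on CPU 1 */
	printf("-> CPU %d\n", pick_target(1000, 960, 3600, 1, 5));
	return 0;
}
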
*/ @@ -6874,7 +6899,7 @@ idle: /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; @@ -7376,7 +7401,7 @@ static int detach_tasks(struct lb_env *env) detached++; env->imbalance -= load; -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is detached to minimize @@ -7923,8 +7948,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) static inline bool group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { - return sg->sgc->min_capacity * capacity_margin < - ref->sgc->min_capacity * 1024; + return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity); } /* @@ -7934,8 +7958,7 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) static inline bool group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { - return sg->sgc->max_capacity * capacity_margin < - ref->sgc->max_capacity * 1024; + return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity); } static inline enum @@ -8993,9 +9016,10 @@ more_balance: out_balanced: /* * We reach balance although we may have faced some affinity - * constraints. Clear the imbalance flag if it was set. + * constraints. Clear the imbalance flag only if other tasks got + * a chance to move and fix the imbalance. */ - if (sd_parent) { + if (sd_parent && !(env.flags & LBF_ALL_PINNED)) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; if (*group_imbalance) @@ -9016,10 +9040,10 @@ out_one_pinned: ld_moved = 0; /* - * idle_balance() disregards balance intervals, so we could repeatedly - * reach this code, which would lead to balance_interval skyrocketting - * in a short amount of time. Skip the balance_interval increase logic - * to avoid that. + * newidle_balance() disregards balance intervals, so we could + * repeatedly reach this code, which would lead to balance_interval + * skyrocketting in a short amount of time. Skip the balance_interval + * increase logic to avoid that. */ if (env.idle == CPU_NEWLY_IDLE) goto out; @@ -9729,7 +9753,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static int idle_balance(struct rq *this_rq, struct rq_flags *rf) +int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -9737,6 +9761,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) int pulled_task = 0; u64 curr_cost = 0; + update_misfit_status(NULL, this_rq); /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -10121,9 +10146,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. 
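
Several hunks above replace the open-coded capacity_margin comparisons with fits_capacity(). Assuming the usual kernel definition of that helper, cap * 1280 < max * 1024, the test simply asks whether a value stays below roughly 80% of a capacity. A tiny standalone illustration under that assumption:

#include <stdbool.h>
#include <stdio.h>

/* assumed definition; ~20% headroom margin */
static bool fits_capacity(unsigned long cap, unsigned long max)
{
	return cap * 1280 < max * 1024;
}

int main(void)
{
	printf("%d\n", fits_capacity(900, 1024));	/* 0: 900 is above ~80% of 1024 */
	printf("%d\n", fits_capacity(400, 1024));	/* 1: plenty of headroom */
	return 0;
}
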
*/ -static void set_curr_task_fair(struct rq *rq) +static void set_next_task_fair(struct rq *rq, struct task_struct *p) { - struct sched_entity *se = &rq->curr->se; + struct sched_entity *se = &p->se; + +#ifdef CONFIG_SMP + if (task_on_rq_queued(p)) { + /* + * Move the next running task to the front of the list, so our + * cfs_tasks list becomes MRU one. + */ + list_move(&se->group_node, &rq->cfs_tasks); + } +#endif for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -10241,18 +10276,18 @@ err: void online_fair_sched_group(struct task_group *tg) { struct sched_entity *se; + struct rq_flags rf; struct rq *rq; int i; for_each_possible_cpu(i) { rq = cpu_rq(i); se = tg->se[i]; - - raw_spin_lock_irq(&rq->lock); + rq_lock_irq(rq, &rf); update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } } @@ -10394,7 +10429,9 @@ const struct sched_class fair_sched_class = { .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, @@ -10407,7 +10444,6 @@ const struct sched_class fair_sched_class = { .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_fork = task_fork_fair, @@ -10444,18 +10480,22 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) { int node; unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; + struct numa_group *ng; + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; } - if (p->numa_group) { - gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)], - gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)]; + if (ng) { + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], + gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; } print_numa_stats(m, node, tsf, tpf, gsf, gpf); } + rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80940939b733..8bfeb6395bdd 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -241,13 +241,14 @@ static void do_idle(void) check_pgt_cache(); rmb(); + local_irq_disable(); + if (cpu_is_offline(cpu)) { - tick_nohz_idle_stop_tick_protected(); + tick_nohz_idle_stop_tick(); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } - local_irq_disable(); arch_cpu_idle_enter(); /* @@ -374,14 +375,27 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } -static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ +} + +static void set_next_task_idle(struct rq *rq, struct task_struct *next) { - put_prev_task(rq, prev); update_idle_core(rq); schedstat_inc(rq->sched_goidle); +} + +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + struct task_struct *next = rq->idle; - return rq->idle; + if (prev) + put_prev_task(rq, prev); + + set_next_task_idle(rq, next); + + return next; } /* @@ -397,10 +411,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) raw_spin_lock_irq(&rq->lock); } 
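
housekeeping_any_cpu() above now prefers a housekeeping CPU that is NUMA-close to the caller, then falls back to any online housekeeping CPU, and finally to the current CPU. The userspace model below mirrors that order with a made-up topology; node_of[], housekeeping[] and online[] stand in for the real masks and sched_numa_find_closest().

#include <stdio.h>

#define NR_CPUS 4

static const int node_of[NR_CPUS]      = { 0, 0, 1, 1 };
static const int housekeeping[NR_CPUS] = { 0, 0, 1, 1 };	/* CPUs 2,3 do housekeeping */
static const int online[NR_CPUS]       = { 1, 1, 1, 0 };

static int any_housekeeping_cpu(int this_cpu)
{
	int cpu, fallback = -1;

	/* "closest" here: prefer a housekeeping CPU on the same node */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!housekeeping[cpu] || !online[cpu])
			continue;
		if (node_of[cpu] == node_of[this_cpu])
			return cpu;
		if (fallback < 0)
			fallback = cpu;
	}
	return fallback >= 0 ? fallback : this_cpu;
}

int main(void)
{
	printf("CPU 0 -> housekeeping CPU %d\n", any_housekeeping_cpu(0));	/* 2 */
	return 0;
}
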
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) -{ -} - /* * scheduler tick hitting a task of our scheduling class. * @@ -413,10 +423,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } -static void set_curr_task_idle(struct rq *rq) -{ -} - static void switched_to_idle(struct rq *rq, struct task_struct *p) { BUG(); @@ -451,13 +457,13 @@ const struct sched_class idle_sched_class = { .pick_next_task = pick_next_task_idle, .put_prev_task = put_prev_task_idle, + .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, .get_rr_interval = get_rr_interval_idle, diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index ccb28085b114..9fcb2a695a41 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -22,9 +22,17 @@ EXPORT_SYMBOL_GPL(housekeeping_enabled); int housekeeping_any_cpu(enum hk_flags flags) { - if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) + int cpu; + + if (static_branch_unlikely(&housekeeping_overridden)) { + if (housekeeping_flags & flags) { + cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id()); + if (cpu < nr_cpu_ids) + return cpu; + return cpumask_any_and(housekeeping_mask, cpu_online_mask); + } + } return smp_processor_id(); } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7acc632c3b82..517e3719027e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1051,7 +1051,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, if (!rcu_access_pointer(group->poll_kworker)) { struct sched_param param = { - .sched_priority = MAX_RT_PRIO - 1, + .sched_priority = 1, }; struct kthread_worker *kworker; @@ -1061,7 +1061,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, mutex_unlock(&group->trigger_lock); return ERR_CAST(kworker); } - sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); kthread_init_delayed_work(&group->poll_work, psi_poll_work); rcu_assign_pointer(group->poll_kworker, kworker); @@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref) * deadlock while waiting for psi_poll_work to acquire trigger_lock */ if (kworker_to_destroy) { + /* + * After the RCU grace period has expired, the worker + * can no longer be found through group->poll_kworker. + * But it might have been already scheduled before + * that - deschedule it cleanly before destroying it. 
+ */ kthread_cancel_delayed_work_sync(&group->poll_work); + atomic_set(&group->poll_scheduled, 0); + kthread_destroy_worker(kworker_to_destroy); } kfree(t); @@ -1190,7 +1198,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - buf_size = min(nbytes, (sizeof(buf) - 1)); + buf_size = min(nbytes, sizeof(buf)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a532558a5176..858c4cc6f99b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1498,12 +1498,22 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p) { p->se.exec_start = rq_clock_task(rq); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); + + /* + * If prev task was rt, put_prev_task() has already updated the + * utilization. We only care of the case where we start to schedule a + * rt task + */ + if (rq->curr->sched_class != &rt_sched_class) + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + rt_queue_push_tasks(rq); } static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, @@ -1543,56 +1553,19 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; - if (need_pull_rt_task(rq, prev)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we're - * being very careful to re-start the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_rt_task(rq); - rq_repin_lock(rq, rf); - /* - * pull_rt_task() can drop (and re-acquire) rq->lock; this - * means a dl or stop task can slip in, in which case we need - * to re-start task selection. - */ - if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || - rq->dl.dl_nr_running)) - return RETRY_TASK; - } - - /* - * We may dequeue prev's rt_rq in put_prev_task(). - * So, we update time before rt_queued check. - */ - if (prev->sched_class == &rt_sched_class) - update_curr_rt(rq); + WARN_ON_ONCE(prev || rf); if (!rt_rq->rt_queued) return NULL; - put_prev_task(rq, prev); - p = _pick_next_task_rt(rq); - set_next_task(rq, p); - - rt_queue_push_tasks(rq); - - /* - * If prev task was rt, put_prev_task() has already updated the - * utilization. We only care of the case where we start to schedule a - * rt task - */ - if (rq->curr->sched_class != &rt_sched_class) - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + set_next_task_rt(rq, p); return p; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) { update_curr_rt(rq); @@ -1604,6 +1577,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) */ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + + if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. 
+ */ + rq_unpin_lock(rq, rf); + pull_rt_task(rq); + rq_repin_lock(rq, rf); + } } #ifdef CONFIG_SMP @@ -2354,11 +2339,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) } } -static void set_curr_task_rt(struct rq *rq) -{ - set_next_task(rq, rq->curr); -} - static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* @@ -2380,6 +2360,7 @@ const struct sched_class rt_sched_class = { .pick_next_task = pick_next_task_rt, .put_prev_task = put_prev_task_rt, + .set_next_task = set_next_task_rt, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, @@ -2391,7 +2372,6 @@ const struct sched_class rt_sched_class = { .switched_from = switched_from_rt, #endif - .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, .get_rr_interval = get_rr_interval_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 802b1f3405f2..b3cb895d14a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -335,8 +335,6 @@ struct cfs_bandwidth { u64 quota; u64 runtime; s64 hierarchical_quota; - u64 runtime_expires; - int expires_seq; u8 idle; u8 period_active; @@ -393,6 +391,16 @@ struct task_group { #endif struct cfs_bandwidth cfs_bandwidth; + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* The two decimal precision [%] value requested from user-space */ + unsigned int uclamp_pct[UCLAMP_CNT]; + /* Clamp values requested for a task group */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* Effective clamp values used for a task group */ + struct uclamp_se uclamp[UCLAMP_CNT]; +#endif + }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -483,7 +491,8 @@ struct cfs_rq { struct load_weight load; unsigned long runnable_weight; unsigned int nr_running; - unsigned int h_nr_running; + unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int idle_h_nr_running; /* SCHED_IDLE */ u64 exec_clock; u64 min_vruntime; @@ -556,8 +565,6 @@ struct cfs_rq { #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; - int expires_seq; - u64 runtime_expires; s64 runtime_remaining; u64 throttled_clock; @@ -777,9 +784,6 @@ struct root_domain { struct perf_domain __rcu *pd; }; -extern struct root_domain def_root_domain; -extern struct mutex sched_domains_mutex; - extern void init_defrootdomain(void); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); @@ -1261,16 +1265,18 @@ enum numa_topology_type { extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); -#endif - -#ifdef CONFIG_NUMA extern void sched_init_numa(void); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); +extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); #else static inline void sched_init_numa(void) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ + return nr_cpu_ids; +} #endif #ifdef CONFIG_NUMA_BALANCING @@ -1449,10 +1455,14 @@ static inline void unregister_sched_domain_sysctl(void) } #endif +extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + #else static inline void sched_ttwu_pending(void) { } +static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } + #endif /* CONFIG_SMP */ #include "stats.h" @@ -1700,17 +1710,21 @@ struct sched_class { void 
(*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); /* - * It is the responsibility of the pick_next_task() method that will - * return the next task to call put_prev_task() on the @prev task or - * something equivalent. + * Both @prev and @rf are optional and may be NULL, in which case the + * caller must already have invoked put_prev_task(rq, prev, rf). + * + * Otherwise it is the responsibility of the pick_next_task() to call + * put_prev_task() on the @prev task or something equivalent, IFF it + * returns a next task. * - * May return RETRY_TASK when it finds a higher prio class has runnable - * tasks. + * In that case (@rf != NULL) it may return RETRY_TASK when it finds a + * higher prio class has runnable tasks. */ struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf); + void (*set_next_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); @@ -1725,7 +1739,6 @@ struct sched_class { void (*rq_offline)(struct rq *rq); #endif - void (*set_curr_task)(struct rq *rq); void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); void (*task_dead)(struct task_struct *p); @@ -1755,12 +1768,14 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { - prev->sched_class->put_prev_task(rq, prev); + WARN_ON_ONCE(rq->curr != prev); + prev->sched_class->put_prev_task(rq, prev, NULL); } -static inline void set_curr_task(struct rq *rq, struct task_struct *curr) +static inline void set_next_task(struct rq *rq, struct task_struct *next) { - curr->sched_class->set_curr_task(rq); + WARN_ON_ONCE(rq->curr != next); + next->sched_class->set_next_task(rq, next); } #ifdef CONFIG_SMP @@ -1943,7 +1958,7 @@ unsigned long arch_scale_freq_capacity(int cpu) #endif #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -1995,7 +2010,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
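
The sched.h comment above spells out the new contract: when pick_next_task() is called with @prev and @rf both NULL, the core has already done put_prev_task(), and the old set_curr_task() hook is replaced by set_next_task(). The toy model below only illustrates the caller-side ordering and the rq->curr sanity check; the types and names are simplified stand-ins, not the kernel API, and in the real kernel rq->curr is updated by the context switch rather than by the hook.

#include <assert.h>
#include <stdio.h>

struct task { const char *name; };
struct rq   { struct task *curr; };

static void put_prev_task(struct rq *rq, struct task *prev)
{
	assert(rq->curr == prev);	/* mirrors the WARN_ON_ONCE() above */
	printf("put_prev: %s\n", prev->name);
}

static void set_next_task(struct rq *rq, struct task *next)
{
	rq->curr = next;		/* toy-only; see lead-in note */
	printf("set_next: %s\n", next->name);
}

int main(void)
{
	struct task a = { "prev" }, b = { "next" };
	struct rq rq = { &a };

	put_prev_task(&rq, &a);		/* caller's duty before picking ... */
	set_next_task(&rq, &b);		/* ... and after choosing the next task */
	return 0;
}
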
@@ -2266,7 +2281,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_UCLAMP_TASK -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index aa0de240fb41..ba683fe81a6e 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -157,9 +157,10 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) { unsigned long long now = rq_clock(rq), delta = 0; - if (unlikely(sched_info_on())) + if (sched_info_on()) { if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; + } sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; @@ -192,7 +193,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) */ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) { - if (unlikely(sched_info_on())) { + if (sched_info_on()) { if (!t->sched_info.last_queued) t->sched_info.last_queued = rq_clock(rq); } @@ -239,7 +240,7 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct static inline void sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - if (unlikely(sched_info_on())) + if (sched_info_on()) __sched_info_switch(rq, prev, next); } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index c183b790ca54..7e1cee4e65b2 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,17 +23,22 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } +static void set_next_task_stop(struct rq *rq, struct task_struct *stop) +{ + stop->se.exec_start = rq_clock_task(rq); +} + static struct task_struct * pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *stop = rq->stop; + WARN_ON_ONCE(prev || rf); + if (!stop || !task_on_rq_queued(stop)) return NULL; - put_prev_task(rq, prev); - - stop->se.exec_start = rq_clock_task(rq); + set_next_task_stop(rq, stop); return stop; } @@ -55,7 +60,7 @@ static void yield_task_stop(struct rq *rq) BUG(); /* the stop task should never yield, its pointless. 
*/ } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *curr = rq->curr; u64 delta_exec; @@ -86,13 +91,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -static void set_curr_task_stop(struct rq *rq) -{ - struct task_struct *stop = rq->stop; - - stop->se.exec_start = rq_clock_task(rq); -} - static void switched_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ @@ -128,13 +126,13 @@ const struct sched_class stop_sched_class = { .pick_next_task = pick_next_task_stop, .put_prev_task = put_prev_task_stop, + .set_next_task = set_next_task_stop, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_stop, .task_tick = task_tick_stop, .get_rr_interval = get_rr_interval_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f751ce0b783e..b5667a273bf6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1284,6 +1284,7 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; +int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; #endif /* @@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; - if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) { sd->flags &= ~(SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE); @@ -1724,6 +1725,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu) } } +/* + * sched_numa_find_closest() - given the NUMA topology, find the cpu + * closest to @cpu from @cpumask. + * cpumask: cpumask to find a cpu from + * cpu: cpu to be close to + * + * returns: cpu, or nr_cpu_ids when nothing found. + */ +int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ + int i, j = cpu_to_node(cpu); + + for (i = 0; i < sched_domains_numa_levels; i++) { + cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]); + if (cpu < nr_cpu_ids) + return cpu; + } + return nr_cpu_ids; +} + #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -2149,16 +2170,16 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. 
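
sched_numa_find_closest() above walks the per-level NUMA masks from nearest to farthest and returns the first CPU present in both the level mask and the given cpumask, or nr_cpu_ids when nothing matches. A compact userspace model with an invented two-level topology; the arrays stand in for sched_domains_numa_masks and the caller's cpumask.

#include <stdio.h>

#define NR_CPUS   4
#define NR_LEVELS 2

/* level_mask[level][node][cpu]: CPUs within 'level' hops of 'node' */
static const int level_mask[NR_LEVELS][2][NR_CPUS] = {
	{ { 1, 1, 0, 0 }, { 0, 0, 1, 1 } },	/* level 0: same node     */
	{ { 1, 1, 1, 1 }, { 1, 1, 1, 1 } },	/* level 1: whole machine */
};

static int numa_find_closest(const int *allowed, int node)
{
	int level, cpu;

	for (level = 0; level < NR_LEVELS; level++)
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			if (allowed[cpu] && level_mask[level][node][cpu])
				return cpu;
	return NR_CPUS;			/* nothing found */
}

int main(void)
{
	int allowed[NR_CPUS] = { 0, 0, 1, 0 };	/* only CPU 2 allowed */

	printf("closest to node 0: CPU %d\n", numa_find_closest(allowed, 0));	/* 2 */
	return 0;
}
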
* - * Call with hotplug lock held + * Call with hotplug lock and sched_domains_mutex held */ -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) +void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) { bool __maybe_unused has_eas = false; int i, j, n; int new_topology; - mutex_lock(&sched_domains_mutex); + lockdep_assert_held(&sched_domains_mutex); /* Always unregister in case we don't destroy any domains: */ unregister_sched_domain_sysctl(); @@ -2183,8 +2204,19 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_cur[i], doms_new[j]) && - dattrs_equal(dattr_cur, i, dattr_new, j)) + dattrs_equal(dattr_cur, i, dattr_new, j)) { + struct root_domain *rd; + + /* + * This domain won't be destroyed and as such + * its dl_bw->total_bw needs to be cleared. It + * will be recomputed in function + * update_tasks_root_domain(). + */ + rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; + dl_clear_root_domain(rd); goto match1; + } } /* No match - a current sched domain not in new doms_new[] */ detach_destroy_domains(doms_cur[i]); @@ -2241,6 +2273,15 @@ match3: ndoms_cur = ndoms_new; register_sched_domain_sysctl(); +} +/* + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); mutex_unlock(&sched_domains_mutex); } diff --git a/kernel/signal.c b/kernel/signal.c index 91b789dd6e72..c4da1ef56fdf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -90,6 +90,11 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force) handler == SIG_DFL && !(force && sig_kernel_only(sig))) return true; + /* Only allow kernel generated signals to this kthread */ + if (unlikely((t->flags & PF_KTHREAD) && + (handler == SIG_KTHREAD_KERNEL) && !force)) + return true; + return sig_handler_ignored(handler, sig); } @@ -349,7 +354,7 @@ void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask) * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. * Group stop states are cleared and the group stop count is consumed if * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group - * stop, the appropriate %SIGNAL_* flags are set. + * stop, the appropriate `SIGNAL_*` flags are set. * * CONTEXT: * Must be called with @task->sighand->siglock held. 
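
The topology.c change above follows the common _locked/unlocked split: partition_sched_domains_locked() asserts that sched_domains_mutex is held, while partition_sched_domains() keeps its old behaviour by taking the mutex around a call to the locked variant. A generic pthreads sketch of the pattern, with stand-in names rather than the kernel interfaces:

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t domains_mutex = PTHREAD_MUTEX_INITIALIZER;
static int domains_mutex_held;

static void repartition_locked(int ndoms)
{
	assert(domains_mutex_held);	/* mirrors lockdep_assert_held() */
	printf("rebuilding %d domains\n", ndoms);
}

static void repartition(int ndoms)
{
	pthread_mutex_lock(&domains_mutex);
	domains_mutex_held = 1;
	repartition_locked(ndoms);
	domains_mutex_held = 0;
	pthread_mutex_unlock(&domains_mutex);
}

int main(void)
{
	repartition(2);
	return 0;
}
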
@@ -1885,6 +1890,7 @@ static void do_notify_pidfd(struct task_struct *task) { struct pid *pid; + WARN_ON(task->exit_state == 0); pid = task_pid(task); wake_up_all(&pid->wait_pidfd); } @@ -3672,8 +3678,11 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) static struct pid *pidfd_to_pid(const struct file *file) { - if (file->f_op == &pidfd_fops) - return file->private_data; + struct pid *pid; + + pid = pidfd_pid(file); + if (!IS_ERR(pid)) + return pid; return tgid_pidfd_to_pid(file); } diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f5440abb7532..6d1f68b7e528 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -20,7 +20,7 @@ * @nr_entries: Number of entries in the storage array * @spaces: Number of leading spaces to print */ -void stack_trace_print(unsigned long *entries, unsigned int nr_entries, +void stack_trace_print(const unsigned long *entries, unsigned int nr_entries, int spaces) { unsigned int i; @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(stack_trace_print); * * Return: Number of bytes printed. */ -int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, +int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries, unsigned int nr_entries, int spaces) { unsigned int generated, i, total = 0; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b4f83f7bdf86..c7031a22aa7b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -383,6 +383,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, */ preempt_disable(); stop_cpus_in_progress = true; + barrier(); for_each_cpu(cpu, cpumask) { work = &per_cpu(cpu_stopper.stop_work, cpu); work->fn = fn; @@ -391,6 +392,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, if (cpu_stop_queue_work(cpu, work)) queued = true; } + barrier(); stop_cpus_in_progress = false; preempt_enable(); diff --git a/kernel/sys.c b/kernel/sys.c index 2969304c29fe..d605fe5e58a5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -103,12 +103,6 @@ #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif -#ifndef MPX_ENABLE_MANAGEMENT -# define MPX_ENABLE_MANAGEMENT() (-EINVAL) -#endif -#ifndef MPX_DISABLE_MANAGEMENT -# define MPX_DISABLE_MANAGEMENT() (-EINVAL) -#endif #ifndef GET_FP_MODE # define GET_FP_MODE(a) (-EINVAL) #endif @@ -124,6 +118,12 @@ #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif +#ifndef SET_TAGGED_ADDR_CTRL +# define SET_TAGGED_ADDR_CTRL(a) (-EINVAL) +#endif +#ifndef GET_TAGGED_ADDR_CTRL +# define GET_TAGGED_ADDR_CTRL() (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2456,15 +2456,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = MPX_ENABLE_MANAGEMENT(); - break; case PR_MPX_DISABLE_MANAGEMENT: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = MPX_DISABLE_MANAGEMENT(); - break; + /* No longer implemented: */ + return -EINVAL; case PR_SET_FP_MODE: error = SET_FP_MODE(me, arg2); break; @@ -2492,6 +2486,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = PAC_RESET_KEYS(me, arg2); break; + case PR_SET_TAGGED_ADDR_CTRL: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = SET_TAGGED_ADDR_CTRL(arg2); + break; + case PR_GET_TAGGED_ADDR_CTRL: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = GET_TAGGED_ADDR_CTRL(); + 
break; default: error = -EINVAL; break; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 57518efc3810..b7d75a9e8ccf 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -672,7 +672,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) enum alarmtimer_type type; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!capable(CAP_WAKE_ALARM)) return -EPERM; @@ -790,7 +790,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, int ret = 0; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EOPNOTSUPP; if (flags & ~TIMER_ABSTIME) return -EINVAL; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d911c8470149..ca69290bee2a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -146,6 +146,11 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* + * Timespec representation for VDSO update to avoid 64bit division + * on every update. + */ + tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } /* diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 8cf3596a4ce6..4bc37ac3bb05 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -17,7 +17,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { struct vdso_timestamp *vdso_ts; - u64 nsec; + u64 nsec, sec; vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; @@ -45,23 +45,27 @@ static inline void update_vdso_data(struct vdso_data *vdata, } vdso_ts->nsec = nsec; - /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; - vdso_ts->sec = tk->raw_sec; - vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + /* Copy MONOTONIC time for BOOTTIME */ + sec = vdso_ts->sec; + /* Add the boot offset */ + sec += tk->monotonic_to_boot.tv_sec; + nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; - vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec; - nsec += ((u64)(tk->wall_to_monotonic.tv_nsec + - ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift); + vdso_ts->sec = sec; + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; + /* CLOCK_MONOTONIC_RAW */ + vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts->sec = tk->raw_sec; + vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + /* CLOCK_TAI */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; diff --git a/kernel/torture.c b/kernel/torture.c index a8d9bdfba7c3..7c13f5558b71 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -263,7 +263,6 @@ static void torture_onoff_cleanup(void) onoff_task = NULL; #endif /* #ifdef CONFIG_HOTPLUG_CPU */ } -EXPORT_SYMBOL_GPL(torture_onoff_cleanup); /* * Print online/offline testing statistics. @@ -449,7 +448,6 @@ static void torture_shuffle_cleanup(void) } shuffler_task = NULL; } -EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); /* * Variables for auto-shutdown. 
This allows "lights out" torture runs diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 98da8998c25c..6a64d7772870 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -146,7 +146,7 @@ config FUNCTION_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select GLOB - select TASKS_RCU if PREEMPT + select TASKS_RCU if PREEMPTION help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation @@ -179,7 +179,7 @@ config TRACE_PREEMPT_TOGGLE config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS - select TRACE_PREEMPT_TOGGLE if PREEMPT + select TRACE_PREEMPT_TOGGLE if PREEMPTION select GENERIC_TRACER default n help @@ -214,7 +214,7 @@ config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n depends on !ARCH_USES_GETTIMEOFFSET - depends on PREEMPT + depends on PREEMPTION select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca34503f178..356b848c697a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2814,7 +2814,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * synchornize_rcu_tasks() will wait for those tasks to * execute and either schedule voluntarily or enter user space. */ - if (IS_ENABLED(CONFIG_PREEMPT)) + if (IS_ENABLED(CONFIG_PREEMPTION)) synchronize_rcu_tasks(); free_ops: @@ -3095,6 +3095,14 @@ t_probe_next(struct seq_file *m, loff_t *pos) hnd = &iter->probe_entry->hlist; hash = iter->probe->ops.func_hash->filter_hash; + + /* + * A probe being registered may temporarily have an empty hash + * and it's at the end of the func_probes list. + */ + if (!hash || hash == EMPTY_HASH) + return NULL; + size = 1 << hash->size_bits; retry: @@ -4320,12 +4328,21 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, mutex_unlock(&ftrace_lock); + /* + * Note, there's a small window here that the func_hash->filter_hash + * may be NULL or empty. Need to be carefule when reading the loop. + */ mutex_lock(&probe->ops.func_hash->regex_lock); orig_hash = &probe->ops.func_hash->filter_hash; old_hash = *orig_hash; hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + if (!hash) { + ret = -ENOMEM; + goto out; + } + ret = ftrace_match_records(hash, glob, strlen(glob)); /* Nothing found? */ diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 0515a2096f90..0456e0a3dab1 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -6,22 +6,22 @@ /* * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw_notrace() is that elements removed from this list + * can use rcu_dereference_raw_check() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle + * mechanism. The rcu_dereference_raw_check() calls are needed to handle * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! */ #define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw_notrace(list); \ + op = rcu_dereference_raw_check(list); \ do /* * Optimized for just a single item in the list (as that is the normal case). 
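
Returning to the update_vdso_data() hunk further up: CLOCK_BOOTTIME is now derived from the already-computed CLOCK_MONOTONIC timestamp plus the new monotonic_to_boot offset, after which the shifted nanoseconds are folded back into whole seconds. The standalone loop below reproduces that normalization step with illustrative values; the shift and starting numbers are made up.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	unsigned int shift = 8;		/* hypothetical clocksource shift */
	uint64_t sec = 100;
	uint64_t nsec = ((uint64_t)NSEC_PER_SEC << shift) + (42ULL << shift);

	/* fold shifted nanoseconds exceeding one second into 'sec' */
	while (nsec >= ((uint64_t)NSEC_PER_SEC << shift)) {
		nsec -= (uint64_t)NSEC_PER_SEC << shift;
		sec++;
	}
	printf("sec=%llu nsec(shifted)=%llu\n",
	       (unsigned long long)sec, (unsigned long long)nsec);
	return 0;
}
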
*/ #define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ + while (likely(op = rcu_dereference_raw_check((op)->next)) && \ unlikely((op) != &ftrace_list_end)) extern struct ftrace_ops __rcu *ftrace_ops_list; diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0564f6db0561..09b0b49f346e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -267,7 +267,7 @@ static void ring_buffer_producer(void) if (consumer && !(cnt % wakeup_interval)) wake_up_process(consumer); -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION /* * If we are a non preempt kernel, the 10 second run will * stop everything while it runs. Instead, we will call diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 525a97fbbc60..947ba433865f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1567,9 +1567,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, /** * update_max_tr_single - only copy one trace over, and reset the rest - * @tr - tracer - * @tsk - task with the latency - * @cpu - the cpu of the buffer to copy. + * @tr: tracer + * @tsk: task with the latency + * @cpu: the cpu of the buffer to copy. * * Flip the trace of a single CPU buffer between the @tr and the max_tr. */ @@ -1767,7 +1767,7 @@ static void __init apply_trace_boot_options(void); /** * register_tracer - register a tracer with the ftrace system. - * @type - the plugin for the tracer + * @type: the plugin for the tracer * * Register a new plugin tracer. */ @@ -2230,9 +2230,9 @@ static bool tracing_record_taskinfo_skip(int flags) /** * tracing_record_taskinfo - record the task info of a task * - * @task - task to record - * @flags - TRACE_RECORD_CMDLINE for recording comm - * - TRACE_RECORD_TGID for recording tgid + * @task: task to record + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo(struct task_struct *task, int flags) { @@ -2258,10 +2258,10 @@ void tracing_record_taskinfo(struct task_struct *task, int flags) /** * tracing_record_taskinfo_sched_switch - record task info for sched_switch * - * @prev - previous task during sched_switch - * @next - next task during sched_switch - * @flags - TRACE_RECORD_CMDLINE for recording comm - * TRACE_RECORD_TGID for recording tgid + * @prev: previous task during sched_switch + * @next: next task during sched_switch + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) @@ -2642,10 +2642,10 @@ static void ftrace_exports(struct ring_buffer_event *event) preempt_disable_notrace(); - export = rcu_dereference_raw_notrace(ftrace_exports_list); + export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event); - export = rcu_dereference_raw_notrace(export->next); + export = rcu_dereference_raw_check(export->next); } preempt_enable_notrace(); @@ -3072,7 +3072,9 @@ static void trace_printk_start_stop_comm(int enabled) /** * trace_vbprintk - write binary msg to tracing buffer - * + * @ip: The address of the caller + * @fmt: The string format to write to the buffer + * @args: Arguments for @fmt */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c7506bc81b75..b89cdfe20bc1 100644 --- 
a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -255,12 +255,12 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); /* - * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables + * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables * preemption (adding one to the preempt_count). Since we are * interested in the preempt_count at the time the tracepoint was * hit, we need to subtract one to offset the increment. */ - if (IS_ENABLED(CONFIG_PREEMPT)) + if (IS_ENABLED(CONFIG_PREEMPTION)) fbuffer->pc--; fbuffer->trace_file = trace_file; @@ -787,7 +787,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, return ret; } -static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) +int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; int ret; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 69ebf3c2f1b5..78af97163147 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -137,6 +137,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ if (ftrace_graph_notrace_addr(trace->func)) { trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT); /* @@ -156,16 +163,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* - * Do not trace a function if it's filtered by set_graph_notrace. - * Make the index of ret stack negative to indicate that it should - * ignore further functions. But it needs its own ret stack entry - * to recover the original index in order to continue tracing after - * returning from the function. - */ - if (ftrace_graph_notrace_addr(trace->func)) - return 1; - - /* * Stop here if tracing_threshold is set. We only write function return * events to the ring buffer. */ diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dbef0d135075..fb6bfbc5bf86 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -895,7 +895,8 @@ void trace_probe_cleanup(struct trace_probe *tp) for (i = 0; i < tp->nr_args; i++) traceprobe_free_probe_arg(&tp->args[i]); - kfree(call->class->system); + if (call->class) + kfree(call->class->system); kfree(call->name); kfree(call->print_fmt); } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 743b2b520d34..5e43b9664eca 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -579,8 +579,7 @@ probe_wakeup(void *ignore, struct task_struct *p) else tracing_dl = 0; - wakeup_task = p; - get_task_struct(wakeup_task); + wakeup_task = get_task_struct(p); local_save_flags(flags); |