summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/bpf/core.c8
-rw-r--r--kernel/bpf/syscall.c30
-rw-r--r--kernel/bpf/verifier.c36
-rw-r--r--kernel/cgroup/cgroup.c12
-rw-r--r--kernel/cgroup/cpuset.c163
-rw-r--r--kernel/configs.c16
-rw-r--r--kernel/cpu.c24
-rw-r--r--kernel/cred.c21
-rw-r--r--kernel/dma/contiguous.c16
-rw-r--r--kernel/dma/direct.c22
-rw-r--r--kernel/dma/mapping.c32
-rw-r--r--kernel/dma/remap.c2
-rw-r--r--kernel/dma/swiotlb.c34
-rw-r--r--kernel/events/core.c104
-rw-r--r--kernel/events/hw_breakpoint.c4
-rw-r--r--kernel/exit.c43
-rw-r--r--kernel/fork.c22
-rw-r--r--kernel/irq/affinity.c237
-rw-r--r--kernel/irq/irqdesc.c15
-rw-r--r--kernel/irq/irqdomain.c10
-rw-r--r--kernel/irq/manage.c5
-rw-r--r--kernel/irq/proc.c14
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/jump_label.c4
-rw-r--r--kernel/kallsyms.c6
-rw-r--r--kernel/kexec_elf.c430
-rw-r--r--kernel/kprobes.c13
-rw-r--r--kernel/locking/lockdep.c174
-rw-r--r--kernel/locking/lockdep_internals.h9
-rw-r--r--kernel/locking/lockdep_proc.c11
-rw-r--r--kernel/locking/mutex.c37
-rw-r--r--kernel/locking/rtmutex.c6
-rw-r--r--kernel/locking/rwsem.c82
-rw-r--r--kernel/memremap.c405
-rw-r--r--kernel/module.c4
-rw-r--r--kernel/rcu/Kconfig8
-rw-r--r--kernel/rcu/Kconfig.debug11
-rw-r--r--kernel/rcu/rcu.h1
-rw-r--r--kernel/rcu/rcu_segcblist.c174
-rw-r--r--kernel/rcu/rcu_segcblist.h54
-rw-r--r--kernel/rcu/rcuperf.c10
-rw-r--r--kernel/rcu/rcutorture.c30
-rw-r--r--kernel/rcu/srcutree.c5
-rw-r--r--kernel/rcu/tree.c217
-rw-r--r--kernel/rcu/tree.h81
-rw-r--r--kernel/rcu/tree_exp.h8
-rw-r--r--kernel/rcu/tree_plugin.h1195
-rw-r--r--kernel/rcu/tree_stall.h15
-rw-r--r--kernel/rcu/update.c105
-rw-r--r--kernel/sched/core.c701
-rw-r--r--kernel/sched/cpufreq_schedutil.c20
-rw-r--r--kernel/sched/deadline.c142
-rw-r--r--kernel/sched/fair.c556
-rw-r--r--kernel/sched/idle.c36
-rw-r--r--kernel/sched/isolation.c12
-rw-r--r--kernel/sched/psi.c14
-rw-r--r--kernel/sched/rt.c74
-rw-r--r--kernel/sched/sched.h63
-rw-r--r--kernel/sched/stats.h7
-rw-r--r--kernel/sched/stop_task.c22
-rw-r--r--kernel/sched/topology.c53
-rw-r--r--kernel/signal.c15
-rw-r--r--kernel/stacktrace.c4
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c32
-rw-r--r--kernel/time/alarmtimer.c4
-rw-r--r--kernel/time/timekeeping.c5
-rw-r--r--kernel/time/vsyscall.c22
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/ftrace.c19
-rw-r--r--kernel/trace/ftrace_internal.h8
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c30
-rw-r--r--kernel/trace/trace_events.c6
-rw-r--r--kernel/trace/trace_functions_graph.c17
-rw-r--r--kernel/trace/trace_probe.c3
-rw-r--r--kernel/trace/trace_sched_wakeup.c3
79 files changed, 3824 insertions, 2025 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index a8d923b5481b..48c5376d290a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -64,6 +64,7 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
+obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup/
@@ -111,7 +112,6 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
obj-$(CONFIG_HAS_IOMEM) += iomem.o
-obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_RSEQ) += rseq.o
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8191a7db2777..66088a9e9b9e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -890,7 +890,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
static int bpf_jit_blind_insn(const struct bpf_insn *from,
const struct bpf_insn *aux,
- struct bpf_insn *to_buff)
+ struct bpf_insn *to_buff,
+ bool emit_zext)
{
struct bpf_insn *to = to_buff;
u32 imm_rnd = get_random_int();
@@ -1005,6 +1006,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ if (emit_zext)
+ *to++ = BPF_ZEXT_REG(BPF_REG_AX);
*to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
break;
@@ -1088,7 +1091,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
insn[1].code == 0)
memcpy(aux, insn, sizeof(aux));
- rewritten = bpf_jit_blind_insn(insn, aux, insn_buff);
+ rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
+ clone->aux->verifier_zext);
if (!rewritten)
continue;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5d141f16f6fa..272071e9112f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1707,20 +1707,26 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (err)
goto free_used_maps;
- err = bpf_prog_new_fd(prog);
- if (err < 0) {
- /* failed to allocate fd.
- * bpf_prog_put() is needed because the above
- * bpf_prog_alloc_id() has published the prog
- * to the userspace and the userspace may
- * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
- */
- bpf_prog_put(prog);
- return err;
- }
-
+ /* Upon success of bpf_prog_alloc_id(), the BPF prog is
+ * effectively publicly exposed. However, retrieving via
+ * bpf_prog_get_fd_by_id() will take another reference,
+ * therefore it cannot be gone underneath us.
+ *
+ * Only for the time /after/ successful bpf_prog_new_fd()
+ * and before returning to userspace, we might just hold
+ * one reference and any parallel close on that fd could
+ * rip everything out. Hence, below notifications must
+ * happen before bpf_prog_new_fd().
+ *
+ * Also, any failure handling from this point onwards must
+ * be using bpf_prog_put() given the program is exposed.
+ */
bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
+
+ err = bpf_prog_new_fd(prog);
+ if (err < 0)
+ bpf_prog_put(prog);
return err;
free_used_maps:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5900cbb966b1..c36a719fee6d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -985,9 +985,6 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
reg->smax_value = S64_MAX;
reg->umin_value = 0;
reg->umax_value = U64_MAX;
-
- /* constant backtracking is enabled for root only for now */
- reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
}
/* Mark a register as having a completely unknown (scalar) value. */
@@ -1014,7 +1011,11 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
__mark_reg_not_init(regs + regno);
return;
}
- __mark_reg_unknown(regs + regno);
+ regs += regno;
+ __mark_reg_unknown(regs);
+ /* constant backtracking is enabled for root without bpf2bpf calls */
+ regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ?
+ true : false;
}
static void __mark_reg_not_init(struct bpf_reg_state *reg)
@@ -1771,16 +1772,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
bitmap_from_u64(mask, stack_mask);
for_each_set_bit(i, mask, 64) {
if (i >= func->allocated_stack / BPF_REG_SIZE) {
- /* This can happen if backtracking
- * is propagating stack precision where
- * caller has larger stack frame
- * than callee, but backtrack_insn() should
- * have returned -ENOTSUPP.
+ /* the sequence of instructions:
+ * 2: (bf) r3 = r10
+ * 3: (7b) *(u64 *)(r3 -8) = r0
+ * 4: (79) r4 = *(u64 *)(r10 -8)
+ * doesn't contain jmps. It's backtracked
+ * as a single block.
+ * During backtracking insn 3 is not recognized as
+ * stack access, so at the end of backtracking
+ * stack slot fp-8 is still marked in stack_mask.
+ * However the parent state may not have accessed
+ * fp-8 and it's "unallocated" stack space.
+ * In such case fallback to conservative.
*/
- verbose(env, "BUG spi %d stack_size %d\n",
- i, func->allocated_stack);
- WARN_ONCE(1, "verifier backtracking bug");
- return -EFAULT;
+ mark_all_scalars_precise(env, st);
+ return 0;
}
if (func->stack[i].slot_type[0] != STACK_SPILL) {
@@ -8616,8 +8622,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
if (is_narrower_load && size < target_size) {
- u8 shift = (off & (size_default - 1)) * 8;
-
+ u8 shift = bpf_ctx_narrow_load_shift(off, size,
+ size_default);
if (ctx_field_size <= 4) {
if (shift)
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 753afbca549f..a7ce73a2c401 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1891,7 +1891,7 @@ static int cgroup_reconfigure(struct fs_context *fc)
*/
static bool use_task_css_set_links __read_mostly;
-static void cgroup_enable_task_cg_lists(void)
+void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
@@ -5255,8 +5255,16 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
* if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
- if (cgrp->freezer.e_freeze)
+ if (cgrp->freezer.e_freeze) {
+ /*
+ * Set the CGRP_FREEZE flag, so when a process will be
+ * attached to the child cgroup, it will become frozen.
+ * At this point the new cgroup is unpopulated, so we can
+ * consider it frozen immediately.
+ */
+ set_bit(CGRP_FREEZE, &cgrp->flags);
set_bit(CGRP_FROZEN, &cgrp->flags);
+ }
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 5aa37531ce76..c52bc91f882b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -45,6 +45,7 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
@@ -332,7 +333,18 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
-static DEFINE_MUTEX(cpuset_mutex);
+DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
+
+void cpuset_read_lock(void)
+{
+ percpu_down_read(&cpuset_rwsem);
+}
+
+void cpuset_read_unlock(void)
+{
+ percpu_up_read(&cpuset_rwsem);
+}
+
static DEFINE_SPINLOCK(callback_lock);
static struct workqueue_struct *cpuset_migrate_mm_wq;
@@ -894,6 +906,67 @@ done:
return ndoms;
}
+static void update_tasks_root_domain(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, 0, &it);
+
+ while ((task = css_task_iter_next(&it)))
+ dl_add_task_root_domain(task);
+
+ css_task_iter_end(&it);
+}
+
+static void rebuild_root_domains(void)
+{
+ struct cpuset *cs = NULL;
+ struct cgroup_subsys_state *pos_css;
+
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&sched_domains_mutex);
+
+ cgroup_enable_task_cg_lists();
+
+ rcu_read_lock();
+
+ /*
+ * Clear default root domain DL accounting, it will be computed again
+ * if a task belongs to it.
+ */
+ dl_clear_root_domain(&def_root_domain);
+
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+
+ if (cpumask_empty(cs->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ css_get(&cs->css);
+
+ rcu_read_unlock();
+
+ update_tasks_root_domain(cs);
+
+ rcu_read_lock();
+ css_put(&cs->css);
+ }
+ rcu_read_unlock();
+}
+
+static void
+partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ mutex_lock(&sched_domains_mutex);
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
+ rebuild_root_domains();
+ mutex_unlock(&sched_domains_mutex);
+}
+
/*
* Rebuild scheduler domains.
*
@@ -911,8 +984,8 @@ static void rebuild_sched_domains_locked(void)
cpumask_var_t *doms;
int ndoms;
- lockdep_assert_held(&cpuset_mutex);
- get_online_cpus();
+ lockdep_assert_cpus_held();
+ percpu_rwsem_assert_held(&cpuset_rwsem);
/*
* We have raced with CPU hotplug. Don't do anything to avoid
@@ -921,19 +994,17 @@ static void rebuild_sched_domains_locked(void)
*/
if (!top_cpuset.nr_subparts_cpus &&
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
+ return;
if (top_cpuset.nr_subparts_cpus &&
!cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
+ return;
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
-out:
- put_online_cpus();
+ partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
@@ -943,9 +1014,11 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
rebuild_sched_domains_locked();
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
}
/**
@@ -1051,7 +1124,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
bool part_error = false; /* Partition error? */
- lockdep_assert_held(&cpuset_mutex);
+ percpu_rwsem_assert_held(&cpuset_rwsem);
/*
* The parent must be a partition root.
@@ -2039,7 +2112,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
cs = css_cs(css);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
@@ -2063,7 +2136,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
cs->attach_in_progress++;
ret = 0;
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
return ret;
}
@@ -2073,9 +2146,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
css_cs(css)->attach_in_progress--;
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/*
@@ -2098,7 +2171,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/* prepare for attach */
if (cs == &top_cpuset)
@@ -2152,7 +2225,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/* The various types of files and directories in a cpuset file system */
@@ -2183,7 +2256,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
@@ -2219,7 +2293,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
return retval;
}
@@ -2230,7 +2305,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2243,7 +2319,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
return retval;
}
@@ -2282,7 +2359,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2306,7 +2384,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_cpuset(trialcs);
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -2437,13 +2516,15 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
return -EINVAL;
css_get(&cs->css);
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
retval = update_prstate(cs, val);
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
css_put(&cs->css);
return retval ?: nbytes;
}
@@ -2649,7 +2730,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
set_bit(CS_ONLINE, &cs->flags);
if (is_spread_page(parent))
@@ -2700,7 +2782,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
return 0;
}
@@ -2719,7 +2802,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (is_partition_root(cs))
update_prstate(cs, 0);
@@ -2738,7 +2822,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2750,7 +2835,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
@@ -2763,7 +2848,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
}
spin_unlock_irq(&callback_lock);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/*
@@ -2805,6 +2890,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
int __init cpuset_init(void)
{
+ BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
+
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
@@ -2876,7 +2963,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
/*
* Move tasks to the nearest ancestor with execution resources,
@@ -2886,7 +2973,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
if (is_empty)
remove_tasks_in_empty_cpuset(cs);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
}
static void
@@ -2936,14 +3023,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/*
* We have raced with task attaching. We wait until attaching
* is finished, so we won't attach a task to an empty cpuset.
*/
if (cs->attach_in_progress) {
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
goto retry;
}
@@ -3011,7 +3098,7 @@ update_tasks:
hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/**
@@ -3041,7 +3128,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
ptmp = &tmp;
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/* fetch the available cpus/mems and find out which changed how */
cpumask_copy(&new_cpus, cpu_active_mask);
@@ -3091,7 +3178,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
update_tasks_nodemask(&top_cpuset);
}
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
/* if cpus or mems changed, we need to propagate to descendants */
if (cpus_updated || mems_updated) {
diff --git a/kernel/configs.c b/kernel/configs.c
index b062425ccf8d..c09ea4c995e1 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* kernel/configs.c
* Echo the kernel .config file used to build the kernel
@@ -6,21 +7,6 @@
* Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
* Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
* Copyright (C) 2002 Hewlett-Packard Company
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/kernel.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 05778e32674a..e1967e9eddc2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2298,6 +2298,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
struct cpumask __cpu_active_mask __read_mostly;
EXPORT_SYMBOL(__cpu_active_mask);
+atomic_t __num_online_cpus __read_mostly;
+EXPORT_SYMBOL(__num_online_cpus);
+
void init_cpu_present(const struct cpumask *src)
{
cpumask_copy(&__cpu_present_mask, src);
@@ -2313,6 +2316,27 @@ void init_cpu_online(const struct cpumask *src)
cpumask_copy(&__cpu_online_mask, src);
}
+void set_cpu_online(unsigned int cpu, bool online)
+{
+ /*
+ * atomic_inc/dec() is required to handle the horrid abuse of this
+ * function by the reboot and kexec code which invoke it from
+ * IPI/NMI broadcasts when shutting down CPUs. Invocation from
+ * regular CPU hotplug is properly serialized.
+ *
+ * Note, that the fact that __num_online_cpus is of type atomic_t
+ * does not protect readers which are not serialized against
+ * concurrent hotplug operations.
+ */
+ if (online) {
+ if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
+ atomic_inc(&__num_online_cpus);
+ } else {
+ if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
+ atomic_dec(&__num_online_cpus);
+ }
+}
+
/*
* Activate the first processor.
*/
diff --git a/kernel/cred.c b/kernel/cred.c
index f9a0ce66c9c3..c0a4c12d38b2 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -144,7 +144,10 @@ void __put_cred(struct cred *cred)
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);
- call_rcu(&cred->rcu, put_cred_rcu);
+ if (cred->non_rcu)
+ put_cred_rcu(&cred->rcu);
+ else
+ call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);
@@ -261,6 +264,7 @@ struct cred *prepare_creds(void)
old = task->cred;
memcpy(new, old, sizeof(struct cred));
+ new->non_rcu = 0;
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_group_info(new->group_info);
@@ -544,7 +548,19 @@ const struct cred *override_creds(const struct cred *new)
validate_creds(old);
validate_creds(new);
- get_cred(new);
+
+ /*
+ * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
+ *
+ * That means that we do not clear the 'non_rcu' flag, since
+ * we are only installing the cred into the thread-synchronous
+ * '->cred' pointer, not the '->real_cred' pointer that is
+ * visible to other threads under RCU.
+ *
+ * Also note that we did validate_creds() manually, not depending
+ * on the validation in 'get_cred()'.
+ */
+ get_new_cred((struct cred *)new);
alter_cred_subscribers(new, 1);
rcu_assign_pointer(current->cred, new);
alter_cred_subscribers(old, -1);
@@ -681,6 +697,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
validate_creds(old);
*new = *old;
+ new->non_rcu = 0;
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_uid(new->user);
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index bfc0c17f2a3d..69cfb4345388 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -230,9 +230,7 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
*/
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
- int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
- size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
- size_t align = get_order(PAGE_ALIGN(size));
+ size_t count = size >> PAGE_SHIFT;
struct page *page = NULL;
struct cma *cma = NULL;
@@ -243,13 +241,12 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
/* CMA can be used only in the context which permits sleeping */
if (cma && gfpflags_allow_blocking(gfp)) {
- align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
- page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN);
+ size_t align = get_order(size);
+ size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
+
+ page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
}
- /* Fallback allocation of normal pages */
- if (!page)
- page = alloc_pages_node(node, gfp, align);
return page;
}
@@ -266,7 +263,8 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
*/
void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
{
- if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT))
+ if (!cma_release(dev_get_cma_area(dev), page,
+ PAGE_ALIGN(size) >> PAGE_SHIFT))
__free_pages(page, get_order(size));
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 59bdceea3737..8402b29c280f 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -47,9 +47,6 @@ u64 dma_direct_get_required_mask(struct device *dev)
{
u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
- if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma)
- max_dma = dev->bus_dma_mask;
-
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}
@@ -88,6 +85,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
+ size_t alloc_size = PAGE_ALIGN(size);
+ int node = dev_to_node(dev);
struct page *page = NULL;
u64 phys_mask;
@@ -98,8 +97,14 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp &= ~__GFP_ZERO;
gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_mask);
+ page = dma_alloc_contiguous(dev, alloc_size, gfp);
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ dma_free_contiguous(dev, page, alloc_size);
+ page = NULL;
+ }
again:
- page = dma_alloc_contiguous(dev, size, gfp);
+ if (!page)
+ page = alloc_pages_node(node, gfp, get_order(alloc_size));
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, size);
page = NULL;
@@ -130,10 +135,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
if (!page)
return NULL;
- if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+ if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
+ !force_dma_unencrypted(dev)) {
/* remove any dirty cache lines on the kernel alias */
if (!PageHighMem(page))
arch_dma_prep_coherent(page, size);
+ *dma_handle = phys_to_dma(dev, page_to_phys(page));
/* return the page pointer as the opaque cookie */
return page;
}
@@ -178,7 +185,8 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
{
unsigned int page_order = get_order(size);
- if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+ if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
+ !force_dma_unencrypted(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
__dma_direct_free_pages(dev, size, cpu_addr);
return;
@@ -297,7 +305,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
if (unlikely(is_swiotlb_buffer(phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
+ swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
}
EXPORT_SYMBOL(dma_direct_unmap_page);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 1f628e7ac709..b0038ca3aa92 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -116,11 +116,16 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
int ret;
if (!dev_is_dma_coherent(dev)) {
+ unsigned long pfn;
+
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
return -ENXIO;
- page = pfn_to_page(arch_dma_coherent_to_pfn(dev, cpu_addr,
- dma_addr));
+ /* If the PFN is not valid, we do not have a struct page */
+ pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);
+ if (!pfn_valid(pfn))
+ return -ENXIO;
+ page = pfn_to_page(pfn);
} else {
page = virt_to_page(cpu_addr);
}
@@ -145,6 +150,23 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
}
EXPORT_SYMBOL(dma_get_sgtable_attrs);
+#ifdef CONFIG_MMU
+/*
+ * Return the page attributes used for mapping dma_alloc_* memory, either in
+ * kernel space if remapping is needed, or to userspace through dma_mmap_*.
+ */
+pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
+{
+ if (dev_is_dma_coherent(dev) ||
+ (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) &&
+ (attrs & DMA_ATTR_NON_CONSISTENT)))
+ return prot;
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT))
+ return arch_dma_mmap_pgprot(dev, prot, attrs);
+ return pgprot_noncached(prot);
+}
+#endif /* CONFIG_MMU */
+
/*
* Create userspace mapping for the DMA-coherent memory.
*/
@@ -159,7 +181,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
unsigned long pfn;
int ret = -ENXIO;
- vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs);
+ vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
@@ -170,7 +192,11 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
if (!dev_is_dma_coherent(dev)) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
return -ENXIO;
+
+ /* If the PFN is not valid, we do not have a struct page */
pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);
+ if (!pfn_valid(pfn))
+ return -ENXIO;
} else {
pfn = page_to_pfn(virt_to_page(cpu_addr));
}
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index a594aec07882..ffe78f0b2fe4 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -218,7 +218,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* create a coherent mapping */
ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
- arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
+ dma_pgprot(dev, PAGE_KERNEL, attrs),
__builtin_return_address(0));
if (!ret) {
__dma_direct_free_pages(dev, size, page);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9de232229063..796a44f8ef5a 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -444,7 +444,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
dma_addr_t tbl_dma_addr,
- phys_addr_t orig_addr, size_t size,
+ phys_addr_t orig_addr,
+ size_t mapping_size,
+ size_t alloc_size,
enum dma_data_direction dir,
unsigned long attrs)
{
@@ -464,6 +466,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
pr_warn_once("%s is active and system is using DMA bounce buffers\n",
sme_active() ? "SME" : "SEV");
+ if (mapping_size > alloc_size) {
+ dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
+ mapping_size, alloc_size);
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
+
mask = dma_get_seg_boundary(hwdev);
tbl_dma_addr &= mask;
@@ -471,8 +479,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
/*
- * Carefully handle integer overflow which can occur when mask == ~0UL.
- */
+ * Carefully handle integer overflow which can occur when mask == ~0UL.
+ */
max_slots = mask + 1
? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
: 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
@@ -481,8 +489,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
* For mappings greater than or equal to a page, we limit the stride
* (and hence alignment) to a page size.
*/
- nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- if (size >= PAGE_SIZE)
+ nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ if (alloc_size >= PAGE_SIZE)
stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
else
stride = 1;
@@ -547,7 +555,7 @@ not_found:
spin_unlock_irqrestore(&io_tlb_lock, flags);
if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- size, io_tlb_nslabs, tmp_io_tlb_used);
+ alloc_size, io_tlb_nslabs, tmp_io_tlb_used);
return (phys_addr_t)DMA_MAPPING_ERROR;
found:
io_tlb_used += nslots;
@@ -562,7 +570,7 @@ found:
io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE);
+ swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
}
@@ -571,11 +579,11 @@ found:
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+ size_t mapping_size, size_t alloc_size,
+ enum dma_data_direction dir, unsigned long attrs)
{
unsigned long flags;
- int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = io_tlb_orig_addr[index];
@@ -585,7 +593,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
if (orig_addr != INVALID_PHYS_ADDR &&
!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
- swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE);
+ swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE);
/*
* Return the buffer to the free list by setting the corresponding
@@ -665,14 +673,14 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
/* Oh well, have to allocate and map a bounce buffer. */
*phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
- *phys, size, dir, attrs);
+ *phys, size, size, dir, attrs);
if (*phys == (phys_addr_t)DMA_MAPPING_ERROR)
return false;
/* Ensure that the address returned is DMA'ble */
*dma_addr = __phys_to_dma(dev, *phys);
if (unlikely(!dma_capable(dev, *dma_addr, size))) {
- swiotlb_tbl_unmap_single(dev, *phys, size, dir,
+ swiotlb_tbl_unmap_single(dev, *phys, size, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
return false;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 026a14541a38..1c414b8866b4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1887,6 +1887,89 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->generation++;
}
+static int
+perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
+{
+ if (!has_aux(aux_event))
+ return 0;
+
+ if (!event->pmu->aux_output_match)
+ return 0;
+
+ return event->pmu->aux_output_match(aux_event);
+}
+
+static void put_event(struct perf_event *event);
+static void event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx);
+
+static void perf_put_aux_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event *iter;
+
+ /*
+ * If event uses aux_event tear down the link
+ */
+ if (event->aux_event) {
+ iter = event->aux_event;
+ event->aux_event = NULL;
+ put_event(iter);
+ return;
+ }
+
+ /*
+ * If the event is an aux_event, tear down all links to
+ * it from other events.
+ */
+ for_each_sibling_event(iter, event->group_leader) {
+ if (iter->aux_event != event)
+ continue;
+
+ iter->aux_event = NULL;
+ put_event(event);
+
+ /*
+ * If it's ACTIVE, schedule it out and put it into ERROR
+ * state so that we don't try to schedule it again. Note
+ * that perf_event_enable() will clear the ERROR status.
+ */
+ event_sched_out(iter, cpuctx, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ }
+}
+
+static int perf_get_aux_event(struct perf_event *event,
+ struct perf_event *group_leader)
+{
+ /*
+ * Our group leader must be an aux event if we want to be
+ * an aux_output. This way, the aux event will precede its
+ * aux_output events in the group, and therefore will always
+ * schedule first.
+ */
+ if (!group_leader)
+ return 0;
+
+ if (!perf_aux_output_match(event, group_leader))
+ return 0;
+
+ if (!atomic_long_inc_not_zero(&group_leader->refcount))
+ return 0;
+
+ /*
+ * Link aux_outputs to their aux event; this is undone in
+ * perf_group_detach() by perf_put_aux_event(). When the
+ * group in torn down, the aux_output events loose their
+ * link to the aux_event and can't schedule any more.
+ */
+ event->aux_event = group_leader;
+
+ return 1;
+}
+
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
@@ -1902,6 +1985,8 @@ static void perf_group_detach(struct perf_event *event)
event->attach_state &= ~PERF_ATTACH_GROUP;
+ perf_put_aux_event(event);
+
/*
* If this is a sibling, remove it from its group.
*/
@@ -4089,10 +4174,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
return NULL;
__perf_event_init_context(ctx);
- if (task) {
- ctx->task = task;
- get_task_struct(task);
- }
+ if (task)
+ ctx->task = get_task_struct(task);
ctx->pmu = pmu;
return ctx;
@@ -10355,8 +10438,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* and we cannot use the ctx information because we need the
* pmu before we get a ctx.
*/
- get_task_struct(task);
- event->hw.target = task;
+ event->hw.target = get_task_struct(task);
}
event->clock = &local_clock;
@@ -10426,6 +10508,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_ns;
}
+ if (event->attr.aux_output &&
+ !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
+ err = -EOPNOTSUPP;
+ goto err_pmu;
+ }
+
err = exclusive_event_init(event);
if (err)
goto err_pmu;
@@ -11082,6 +11170,8 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
+ goto err_locked;
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
@@ -11274,7 +11364,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
- perf_install_in_context(ctx, event, cpu);
+ perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c5cd852fe86b..3cc8416ec844 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -413,7 +413,7 @@ static int hw_breakpoint_parse(struct perf_event *bp,
int register_perf_hw_breakpoint(struct perf_event *bp)
{
- struct arch_hw_breakpoint hw;
+ struct arch_hw_breakpoint hw = { };
int err;
err = reserve_bp_slot(bp);
@@ -461,7 +461,7 @@ int
modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
bool check)
{
- struct arch_hw_breakpoint hw;
+ struct arch_hw_breakpoint hw = { };
int err;
err = hw_breakpoint_parse(bp, attr, &hw);
diff --git a/kernel/exit.c b/kernel/exit.c
index 4436158a6d30..22ab6a4bdc51 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -734,9 +734,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
autoreap = true;
}
- tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
- if (tsk->exit_state == EXIT_DEAD)
+ if (autoreap) {
+ tsk->exit_state = EXIT_DEAD;
list_add(&tsk->ptrace_entry, &dead);
+ }
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
@@ -1553,6 +1554,23 @@ end:
return retval;
}
+static struct pid *pidfd_get_pid(unsigned int fd)
+{
+ struct fd f;
+ struct pid *pid;
+
+ f = fdget(fd);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ pid = pidfd_pid(f.file);
+ if (!IS_ERR(pid))
+ get_pid(pid);
+
+ fdput(f);
+ return pid;
+}
+
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
int options, struct rusage *ru)
{
@@ -1575,19 +1593,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
type = PIDTYPE_PID;
if (upid <= 0)
return -EINVAL;
+
+ pid = find_get_pid(upid);
break;
case P_PGID:
type = PIDTYPE_PGID;
- if (upid <= 0)
+ if (upid < 0)
return -EINVAL;
+
+ if (upid)
+ pid = find_get_pid(upid);
+ else
+ pid = get_task_pid(current, PIDTYPE_PGID);
+ break;
+ case P_PIDFD:
+ type = PIDTYPE_PID;
+ if (upid < 0)
+ return -EINVAL;
+
+ pid = pidfd_get_pid(upid);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
break;
default:
return -EINVAL;
}
- if (type < PIDTYPE_MAX)
- pid = find_get_pid(upid);
-
wo.wo_type = type;
wo.wo_pid = pid;
wo.wo_flags = options;
diff --git a/kernel/fork.c b/kernel/fork.c
index d8ae0f1b4148..1d1cd06edbc1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -726,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(tsk == current);
cgroup_free(tsk);
- task_numa_free(tsk);
+ task_numa_free(tsk, true);
security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
@@ -768,6 +768,7 @@ static void set_max_threads(unsigned int max_threads_suggested)
int arch_task_struct_size __read_mostly;
#endif
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
@@ -782,6 +783,7 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
else
*offset += offsetof(struct task_struct, thread);
}
+#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
@@ -1690,6 +1692,14 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_RCU */
}
+struct pid *pidfd_pid(const struct file *file)
+{
+ if (file->f_op == &pidfd_fops)
+ return file->private_data;
+
+ return ERR_PTR(-EBADF);
+}
+
static int pidfd_release(struct inode *inode, struct file *file)
{
struct pid *pid = file->private_data;
@@ -2338,6 +2348,8 @@ struct mm_struct *copy_init_mm(void)
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
+ *
+ * args->exit_signal is expected to be checked for sanity by the caller.
*/
long _do_fork(struct kernel_clone_args *args)
{
@@ -2562,6 +2574,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
if (copy_from_user(&args, uargs, size))
return -EFAULT;
+ /*
+ * Verify that higher 32bits of exit_signal are unset and that
+ * it is a valid signal
+ */
+ if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
+ !valid_signal(args.exit_signal)))
+ return -EINVAL;
+
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4352b08ae48d..4d89ad4fae3b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/sort.h>
static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
unsigned int cpus_per_vec)
@@ -94,6 +95,155 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
return nodes;
}
+struct node_vectors {
+ unsigned id;
+
+ union {
+ unsigned nvectors;
+ unsigned ncpus;
+ };
+};
+
+static int ncpus_cmp_func(const void *l, const void *r)
+{
+ const struct node_vectors *ln = l;
+ const struct node_vectors *rn = r;
+
+ return ln->ncpus - rn->ncpus;
+}
+
+/*
+ * Allocate vector number for each node, so that for each node:
+ *
+ * 1) the allocated number is >= 1
+ *
+ * 2) the allocated numbver is <= active CPU number of this node
+ *
+ * The actual allocated total vectors may be less than @numvecs when
+ * active total CPU number is less than @numvecs.
+ *
+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+ * for each node.
+ */
+static void alloc_nodes_vectors(unsigned int numvecs,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ const nodemask_t nodemsk,
+ struct cpumask *nmsk,
+ struct node_vectors *node_vectors)
+{
+ unsigned n, remaining_ncpus = 0;
+
+ for (n = 0; n < nr_node_ids; n++) {
+ node_vectors[n].id = n;
+ node_vectors[n].ncpus = UINT_MAX;
+ }
+
+ for_each_node_mask(n, nodemsk) {
+ unsigned ncpus;
+
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+ ncpus = cpumask_weight(nmsk);
+
+ if (!ncpus)
+ continue;
+ remaining_ncpus += ncpus;
+ node_vectors[n].ncpus = ncpus;
+ }
+
+ numvecs = min_t(unsigned, remaining_ncpus, numvecs);
+
+ sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
+ ncpus_cmp_func, NULL);
+
+ /*
+ * Allocate vectors for each node according to the ratio of this
+ * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
+ * bigger than number of active numa nodes. Always start the
+ * allocation from the node with minimized nr_cpus.
+ *
+ * This way guarantees that each active node gets allocated at
+ * least one vector, and the theory is simple: over-allocation
+ * is only done when this node is assigned by one vector, so
+ * other nodes will be allocated >= 1 vector, since 'numvecs' is
+ * bigger than number of numa nodes.
+ *
+ * One perfect invariant is that number of allocated vectors for
+ * each node is <= CPU count of this node:
+ *
+ * 1) suppose there are two nodes: A and B
+ * ncpu(X) is CPU count of node X
+ * vecs(X) is the vector count allocated to node X via this
+ * algorithm
+ *
+ * ncpu(A) <= ncpu(B)
+ * ncpu(A) + ncpu(B) = N
+ * vecs(A) + vecs(B) = V
+ *
+ * vecs(A) = max(1, round_down(V * ncpu(A) / N))
+ * vecs(B) = V - vecs(A)
+ *
+ * both N and V are integer, and 2 <= V <= N, suppose
+ * V = N - delta, and 0 <= delta <= N - 2
+ *
+ * 2) obviously vecs(A) <= ncpu(A) because:
+ *
+ * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
+ * ncpu(A) >= 1
+ *
+ * otherwise,
+ * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
+ *
+ * 3) prove how vecs(B) <= ncpu(B):
+ *
+ * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
+ * over-allocated, so vecs(B) <= ncpu(B),
+ *
+ * otherwise:
+ *
+ * vecs(A) =
+ * round_down(V * ncpu(A) / N) =
+ * round_down((N - delta) * ncpu(A) / N) =
+ * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
+ * round_down((N * ncpu(A) - delta * N) / N) =
+ * cpu(A) - delta
+ *
+ * then:
+ *
+ * vecs(A) - V >= ncpu(A) - delta - V
+ * =>
+ * V - vecs(A) <= V + delta - ncpu(A)
+ * =>
+ * vecs(B) <= N - ncpu(A)
+ * =>
+ * vecs(B) <= cpu(B)
+ *
+ * For nodes >= 3, it can be thought as one node and another big
+ * node given that is exactly what this algorithm is implemented,
+ * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
+ * finally for each node X: vecs(X) <= ncpu(X).
+ *
+ */
+ for (n = 0; n < nr_node_ids; n++) {
+ unsigned nvectors, ncpus;
+
+ if (node_vectors[n].ncpus == UINT_MAX)
+ continue;
+
+ WARN_ON_ONCE(numvecs == 0);
+
+ ncpus = node_vectors[n].ncpus;
+ nvectors = max_t(unsigned, 1,
+ numvecs * ncpus / remaining_ncpus);
+ WARN_ON_ONCE(nvectors > ncpus);
+
+ node_vectors[n].nvectors = nvectors;
+
+ remaining_ncpus -= ncpus;
+ numvecs -= nvectors;
+ }
+}
+
static int __irq_build_affinity_masks(unsigned int startvec,
unsigned int numvecs,
unsigned int firstvec,
@@ -102,10 +252,11 @@ static int __irq_build_affinity_masks(unsigned int startvec,
struct cpumask *nmsk,
struct irq_affinity_desc *masks)
{
- unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+ unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
unsigned int last_affv = firstvec + numvecs;
unsigned int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
+ struct node_vectors *node_vectors;
if (!cpumask_weight(cpu_mask))
return 0;
@@ -126,42 +277,56 @@ static int __irq_build_affinity_masks(unsigned int startvec,
return numvecs;
}
- for_each_node_mask(n, nodemsk) {
- unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
+ node_vectors = kcalloc(nr_node_ids,
+ sizeof(struct node_vectors),
+ GFP_KERNEL);
+ if (!node_vectors)
+ return -ENOMEM;
- /* Spread the vectors per node */
- vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
+ /* allocate vector number for each node */
+ alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
+ nodemsk, nmsk, node_vectors);
- /* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+ for (i = 0; i < nr_node_ids; i++) {
+ unsigned int ncpus, v;
+ struct node_vectors *nv = &node_vectors[i];
+
+ if (nv->nvectors == UINT_MAX)
+ continue;
- /* Calculate the number of cpus per vector */
+ /* Get the cpus on this node which are in the mask */
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
ncpus = cpumask_weight(nmsk);
- vecs_to_assign = min(vecs_per_node, ncpus);
+ if (!ncpus)
+ continue;
+
+ WARN_ON_ONCE(nv->nvectors > ncpus);
/* Account for rounding errors */
- extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
+ extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
- for (v = 0; curvec < last_affv && v < vecs_to_assign;
- curvec++, v++) {
- cpus_per_vec = ncpus / vecs_to_assign;
+ /* Spread allocated vectors on CPUs of the current node */
+ for (v = 0; v < nv->nvectors; v++, curvec++) {
+ cpus_per_vec = ncpus / nv->nvectors;
/* Account for extra vectors to compensate rounding errors */
if (extra_vecs) {
cpus_per_vec++;
--extra_vecs;
}
+
+ /*
+ * wrapping has to be considered given 'startvec'
+ * may start anywhere
+ */
+ if (curvec >= last_affv)
+ curvec = firstvec;
irq_spread_init_one(&masks[curvec].mask, nmsk,
cpus_per_vec);
}
-
- done += v;
- if (done >= numvecs)
- break;
- if (curvec >= last_affv)
- curvec = firstvec;
- --nodes;
+ done += nv->nvectors;
}
+ kfree(node_vectors);
return done;
}
@@ -174,7 +339,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
unsigned int firstvec,
struct irq_affinity_desc *masks)
{
- unsigned int curvec = startvec, nr_present, nr_others;
+ unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
@@ -189,15 +354,17 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
if (!node_to_cpumask)
goto fail_npresmsk;
- ret = 0;
/* Stabilize the cpumasks */
get_online_cpus();
build_node_to_cpumask(node_to_cpumask);
/* Spread on present CPUs starting from affd->pre_vectors */
- nr_present = __irq_build_affinity_masks(curvec, numvecs,
- firstvec, node_to_cpumask,
- cpu_present_mask, nmsk, masks);
+ ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+ node_to_cpumask, cpu_present_mask,
+ nmsk, masks);
+ if (ret < 0)
+ goto fail_build_affinity;
+ nr_present = ret;
/*
* Spread on non present CPUs starting from the next vector to be
@@ -210,12 +377,16 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
else
curvec = firstvec + nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- nr_others = __irq_build_affinity_masks(curvec, numvecs,
- firstvec, node_to_cpumask,
- npresmsk, nmsk, masks);
+ ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+ node_to_cpumask, npresmsk, nmsk,
+ masks);
+ if (ret >= 0)
+ nr_others = ret;
+
+ fail_build_affinity:
put_online_cpus();
- if (nr_present < numvecs)
+ if (ret >= 0)
WARN_ON(nr_present + nr_others < numvecs);
free_node_to_cpumask(node_to_cpumask);
@@ -225,7 +396,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
fail_nmsk:
free_cpumask_var(nmsk);
- return ret;
+ return ret < 0 ? ret : 0;
}
static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
@@ -251,11 +422,9 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
* Determine the number of vectors which need interrupt affinities
* assigned. If the pre/post request exhausts the available vectors
* then nothing to do here except for invoking the calc_sets()
- * callback so the device driver can adjust to the situation. If there
- * is only a single vector, then managing the queue is pointless as
- * well.
+ * callback so the device driver can adjust to the situation.
*/
- if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors)
+ if (nvecs > affd->pre_vectors + affd->post_vectors)
affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
else
affvecs = 0;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9484e88dabc2..9be995fc3c5a 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -295,6 +295,18 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
}
}
+static void irq_sysfs_del(struct irq_desc *desc)
+{
+ /*
+ * If irq_sysfs_init() has not yet been invoked (early boot), then
+ * irq_kobj_base is NULL and the descriptor was never added.
+ * kobject_del() complains about a object with no parent, so make
+ * it conditional.
+ */
+ if (irq_kobj_base)
+ kobject_del(&desc->kobj);
+}
+
static int __init irq_sysfs_init(void)
{
struct irq_desc *desc;
@@ -325,6 +337,7 @@ static struct kobj_type irq_kobj_type = {
};
static void irq_sysfs_add(int irq, struct irq_desc *desc) {}
+static void irq_sysfs_del(struct irq_desc *desc) {}
#endif /* CONFIG_SYSFS */
@@ -438,7 +451,7 @@ static void free_desc(unsigned int irq)
* The sysfs entry must be serialized against a concurrent
* irq_sysfs_init() as well.
*/
- kobject_del(&desc->kobj);
+ irq_sysfs_del(desc);
delete_irq_desc(irq);
/*
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3078d0e48bba..132672b74e4b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -31,7 +31,7 @@ struct irqchip_fwid {
struct fwnode_handle fwnode;
unsigned int type;
char *name;
- void *data;
+ phys_addr_t *pa;
};
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
@@ -62,7 +62,8 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
* domain struct.
*/
struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
- const char *name, void *data)
+ const char *name,
+ phys_addr_t *pa)
{
struct irqchip_fwid *fwid;
char *n;
@@ -77,7 +78,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
n = kasprintf(GFP_KERNEL, "%s-%d", name, id);
break;
default:
- n = kasprintf(GFP_KERNEL, "irqchip@%p", data);
+ n = kasprintf(GFP_KERNEL, "irqchip@%pa", pa);
break;
}
@@ -89,7 +90,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
fwid->type = type;
fwid->name = n;
- fwid->data = data;
+ fwid->pa = pa;
fwid->fwnode.ops = &irqchip_fwnode_ops;
return &fwid->fwnode;
}
@@ -148,6 +149,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
switch (fwid->type) {
case IRQCHIP_FWNODE_NAMED:
case IRQCHIP_FWNODE_NAMED_ID:
+ domain->fwnode = fwnode;
domain->name = kstrdup(fwid->name, GFP_KERNEL);
if (!domain->name) {
kfree(domain);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e8f7f179bf77..1753486b440c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -23,7 +23,7 @@
#include "internals.h"
-#ifdef CONFIG_IRQ_FORCED_THREADING
+#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT)
__read_mostly bool force_irqthreads;
EXPORT_SYMBOL_GPL(force_irqthreads);
@@ -1255,8 +1255,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
* the thread dies to avoid that the interrupt code
* references an already freed task_struct.
*/
- get_task_struct(t);
- new->thread = t;
+ new->thread = get_task_struct(t);
/*
* Tell the thread to set its affinity. This is
* important for shared interrupt handlers as we do
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index da9addb8d655..cfc4f088a0e7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -100,10 +100,6 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
return 0;
}
-#ifndef is_affinity_mask_valid
-#define is_affinity_mask_valid(val) 1
-#endif
-
int no_irq_affinity;
static int irq_affinity_proc_show(struct seq_file *m, void *v)
{
@@ -136,11 +132,6 @@ static ssize_t write_irq_affinity(int type, struct file *file,
if (err)
goto free_cpumask;
- if (!is_affinity_mask_valid(new_value)) {
- err = -EINVAL;
- goto free_cpumask;
- }
-
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
@@ -232,11 +223,6 @@ static ssize_t default_affinity_write(struct file *file,
if (err)
goto out;
- if (!is_affinity_mask_valid(new_value)) {
- err = -EINVAL;
- goto out;
- }
-
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 95414ad3506a..98c04ca5fa43 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -36,6 +36,8 @@ static void resend_irqs(unsigned long arg)
irq = find_first_bit(irqs_resend, nr_irqs);
clear_bit(irq, irqs_resend);
desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
local_irq_disable();
desc->handle_irq(desc);
local_irq_enable();
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index df3008419a1d..cdb3ffab128b 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -407,7 +407,9 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)
return false;
if (!kernel_text_address(jump_entry_code(entry))) {
- WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry));
+ WARN_ONCE(!jump_entry_is_init(entry),
+ "can't patch jump_label at %pS",
+ (void *)jump_entry_code(entry));
return false;
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 95a260f9214b..136ce049c4ad 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -263,8 +263,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
{
char namebuf[KSYM_NAME_LEN];
- if (is_ksym_addr(addr))
- return !!get_symbol_pos(addr, symbolsize, offset);
+ if (is_ksym_addr(addr)) {
+ get_symbol_pos(addr, symbolsize, offset);
+ return 1;
+ }
return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
!!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
}
diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c
new file mode 100644
index 000000000000..d3689632e8b9
--- /dev/null
+++ b/kernel/kexec_elf.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Load ELF vmlinux file for the kexec_file_load syscall.
+ *
+ * Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004 IBM Corp.
+ * Copyright (C) 2005 R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2016 IBM Corporation
+ *
+ * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c.
+ * Heavily modified for the kernel by
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>.
+ */
+
+#define pr_fmt(fmt) "kexec_elf: " fmt
+
+#include <linux/elf.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+static inline bool elf_is_elf_file(const struct elfhdr *ehdr)
+{
+ return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0;
+}
+
+static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le64_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be64_to_cpu(value);
+
+ return value;
+}
+
+static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le32_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be32_to_cpu(value);
+
+ return value;
+}
+
+static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le16_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be16_to_cpu(value);
+
+ return value;
+}
+
+/**
+ * elf_is_ehdr_sane - check that it is safe to use the ELF header
+ * @buf_len: size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len)
+{
+ if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) {
+ pr_debug("Bad program header size.\n");
+ return false;
+ } else if (ehdr->e_shnum > 0 &&
+ ehdr->e_shentsize != sizeof(struct elf_shdr)) {
+ pr_debug("Bad section header size.\n");
+ return false;
+ } else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT ||
+ ehdr->e_version != EV_CURRENT) {
+ pr_debug("Unknown ELF version.\n");
+ return false;
+ }
+
+ if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+ size_t phdr_size;
+
+ /*
+ * e_phnum is at most 65535 so calculating the size of the
+ * program header cannot overflow.
+ */
+ phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+ /* Sanity check the program header table location. */
+ if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) {
+ pr_debug("Program headers at invalid location.\n");
+ return false;
+ } else if (ehdr->e_phoff + phdr_size > buf_len) {
+ pr_debug("Program headers truncated.\n");
+ return false;
+ }
+ }
+
+ if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) {
+ size_t shdr_size;
+
+ /*
+ * e_shnum is at most 65536 so calculating
+ * the size of the section header cannot overflow.
+ */
+ shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum;
+
+ /* Sanity check the section header table location. */
+ if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) {
+ pr_debug("Section headers at invalid location.\n");
+ return false;
+ } else if (ehdr->e_shoff + shdr_size > buf_len) {
+ pr_debug("Section headers truncated.\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr)
+{
+ struct elfhdr *buf_ehdr;
+
+ if (len < sizeof(*buf_ehdr)) {
+ pr_debug("Buffer is too small to hold ELF header.\n");
+ return -ENOEXEC;
+ }
+
+ memset(ehdr, 0, sizeof(*ehdr));
+ memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident));
+ if (!elf_is_elf_file(ehdr)) {
+ pr_debug("No ELF header magic.\n");
+ return -ENOEXEC;
+ }
+
+ if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) {
+ pr_debug("Not a supported ELF class.\n");
+ return -ENOEXEC;
+ } else if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB &&
+ ehdr->e_ident[EI_DATA] != ELFDATA2MSB) {
+ pr_debug("Not a supported ELF data format.\n");
+ return -ENOEXEC;
+ }
+
+ buf_ehdr = (struct elfhdr *) buf;
+ if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) {
+ pr_debug("Bad ELF header size.\n");
+ return -ENOEXEC;
+ }
+
+ ehdr->e_type = elf16_to_cpu(ehdr, buf_ehdr->e_type);
+ ehdr->e_machine = elf16_to_cpu(ehdr, buf_ehdr->e_machine);
+ ehdr->e_version = elf32_to_cpu(ehdr, buf_ehdr->e_version);
+ ehdr->e_flags = elf32_to_cpu(ehdr, buf_ehdr->e_flags);
+ ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize);
+ ehdr->e_phnum = elf16_to_cpu(ehdr, buf_ehdr->e_phnum);
+ ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize);
+ ehdr->e_shnum = elf16_to_cpu(ehdr, buf_ehdr->e_shnum);
+ ehdr->e_shstrndx = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx);
+
+ switch (ehdr->e_ident[EI_CLASS]) {
+ case ELFCLASS64:
+ ehdr->e_entry = elf64_to_cpu(ehdr, buf_ehdr->e_entry);
+ ehdr->e_phoff = elf64_to_cpu(ehdr, buf_ehdr->e_phoff);
+ ehdr->e_shoff = elf64_to_cpu(ehdr, buf_ehdr->e_shoff);
+ break;
+
+ case ELFCLASS32:
+ ehdr->e_entry = elf32_to_cpu(ehdr, buf_ehdr->e_entry);
+ ehdr->e_phoff = elf32_to_cpu(ehdr, buf_ehdr->e_phoff);
+ ehdr->e_shoff = elf32_to_cpu(ehdr, buf_ehdr->e_shoff);
+ break;
+
+ default:
+ pr_debug("Unknown ELF class.\n");
+ return -EINVAL;
+ }
+
+ return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_is_phdr_sane - check that it is safe to use the program header
+ * @buf_len: size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len)
+{
+
+ if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) {
+ pr_debug("ELF segment location wraps around.\n");
+ return false;
+ } else if (phdr->p_offset + phdr->p_filesz > buf_len) {
+ pr_debug("ELF segment not in file.\n");
+ return false;
+ } else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) {
+ pr_debug("ELF segment address wraps around.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static int elf_read_phdr(const char *buf, size_t len,
+ struct kexec_elf_info *elf_info,
+ int idx)
+{
+ /* Override the const in proghdrs, we are the ones doing the loading. */
+ struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx];
+ const struct elfhdr *ehdr = elf_info->ehdr;
+ const char *pbuf;
+ struct elf_phdr *buf_phdr;
+
+ pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr));
+ buf_phdr = (struct elf_phdr *) pbuf;
+
+ phdr->p_type = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type);
+ phdr->p_flags = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags);
+
+ switch (ehdr->e_ident[EI_CLASS]) {
+ case ELFCLASS64:
+ phdr->p_offset = elf64_to_cpu(ehdr, buf_phdr->p_offset);
+ phdr->p_paddr = elf64_to_cpu(ehdr, buf_phdr->p_paddr);
+ phdr->p_vaddr = elf64_to_cpu(ehdr, buf_phdr->p_vaddr);
+ phdr->p_filesz = elf64_to_cpu(ehdr, buf_phdr->p_filesz);
+ phdr->p_memsz = elf64_to_cpu(ehdr, buf_phdr->p_memsz);
+ phdr->p_align = elf64_to_cpu(ehdr, buf_phdr->p_align);
+ break;
+
+ case ELFCLASS32:
+ phdr->p_offset = elf32_to_cpu(ehdr, buf_phdr->p_offset);
+ phdr->p_paddr = elf32_to_cpu(ehdr, buf_phdr->p_paddr);
+ phdr->p_vaddr = elf32_to_cpu(ehdr, buf_phdr->p_vaddr);
+ phdr->p_filesz = elf32_to_cpu(ehdr, buf_phdr->p_filesz);
+ phdr->p_memsz = elf32_to_cpu(ehdr, buf_phdr->p_memsz);
+ phdr->p_align = elf32_to_cpu(ehdr, buf_phdr->p_align);
+ break;
+
+ default:
+ pr_debug("Unknown ELF class.\n");
+ return -EINVAL;
+ }
+
+ return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_read_phdrs - read the program headers from the buffer
+ *
+ * This function assumes that the program header table was checked for sanity.
+ * Use elf_is_ehdr_sane() if it wasn't.
+ */
+static int elf_read_phdrs(const char *buf, size_t len,
+ struct kexec_elf_info *elf_info)
+{
+ size_t phdr_size, i;
+ const struct elfhdr *ehdr = elf_info->ehdr;
+
+ /*
+ * e_phnum is at most 65535 so calculating the size of the
+ * program header cannot overflow.
+ */
+ phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+ elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL);
+ if (!elf_info->proghdrs)
+ return -ENOMEM;
+
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ int ret;
+
+ ret = elf_read_phdr(buf, len, elf_info, i);
+ if (ret) {
+ kfree(elf_info->proghdrs);
+ elf_info->proghdrs = NULL;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info
+ * @buf: Buffer to read ELF file from.
+ * @len: Size of @buf.
+ * @ehdr: Pointer to existing struct which will be populated.
+ * @elf_info: Pointer to existing struct which will be populated.
+ *
+ * This function allows reading ELF files with different byte order than
+ * the kernel, byte-swapping the fields as needed.
+ *
+ * Return:
+ * On success returns 0, and the caller should call
+ * kexec_free_elf_info(elf_info) to free the memory allocated for the section
+ * and program headers.
+ */
+static int elf_read_from_buffer(const char *buf, size_t len,
+ struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info)
+{
+ int ret;
+
+ ret = elf_read_ehdr(buf, len, ehdr);
+ if (ret)
+ return ret;
+
+ elf_info->buffer = buf;
+ elf_info->ehdr = ehdr;
+ if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+ ret = elf_read_phdrs(buf, len, elf_info);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * kexec_free_elf_info - free memory allocated by elf_read_from_buffer
+ */
+void kexec_free_elf_info(struct kexec_elf_info *elf_info)
+{
+ kfree(elf_info->proghdrs);
+ memset(elf_info, 0, sizeof(*elf_info));
+}
+/**
+ * kexec_build_elf_info - read ELF executable and check that we can use it
+ */
+int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info)
+{
+ int i;
+ int ret;
+
+ ret = elf_read_from_buffer(buf, len, ehdr, elf_info);
+ if (ret)
+ return ret;
+
+ /* Big endian vmlinux has type ET_DYN. */
+ if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) {
+ pr_err("Not an ELF executable.\n");
+ goto error;
+ } else if (!elf_info->proghdrs) {
+ pr_err("No ELF program header.\n");
+ goto error;
+ }
+
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ /*
+ * Kexec does not support loading interpreters.
+ * In addition this check keeps us from attempting
+ * to kexec ordinay executables.
+ */
+ if (elf_info->proghdrs[i].p_type == PT_INTERP) {
+ pr_err("Requires an ELF interpreter.\n");
+ goto error;
+ }
+ }
+
+ return 0;
+error:
+ kexec_free_elf_info(elf_info);
+ return -ENOEXEC;
+}
+
+
+int kexec_elf_probe(const char *buf, unsigned long len)
+{
+ struct elfhdr ehdr;
+ struct kexec_elf_info elf_info;
+ int ret;
+
+ ret = kexec_build_elf_info(buf, len, &ehdr, &elf_info);
+ if (ret)
+ return ret;
+
+ kexec_free_elf_info(&elf_info);
+
+ return elf_check_arch(&ehdr) ? 0 : -ENOEXEC;
+}
+
+/**
+ * kexec_elf_load - load ELF executable image
+ * @lowest_load_addr: On return, will be the address where the first PT_LOAD
+ * section will be loaded in memory.
+ *
+ * Return:
+ * 0 on success, negative value on failure.
+ */
+int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info,
+ struct kexec_buf *kbuf,
+ unsigned long *lowest_load_addr)
+{
+ unsigned long lowest_addr = UINT_MAX;
+ int ret;
+ size_t i;
+
+ /* Read in the PT_LOAD segments. */
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ unsigned long load_addr;
+ size_t size;
+ const struct elf_phdr *phdr;
+
+ phdr = &elf_info->proghdrs[i];
+ if (phdr->p_type != PT_LOAD)
+ continue;
+
+ size = phdr->p_filesz;
+ if (size > phdr->p_memsz)
+ size = phdr->p_memsz;
+
+ kbuf->buffer = (void *) elf_info->buffer + phdr->p_offset;
+ kbuf->bufsz = size;
+ kbuf->memsz = phdr->p_memsz;
+ kbuf->buf_align = phdr->p_align;
+ kbuf->buf_min = phdr->p_paddr;
+ kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
+ ret = kexec_add_buffer(kbuf);
+ if (ret)
+ goto out;
+ load_addr = kbuf->mem;
+
+ if (load_addr < lowest_addr)
+ lowest_addr = load_addr;
+ }
+
+ *lowest_load_addr = lowest_addr;
+ ret = 0;
+ out:
+ return ret;
+}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9873fc627d61..1b66ccbb744a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -470,6 +470,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
*/
static void do_optimize_kprobes(void)
{
+ lockdep_assert_held(&text_mutex);
/*
* The optimization/unoptimization refers online_cpus via
* stop_machine() and cpu-hotplug modifies online_cpus.
@@ -487,9 +488,7 @@ static void do_optimize_kprobes(void)
list_empty(&optimizing_list))
return;
- mutex_lock(&text_mutex);
arch_optimize_kprobes(&optimizing_list);
- mutex_unlock(&text_mutex);
}
/*
@@ -500,6 +499,7 @@ static void do_unoptimize_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
+ lockdep_assert_held(&text_mutex);
/* See comment in do_optimize_kprobes() */
lockdep_assert_cpus_held();
@@ -507,7 +507,6 @@ static void do_unoptimize_kprobes(void)
if (list_empty(&unoptimizing_list))
return;
- mutex_lock(&text_mutex);
arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
/* Loop free_list for disarming */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
@@ -524,7 +523,6 @@ static void do_unoptimize_kprobes(void)
} else
list_del_init(&op->list);
}
- mutex_unlock(&text_mutex);
}
/* Reclaim all kprobes on the free_list */
@@ -556,6 +554,7 @@ static void kprobe_optimizer(struct work_struct *work)
{
mutex_lock(&kprobe_mutex);
cpus_read_lock();
+ mutex_lock(&text_mutex);
/* Lock modules while optimizing kprobes */
mutex_lock(&module_mutex);
@@ -583,6 +582,7 @@ static void kprobe_optimizer(struct work_struct *work)
do_free_cleaned_kprobes();
mutex_unlock(&module_mutex);
+ mutex_unlock(&text_mutex);
cpus_read_unlock();
mutex_unlock(&kprobe_mutex);
@@ -1514,7 +1514,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
/* Ensure it is not in reserved area nor out of text */
if (!kernel_text_address((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
- jump_label_text_reserved(p->addr, p->addr)) {
+ jump_label_text_reserved(p->addr, p->addr) ||
+ find_bug((unsigned long)p->addr)) {
ret = -EINVAL;
goto out;
}
@@ -1906,7 +1907,7 @@ int register_kretprobe(struct kretprobe *rp)
/* Pre-allocate memory for max kretprobe instances */
if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#else
rp->maxactive = num_possible_cpus();
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 341f52117f88..233459c03b5a 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -448,34 +448,102 @@ static void print_lockdep_off(const char *bug_msg)
unsigned long nr_stack_trace_entries;
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+#ifdef CONFIG_PROVE_LOCKING
+/**
+ * struct lock_trace - single stack backtrace
+ * @hash_entry: Entry in a stack_trace_hash[] list.
+ * @hash: jhash() of @entries.
+ * @nr_entries: Number of entries in @entries.
+ * @entries: Actual stack backtrace.
+ */
+struct lock_trace {
+ struct hlist_node hash_entry;
+ u32 hash;
+ u32 nr_entries;
+ unsigned long entries[0] __aligned(sizeof(unsigned long));
+};
+#define LOCK_TRACE_SIZE_IN_LONGS \
+ (sizeof(struct lock_trace) / sizeof(unsigned long))
/*
- * Stack-trace: tightly packed array of stack backtrace
- * addresses. Protected by the graph_lock.
+ * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock.
*/
static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE];
+
+static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2)
+{
+ return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries &&
+ memcmp(t1->entries, t2->entries,
+ t1->nr_entries * sizeof(t1->entries[0])) == 0;
+}
-static int save_trace(struct lock_trace *trace)
+static struct lock_trace *save_trace(void)
{
- unsigned long *entries = stack_trace + nr_stack_trace_entries;
+ struct lock_trace *trace, *t2;
+ struct hlist_head *hash_head;
+ u32 hash;
unsigned int max_entries;
- trace->offset = nr_stack_trace_entries;
- max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
- trace->nr_entries = stack_trace_save(entries, max_entries, 3);
- nr_stack_trace_entries += trace->nr_entries;
+ BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE);
+ BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES);
- if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+ trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries);
+ max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries -
+ LOCK_TRACE_SIZE_IN_LONGS;
+ trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3);
+
+ if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES -
+ LOCK_TRACE_SIZE_IN_LONGS - 1) {
if (!debug_locks_off_graph_unlock())
- return 0;
+ return NULL;
print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
- return 0;
+ return NULL;
}
- return 1;
+ hash = jhash(trace->entries, trace->nr_entries *
+ sizeof(trace->entries[0]), 0);
+ trace->hash = hash;
+ hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1));
+ hlist_for_each_entry(t2, hash_head, hash_entry) {
+ if (traces_identical(trace, t2))
+ return t2;
+ }
+ nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries;
+ hlist_add_head(&trace->hash_entry, hash_head);
+
+ return trace;
+}
+
+/* Return the number of stack traces in the stack_trace[] array. */
+u64 lockdep_stack_trace_count(void)
+{
+ struct lock_trace *trace;
+ u64 c = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) {
+ hlist_for_each_entry(trace, &stack_trace_hash[i], hash_entry) {
+ c++;
+ }
+ }
+
+ return c;
+}
+
+/* Return the number of stack hash chains that have at least one stack trace. */
+u64 lockdep_stack_hash_count(void)
+{
+ u64 c = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++)
+ if (!hlist_empty(&stack_trace_hash[i]))
+ c++;
+
+ return c;
}
#endif
@@ -491,7 +559,7 @@ unsigned int max_lockdep_depth;
DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
#endif
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+#ifdef CONFIG_PROVE_LOCKING
/*
* Locking printouts:
*/
@@ -511,7 +579,7 @@ static const char *usage_str[] =
};
#endif
-const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
+const char *__get_key_name(const struct lockdep_subclass_key *key, char *str)
{
return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
}
@@ -620,7 +688,7 @@ static void print_lock(struct held_lock *hlock)
return;
}
- printk(KERN_CONT "%p", hlock->instance);
+ printk(KERN_CONT "%px", hlock->instance);
print_lock_name(lock);
printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
@@ -1235,7 +1303,7 @@ static struct lock_list *alloc_list_entry(void)
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
unsigned long ip, int distance,
- struct lock_trace *trace)
+ const struct lock_trace *trace)
{
struct lock_list *entry;
/*
@@ -1249,7 +1317,7 @@ static int add_lock_to_list(struct lock_class *this,
entry->class = this;
entry->links_to = links_to;
entry->distance = distance;
- entry->trace = *trace;
+ entry->trace = trace;
/*
* Both allocation and removal are done under the graph lock; but
* iteration is under RCU-sched; see look_up_lock_class() and
@@ -1470,11 +1538,10 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
}
-static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
+static void print_lock_trace(const struct lock_trace *trace,
+ unsigned int spaces)
{
- unsigned long *entries = stack_trace + trace->offset;
-
- stack_trace_print(entries, trace->nr_entries, spaces);
+ stack_trace_print(trace->entries, trace->nr_entries, spaces);
}
/*
@@ -1489,7 +1556,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
printk("\n-> #%u", depth);
print_lock_name(target->class);
printk(KERN_CONT ":\n");
- print_lock_trace(&target->trace, 6);
+ print_lock_trace(target->trace, 6);
}
static void
@@ -1592,7 +1659,8 @@ static noinline void print_circular_bug(struct lock_list *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
- if (!save_trace(&this->trace))
+ this->trace = save_trace();
+ if (!this->trace)
return;
depth = get_lock_depth(target);
@@ -1715,7 +1783,7 @@ check_path(struct lock_class *target, struct lock_list *src_entry,
*/
static noinline int
check_noncircular(struct held_lock *src, struct held_lock *target,
- struct lock_trace *trace)
+ struct lock_trace **const trace)
{
int ret;
struct lock_list *uninitialized_var(target_entry);
@@ -1729,13 +1797,13 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
ret = check_path(hlock_class(target), &src_entry, &target_entry);
if (unlikely(!ret)) {
- if (!trace->nr_entries) {
+ if (!*trace) {
/*
* If save_trace fails here, the printing might
* trigger a WARN but because of the !nr_entries it
* should not do bad things.
*/
- save_trace(trace);
+ *trace = save_trace();
}
print_circular_bug(&src_entry, target_entry, src, target);
@@ -1859,7 +1927,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
len += printk("%*s %s", depth, "", usage_str[bit]);
len += printk(KERN_CONT " at:\n");
- print_lock_trace(class->usage_traces + bit, len);
+ print_lock_trace(class->usage_traces[bit], len);
}
}
printk("%*s }\n", depth, "");
@@ -1884,7 +1952,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
do {
print_lock_class_header(entry->class, depth);
printk("%*s ... acquired at:\n", depth, "");
- print_lock_trace(&entry->trace, 2);
+ print_lock_trace(entry->trace, 2);
printk("\n");
if (depth == 0 && (entry != root)) {
@@ -1995,14 +2063,14 @@ print_bad_irq_dependency(struct task_struct *curr,
print_lock_name(backwards_entry->class);
pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
- print_lock_trace(backwards_entry->class->usage_traces + bit1, 1);
+ print_lock_trace(backwards_entry->class->usage_traces[bit1], 1);
pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
print_lock_name(forwards_entry->class);
pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
pr_warn("...");
- print_lock_trace(forwards_entry->class->usage_traces + bit2, 1);
+ print_lock_trace(forwards_entry->class->usage_traces[bit2], 1);
pr_warn("\nother info that might help us debug this:\n\n");
print_irq_lock_scenario(backwards_entry, forwards_entry,
@@ -2011,13 +2079,15 @@ print_bad_irq_dependency(struct task_struct *curr,
lockdep_print_held_locks(curr);
pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
- if (!save_trace(&prev_root->trace))
+ prev_root->trace = save_trace();
+ if (!prev_root->trace)
return;
print_shortest_lock_dependencies(backwards_entry, prev_root);
pr_warn("\nthe dependencies between the lock to be acquired");
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
- if (!save_trace(&next_root->trace))
+ next_root->trace = save_trace();
+ if (!next_root->trace)
return;
print_shortest_lock_dependencies(forwards_entry, next_root);
@@ -2369,7 +2439,8 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, struct lock_trace *trace)
+ struct held_lock *next, int distance,
+ struct lock_trace **const trace)
{
struct lock_list *entry;
int ret;
@@ -2444,8 +2515,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return ret;
#endif
- if (!trace->nr_entries && !save_trace(trace))
- return 0;
+ if (!*trace) {
+ *trace = save_trace();
+ if (!*trace)
+ return 0;
+ }
/*
* Ok, all validations passed, add the new lock
@@ -2453,14 +2527,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
*/
ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
&hlock_class(prev)->locks_after,
- next->acquire_ip, distance, trace);
+ next->acquire_ip, distance, *trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
&hlock_class(next)->locks_before,
- next->acquire_ip, distance, trace);
+ next->acquire_ip, distance, *trace);
if (!ret)
return 0;
@@ -2476,7 +2550,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
- struct lock_trace trace = { .nr_entries = 0 };
+ struct lock_trace *trace = NULL;
int depth = curr->lockdep_depth;
struct held_lock *hlock;
@@ -2969,7 +3043,7 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+#ifdef CONFIG_PROVE_LOCKING
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit);
@@ -3015,7 +3089,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
print_lock(this);
pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
- print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+ print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1);
print_irqtrace_events(curr);
pr_warn("\nother info that might help us debug this:\n");
@@ -3096,7 +3170,8 @@ print_irq_inversion_bug(struct task_struct *curr,
lockdep_print_held_locks(curr);
pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
- if (!save_trace(&root->trace))
+ root->trace = save_trace();
+ if (!root->trace)
return;
print_shortest_lock_dependencies(other, root);
@@ -3580,7 +3655,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
hlock_class(this)->usage_mask |= new_mask;
- if (!save_trace(hlock_class(this)->usage_traces + new_bit))
+ if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))
return 0;
switch (new_bit) {
@@ -3608,7 +3683,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
-#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+#else /* CONFIG_PROVE_LOCKING */
static inline int
mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
@@ -3627,7 +3702,7 @@ static inline int separate_irq_context(struct task_struct *curr,
return 0;
}
-#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+#endif /* CONFIG_PROVE_LOCKING */
/*
* Initialize a lock instance's lock-class mapping info:
@@ -4321,8 +4396,7 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie
*/
static void check_flags(unsigned long flags)
{
-#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
- defined(CONFIG_TRACE_IRQFLAGS)
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP)
if (!debug_locks)
return;
@@ -5158,6 +5232,12 @@ void __init lockdep_init(void)
) / 1024
);
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+ printk(" memory used for stack traces: %zu kB\n",
+ (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024
+ );
+#endif
+
printk(" per task-struct memory footprint: %zu bytes\n",
sizeof(((struct task_struct *)NULL)->held_locks));
}
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index cc83568d5012..18d85aebbb57 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -92,6 +92,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_LOCKDEP_ENTRIES 16384UL
#define MAX_LOCKDEP_CHAINS_BITS 15
#define MAX_STACK_TRACE_ENTRIES 262144UL
+#define STACK_TRACE_HASH_SIZE 8192
#else
#define MAX_LOCKDEP_ENTRIES 32768UL
@@ -102,6 +103,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
* addresses. Protected by the hash_lock.
*/
#define MAX_STACK_TRACE_ENTRIES 524288UL
+#define STACK_TRACE_HASH_SIZE 16384
#endif
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
@@ -116,7 +118,8 @@ extern struct lock_chain lock_chains[];
extern void get_usage_chars(struct lock_class *class,
char usage[LOCK_USAGE_CHARS]);
-extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+extern const char *__get_key_name(const struct lockdep_subclass_key *key,
+ char *str);
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
@@ -137,6 +140,10 @@ extern unsigned int max_bfs_queue_depth;
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#ifdef CONFIG_TRACE_IRQFLAGS
+u64 lockdep_stack_trace_count(void);
+u64 lockdep_stack_hash_count(void);
+#endif
#else
static inline unsigned long
lockdep_count_forward_deps(struct lock_class *class)
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 65b6a1600c8f..dadb7b7fba37 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -200,7 +200,6 @@ static void lockdep_stats_debug_show(struct seq_file *m)
static int lockdep_stats_show(struct seq_file *m, void *v)
{
- struct lock_class *class;
unsigned long nr_unused = 0, nr_uncategorized = 0,
nr_irq_safe = 0, nr_irq_unsafe = 0,
nr_softirq_safe = 0, nr_softirq_unsafe = 0,
@@ -211,6 +210,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
sum_forward_deps = 0;
#ifdef CONFIG_PROVE_LOCKING
+ struct lock_class *class;
+
list_for_each_entry(class, &all_lock_classes, lock_entry) {
if (class->usage_mask == 0)
@@ -284,6 +285,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_process_chains);
seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+ seq_printf(m, " number of stack traces: %llu\n",
+ lockdep_stack_trace_count());
+ seq_printf(m, " number of stack hash chains: %llu\n",
+ lockdep_stack_hash_count());
+#endif
seq_printf(m, " combined max dependencies: %11u\n",
(nr_hardirq_chains + 1) *
(nr_softirq_chains + 1) *
@@ -398,7 +405,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
{
- struct lockdep_subclass_key *ckey;
+ const struct lockdep_subclass_key *ckey;
struct lock_class_stats *stats;
struct lock_class *class;
const char *cname;
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index edd1c082dbf5..468a9b8422e3 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -65,11 +65,37 @@ EXPORT_SYMBOL(__mutex_init);
#define MUTEX_FLAGS 0x07
+/*
+ * Internal helper function; C doesn't allow us to hide it :/
+ *
+ * DO NOT USE (outside of mutex code).
+ */
+static inline struct task_struct *__mutex_owner(struct mutex *lock)
+{
+ return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
+}
+
static inline struct task_struct *__owner_task(unsigned long owner)
{
return (struct task_struct *)(owner & ~MUTEX_FLAGS);
}
+bool mutex_is_locked(struct mutex *lock)
+{
+ return __mutex_owner(lock) != NULL;
+}
+EXPORT_SYMBOL(mutex_is_locked);
+
+__must_check enum mutex_trylock_recursive_enum
+mutex_trylock_recursive(struct mutex *lock)
+{
+ if (unlikely(__mutex_owner(lock) == current))
+ return MUTEX_TRYLOCK_RECURSIVE;
+
+ return mutex_trylock(lock);
+}
+EXPORT_SYMBOL(mutex_trylock_recursive);
+
static inline unsigned long __owner_flags(unsigned long owner)
{
return owner & MUTEX_FLAGS;
@@ -908,6 +934,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
might_sleep();
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+#endif
+
ww = container_of(lock, struct ww_mutex, base);
if (use_ww_ctx && ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
@@ -1379,8 +1409,13 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
*/
int __sched mutex_trylock(struct mutex *lock)
{
- bool locked = __mutex_trylock(lock);
+ bool locked;
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+#endif
+ locked = __mutex_trylock(lock);
if (locked)
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index fa83d36e30c6..2874bf556162 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -628,8 +628,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/* [10] Grab the next task, i.e. owner of @lock */
- task = rt_mutex_owner(lock);
- get_task_struct(task);
+ task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
/*
@@ -709,8 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/* [10] Grab the next task, i.e. the owner of @lock */
- task = rt_mutex_owner(lock);
- get_task_struct(task);
+ task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
/* [11] requeue the pi waiters if necessary */
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 37524a47f002..eef04551eae7 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -105,8 +105,9 @@
#ifdef CONFIG_DEBUG_RWSEMS
# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
if (!debug_locks_silent && \
- WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
+ WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
#c, atomic_long_read(&(sem)->count), \
+ (unsigned long) sem->magic, \
atomic_long_read(&(sem)->owner), (long)current, \
list_empty(&(sem)->wait_list) ? "" : "not ")) \
debug_locks_off(); \
@@ -330,6 +331,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
+#ifdef CONFIG_DEBUG_RWSEMS
+ sem->magic = sem;
+#endif
atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
@@ -666,7 +670,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
preempt_disable();
rcu_read_lock();
owner = rwsem_owner_flags(sem, &flags);
- if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner)))
+ /*
+ * Don't check the read-owner as the entry may be stale.
+ */
+ if ((flags & nonspinnable) ||
+ (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
rcu_read_unlock();
preempt_enable();
@@ -720,11 +728,12 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
rcu_read_lock();
for (;;) {
- if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) {
- state = OWNER_NONSPINNABLE;
- break;
- }
-
+ /*
+ * When a waiting writer set the handoff flag, it may spin
+ * on the owner as well. Once that writer acquires the lock,
+ * we can spin on it. So we don't need to quit even when the
+ * handoff bit is set.
+ */
new = rwsem_owner_flags(sem, &new_flags);
if ((new != owner) || (new_flags != flags)) {
state = rwsem_owner_state(new, new_flags, nonspinnable);
@@ -970,6 +979,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
{
return false;
}
+
+static inline int
+rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
+{
+ return 0;
+}
+#define OWNER_NULL 1
#endif
/*
@@ -1000,6 +1016,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
adjustment = 0;
if (rwsem_optimistic_spin(sem, false)) {
+ /* rwsem_optimistic_spin() implies ACQUIRE on success */
/*
* Wake up other readers in the wait list if the front
* waiter is a reader.
@@ -1014,6 +1031,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
}
return sem;
} else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) {
+ /* rwsem_reader_phase_trylock() implies ACQUIRE on success */
return sem;
}
@@ -1032,6 +1050,8 @@ queue:
*/
if (adjustment && !(atomic_long_read(&sem->count) &
(RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ /* Provide lock ACQUIRE */
+ smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
rwsem_set_reader_owned(sem);
lockevent_inc(rwsem_rlock_fast);
@@ -1065,15 +1085,18 @@ queue:
wake_up_q(&wake_q);
/* wait to be given the lock */
- while (true) {
+ for (;;) {
set_current_state(state);
- if (!waiter.task)
+ if (!smp_load_acquire(&waiter.task)) {
+ /* Matches rwsem_mark_wake()'s smp_store_release(). */
break;
+ }
if (signal_pending_state(state, current)) {
raw_spin_lock_irq(&sem->wait_lock);
if (waiter.task)
goto out_nolock;
raw_spin_unlock_irq(&sem->wait_lock);
+ /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
break;
}
schedule();
@@ -1083,6 +1106,7 @@ queue:
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
return sem;
+
out_nolock:
list_del(&waiter.list);
if (list_empty(&sem->wait_list)) {
@@ -1123,8 +1147,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
/* do optimistic spinning and steal lock if possible */
if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) &&
- rwsem_optimistic_spin(sem, true))
+ rwsem_optimistic_spin(sem, true)) {
+ /* rwsem_optimistic_spin() implies ACQUIRE on success */
return sem;
+ }
/*
* Disable reader optimistic spinning for this rwsem after
@@ -1184,12 +1210,26 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
wait:
/* wait until we successfully acquire the lock */
set_current_state(state);
- while (true) {
- if (rwsem_try_write_lock(sem, wstate))
+ for (;;) {
+ if (rwsem_try_write_lock(sem, wstate)) {
+ /* rwsem_try_write_lock() implies ACQUIRE on success */
break;
+ }
raw_spin_unlock_irq(&sem->wait_lock);
+ /*
+ * After setting the handoff bit and failing to acquire
+ * the lock, attempt to spin on owner to accelerate lock
+ * transfer. If the previous owner is a on-cpu writer and it
+ * has just released the lock, OWNER_NULL will be returned.
+ * In this case, we attempt to acquire the lock again
+ * without sleeping.
+ */
+ if ((wstate == WRITER_HANDOFF) &&
+ (rwsem_spin_on_owner(sem, 0) == OWNER_NULL))
+ goto trylock_again;
+
/* Block until there are no active lockers. */
for (;;) {
if (signal_pending_state(state, current))
@@ -1224,7 +1264,7 @@ wait:
break;
}
}
-
+trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
@@ -1322,11 +1362,14 @@ static inline int __down_read_killable(struct rw_semaphore *sem)
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+
/*
* Optimize for the case when the rwsem is not locked at all.
*/
- long tmp = RWSEM_UNLOCKED_VALUE;
-
+ tmp = RWSEM_UNLOCKED_VALUE;
do {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
tmp + RWSEM_READER_BIAS)) {
@@ -1367,8 +1410,11 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
- long tmp = RWSEM_UNLOCKED_VALUE;
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ tmp = RWSEM_UNLOCKED_VALUE;
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
RWSEM_WRITER_LOCKED)) {
rwsem_set_owner(sem);
@@ -1384,7 +1430,9 @@ inline void __up_read(struct rw_semaphore *sem)
{
long tmp;
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
@@ -1402,12 +1450,14 @@ static inline void __up_write(struct rw_semaphore *sem)
{
long tmp;
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
/*
* sem->owner may differ from current if the ownership is transferred
* to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
*/
DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
!rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
diff --git a/kernel/memremap.c b/kernel/memremap.c
deleted file mode 100644
index 6ee03a816d67..000000000000
--- a/kernel/memremap.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
-#include <linux/device.h>
-#include <linux/io.h>
-#include <linux/kasan.h>
-#include <linux/memory_hotplug.h>
-#include <linux/mm.h>
-#include <linux/pfn_t.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
-#include <linux/types.h>
-#include <linux/wait_bit.h>
-#include <linux/xarray.h>
-
-static DEFINE_XARRAY(pgmap_array);
-#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
-#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-
-#ifdef CONFIG_DEV_PAGEMAP_OPS
-DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
-EXPORT_SYMBOL(devmap_managed_key);
-static atomic_t devmap_managed_enable;
-
-static void devmap_managed_enable_put(void *data)
-{
- if (atomic_dec_and_test(&devmap_managed_enable))
- static_branch_disable(&devmap_managed_key);
-}
-
-static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
-{
- if (!pgmap->ops || !pgmap->ops->page_free) {
- WARN(1, "Missing page_free method\n");
- return -EINVAL;
- }
-
- if (atomic_inc_return(&devmap_managed_enable) == 1)
- static_branch_enable(&devmap_managed_key);
- return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL);
-}
-#else
-static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
-{
- return -EINVAL;
-}
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
-
-static void pgmap_array_delete(struct resource *res)
-{
- xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
- NULL, GFP_KERNEL);
- synchronize_rcu();
-}
-
-static unsigned long pfn_first(struct dev_pagemap *pgmap)
-{
- return PHYS_PFN(pgmap->res.start) +
- vmem_altmap_offset(pgmap_altmap(pgmap));
-}
-
-static unsigned long pfn_end(struct dev_pagemap *pgmap)
-{
- const struct resource *res = &pgmap->res;
-
- return (res->start + resource_size(res)) >> PAGE_SHIFT;
-}
-
-static unsigned long pfn_next(unsigned long pfn)
-{
- if (pfn % 1024 == 0)
- cond_resched();
- return pfn + 1;
-}
-
-#define for_each_device_pfn(pfn, map) \
- for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
-
-static void dev_pagemap_kill(struct dev_pagemap *pgmap)
-{
- if (pgmap->ops && pgmap->ops->kill)
- pgmap->ops->kill(pgmap);
- else
- percpu_ref_kill(pgmap->ref);
-}
-
-static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
-{
- if (pgmap->ops && pgmap->ops->cleanup) {
- pgmap->ops->cleanup(pgmap);
- } else {
- wait_for_completion(&pgmap->done);
- percpu_ref_exit(pgmap->ref);
- }
-}
-
-static void devm_memremap_pages_release(void *data)
-{
- struct dev_pagemap *pgmap = data;
- struct device *dev = pgmap->dev;
- struct resource *res = &pgmap->res;
- unsigned long pfn;
- int nid;
-
- dev_pagemap_kill(pgmap);
- for_each_device_pfn(pfn, pgmap)
- put_page(pfn_to_page(pfn));
- dev_pagemap_cleanup(pgmap);
-
- /* pages are dead and unused, undo the arch mapping */
- nid = page_to_nid(pfn_to_page(PHYS_PFN(res->start)));
-
- mem_hotplug_begin();
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- pfn = PHYS_PFN(res->start);
- __remove_pages(page_zone(pfn_to_page(pfn)), pfn,
- PHYS_PFN(resource_size(res)), NULL);
- } else {
- arch_remove_memory(nid, res->start, resource_size(res),
- pgmap_altmap(pgmap));
- kasan_remove_zero_shadow(__va(res->start), resource_size(res));
- }
- mem_hotplug_done();
-
- untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
- pgmap_array_delete(res);
- dev_WARN_ONCE(dev, pgmap->altmap.alloc,
- "%s: failed to free all reserved pages\n", __func__);
-}
-
-static void dev_pagemap_percpu_release(struct percpu_ref *ref)
-{
- struct dev_pagemap *pgmap =
- container_of(ref, struct dev_pagemap, internal_ref);
-
- complete(&pgmap->done);
-}
-
-/**
- * devm_memremap_pages - remap and provide memmap backing for the given resource
- * @dev: hosting device for @res
- * @pgmap: pointer to a struct dev_pagemap
- *
- * Notes:
- * 1/ At a minimum the res and type members of @pgmap must be initialized
- * by the caller before passing it to this function
- *
- * 2/ The altmap field may optionally be initialized, in which case
- * PGMAP_ALTMAP_VALID must be set in pgmap->flags.
- *
- * 3/ The ref field may optionally be provided, in which pgmap->ref must be
- * 'live' on entry and will be killed and reaped at
- * devm_memremap_pages_release() time, or if this routine fails.
- *
- * 4/ res is expected to be a host memory range that could feasibly be
- * treated as a "System RAM" range, i.e. not a device mmio range, but
- * this is not enforced.
- */
-void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
-{
- struct resource *res = &pgmap->res;
- struct dev_pagemap *conflict_pgmap;
- struct mhp_restrictions restrictions = {
- /*
- * We do not want any optional features only our own memmap
- */
- .altmap = pgmap_altmap(pgmap),
- };
- pgprot_t pgprot = PAGE_KERNEL;
- int error, nid, is_ram;
- bool need_devmap_managed = true;
-
- switch (pgmap->type) {
- case MEMORY_DEVICE_PRIVATE:
- if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
- WARN(1, "Device private memory not supported\n");
- return ERR_PTR(-EINVAL);
- }
- if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
- WARN(1, "Missing migrate_to_ram method\n");
- return ERR_PTR(-EINVAL);
- }
- break;
- case MEMORY_DEVICE_FS_DAX:
- if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
- IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
- WARN(1, "File system DAX not supported\n");
- return ERR_PTR(-EINVAL);
- }
- break;
- case MEMORY_DEVICE_DEVDAX:
- case MEMORY_DEVICE_PCI_P2PDMA:
- need_devmap_managed = false;
- break;
- default:
- WARN(1, "Invalid pgmap type %d\n", pgmap->type);
- break;
- }
-
- if (!pgmap->ref) {
- if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
- return ERR_PTR(-EINVAL);
-
- init_completion(&pgmap->done);
- error = percpu_ref_init(&pgmap->internal_ref,
- dev_pagemap_percpu_release, 0, GFP_KERNEL);
- if (error)
- return ERR_PTR(error);
- pgmap->ref = &pgmap->internal_ref;
- } else {
- if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
- WARN(1, "Missing reference count teardown definition\n");
- return ERR_PTR(-EINVAL);
- }
- }
-
- if (need_devmap_managed) {
- error = devmap_managed_enable_get(dev, pgmap);
- if (error)
- return ERR_PTR(error);
- }
-
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL);
- if (conflict_pgmap) {
- dev_WARN(dev, "Conflicting mapping in same section\n");
- put_dev_pagemap(conflict_pgmap);
- error = -ENOMEM;
- goto err_array;
- }
-
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL);
- if (conflict_pgmap) {
- dev_WARN(dev, "Conflicting mapping in same section\n");
- put_dev_pagemap(conflict_pgmap);
- error = -ENOMEM;
- goto err_array;
- }
-
- is_ram = region_intersects(res->start, resource_size(res),
- IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
-
- if (is_ram != REGION_DISJOINT) {
- WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
- is_ram == REGION_MIXED ? "mixed" : "ram", res);
- error = -ENXIO;
- goto err_array;
- }
-
- pgmap->dev = dev;
-
- error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
- PHYS_PFN(res->end), pgmap, GFP_KERNEL));
- if (error)
- goto err_array;
-
- nid = dev_to_node(dev);
- if (nid < 0)
- nid = numa_mem_id();
-
- error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0,
- resource_size(res));
- if (error)
- goto err_pfn_remap;
-
- mem_hotplug_begin();
-
- /*
- * For device private memory we call add_pages() as we only need to
- * allocate and initialize struct page for the device memory. More-
- * over the device memory is un-accessible thus we do not want to
- * create a linear mapping for the memory like arch_add_memory()
- * would do.
- *
- * For all other device memory types, which are accessible by
- * the CPU, we do want the linear mapping and thus use
- * arch_add_memory().
- */
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- error = add_pages(nid, PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), &restrictions);
- } else {
- error = kasan_add_zero_shadow(__va(res->start), resource_size(res));
- if (error) {
- mem_hotplug_done();
- goto err_kasan;
- }
-
- error = arch_add_memory(nid, res->start, resource_size(res),
- &restrictions);
- }
-
- if (!error) {
- struct zone *zone;
-
- zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
- move_pfn_range_to_zone(zone, PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), restrictions.altmap);
- }
-
- mem_hotplug_done();
- if (error)
- goto err_add_memory;
-
- /*
- * Initialization of the pages has been deferred until now in order
- * to allow us to do the work while not holding the hotplug lock.
- */
- memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
- PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), pgmap);
- percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
-
- error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
- pgmap);
- if (error)
- return ERR_PTR(error);
-
- return __va(res->start);
-
- err_add_memory:
- kasan_remove_zero_shadow(__va(res->start), resource_size(res));
- err_kasan:
- untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
- err_pfn_remap:
- pgmap_array_delete(res);
- err_array:
- dev_pagemap_kill(pgmap);
- dev_pagemap_cleanup(pgmap);
- return ERR_PTR(error);
-}
-EXPORT_SYMBOL_GPL(devm_memremap_pages);
-
-void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
-{
- devm_release_action(dev, devm_memremap_pages_release, pgmap);
-}
-EXPORT_SYMBOL_GPL(devm_memunmap_pages);
-
-unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
-{
- /* number of pfns from base where pfn_to_page() is valid */
- if (altmap)
- return altmap->reserve + altmap->free;
- return 0;
-}
-
-void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
-{
- altmap->alloc -= nr_pfns;
-}
-
-/**
- * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
- * @pfn: page frame number to lookup page_map
- * @pgmap: optional known pgmap that already has a reference
- *
- * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
- * is non-NULL but does not cover @pfn the reference to it will be released.
- */
-struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
- struct dev_pagemap *pgmap)
-{
- resource_size_t phys = PFN_PHYS(pfn);
-
- /*
- * In the cached case we're already holding a live reference.
- */
- if (pgmap) {
- if (phys >= pgmap->res.start && phys <= pgmap->res.end)
- return pgmap;
- put_dev_pagemap(pgmap);
- }
-
- /* fall back to slow path lookup */
- rcu_read_lock();
- pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
- if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
- pgmap = NULL;
- rcu_read_unlock();
-
- return pgmap;
-}
-EXPORT_SYMBOL_GPL(get_dev_pagemap);
-
-#ifdef CONFIG_DEV_PAGEMAP_OPS
-void __put_devmap_managed_page(struct page *page)
-{
- int count = page_ref_dec_return(page);
-
- /*
- * If refcount is 1 then page is freed and refcount is stable as nobody
- * holds a reference on the page.
- */
- if (count == 1) {
- /* Clear Active bit in case of parallel mark_page_accessed */
- __ClearPageActive(page);
- __ClearPageWaiters(page);
-
- mem_cgroup_uncharge(page);
-
- page->pgmap->ops->page_free(page);
- } else if (!count)
- __put_page(page);
-}
-EXPORT_SYMBOL(__put_devmap_managed_page);
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --git a/kernel/module.c b/kernel/module.c
index 5933395af9a0..9ee93421269c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -65,9 +65,9 @@
/*
* Modules' sections will be aligned on page boundaries
* to ensure complete separation of code and data, but
- * only when CONFIG_STRICT_MODULE_RWX=y
+ * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
*/
-#ifdef CONFIG_STRICT_MODULE_RWX
+#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
# define debug_align(X) ALIGN(X, PAGE_SIZE)
#else
# define debug_align(X) (X)
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 480edf328b51..7644eda17d62 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -7,7 +7,7 @@ menu "RCU Subsystem"
config TREE_RCU
bool
- default y if !PREEMPT && SMP
+ default y if !PREEMPTION && SMP
help
This option selects the RCU implementation that is
designed for very large SMP system with hundreds or
@@ -16,7 +16,7 @@ config TREE_RCU
config PREEMPT_RCU
bool
- default y if PREEMPT
+ default y if PREEMPTION
help
This option selects the RCU implementation that is
designed for very large SMP systems with hundreds or
@@ -28,7 +28,7 @@ config PREEMPT_RCU
config TINY_RCU
bool
- default y if !PREEMPT && !SMP
+ default y if !PREEMPTION && !SMP
help
This option selects the RCU implementation that is
designed for UP systems from which real-time response
@@ -70,7 +70,7 @@ config TREE_SRCU
This option selects the full-fledged version of SRCU.
config TASKS_RCU
- def_bool PREEMPT
+ def_bool PREEMPTION
select SRCU
help
This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 5ec3ea4028e2..4aa02eee8f6c 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -8,6 +8,17 @@ menu "RCU Debugging"
config PROVE_RCU
def_bool PROVE_LOCKING
+config PROVE_RCU_LIST
+ bool "RCU list lockdep debugging"
+ depends on PROVE_RCU && RCU_EXPERT
+ default n
+ help
+ Enable RCU lockdep checking for list usages. By default it is
+ turned off since there are several list RCU users that still
+ need to be converted to pass a lockdep expression. To prevent
+ false-positive splats, we keep it default disabled but once all
+ users are converted, we can remove this config option.
+
config TORTURE_TEST
tristate
default n
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 5290b01de534..8fd4f82c9b3d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -227,6 +227,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
#ifdef CONFIG_RCU_STALL_COMMON
+extern int rcu_cpu_stall_ftrace_dump;
extern int rcu_cpu_stall_suppress;
extern int rcu_cpu_stall_timeout;
int rcu_jiffies_till_stall_check(void);
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 9bd5f6023c21..495c58ce1640 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -24,6 +24,49 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
}
/*
+ * Enqueue an rcu_head structure onto the specified callback list.
+ * This function assumes that the callback is non-lazy because it
+ * is intended for use by no-CBs CPUs, which do not distinguish
+ * between lazy and non-lazy RCU callbacks.
+ */
+void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
+{
+ *rclp->tail = rhp;
+ rclp->tail = &rhp->next;
+ WRITE_ONCE(rclp->len, rclp->len + 1);
+}
+
+/*
+ * Flush the second rcu_cblist structure onto the first one, obliterating
+ * any contents of the first. If rhp is non-NULL, enqueue it as the sole
+ * element of the second rcu_cblist structure, but ensuring that the second
+ * rcu_cblist structure, if initially non-empty, always appears non-empty
+ * throughout the process. If rdp is NULL, the second rcu_cblist structure
+ * is instead initialized to empty.
+ */
+void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
+ struct rcu_cblist *srclp,
+ struct rcu_head *rhp)
+{
+ drclp->head = srclp->head;
+ if (drclp->head)
+ drclp->tail = srclp->tail;
+ else
+ drclp->tail = &drclp->head;
+ drclp->len = srclp->len;
+ drclp->len_lazy = srclp->len_lazy;
+ if (!rhp) {
+ rcu_cblist_init(srclp);
+ } else {
+ rhp->next = NULL;
+ srclp->head = rhp;
+ srclp->tail = &rhp->next;
+ WRITE_ONCE(srclp->len, 1);
+ srclp->len_lazy = 0;
+ }
+}
+
+/*
* Dequeue the oldest rcu_head structure from the specified callback
* list. This function assumes that the callback is non-lazy, but
* the caller can later invoke rcu_cblist_dequeued_lazy() if it
@@ -44,6 +87,67 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
return rhp;
}
+/* Set the length of an rcu_segcblist structure. */
+void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ atomic_long_set(&rsclp->len, v);
+#else
+ WRITE_ONCE(rsclp->len, v);
+#endif
+}
+
+/*
+ * Increase the numeric length of an rcu_segcblist structure by the
+ * specified amount, which can be negative. This can cause the ->len
+ * field to disagree with the actual number of callbacks on the structure.
+ * This increase is fully ordered with respect to the callers accesses
+ * both before and after.
+ */
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ smp_mb__before_atomic(); /* Up to the caller! */
+ atomic_long_add(v, &rsclp->len);
+ smp_mb__after_atomic(); /* Up to the caller! */
+#else
+ smp_mb(); /* Up to the caller! */
+ WRITE_ONCE(rsclp->len, rsclp->len + v);
+ smp_mb(); /* Up to the caller! */
+#endif
+}
+
+/*
+ * Increase the numeric length of an rcu_segcblist structure by one.
+ * This can cause the ->len field to disagree with the actual number of
+ * callbacks on the structure. This increase is fully ordered with respect
+ * to the callers accesses both before and after.
+ */
+void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
+{
+ rcu_segcblist_add_len(rsclp, 1);
+}
+
+/*
+ * Exchange the numeric length of the specified rcu_segcblist structure
+ * with the specified value. This can cause the ->len field to disagree
+ * with the actual number of callbacks on the structure. This exchange is
+ * fully ordered with respect to the callers accesses both before and after.
+ */
+long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return atomic_long_xchg(&rsclp->len, v);
+#else
+ long ret = rsclp->len;
+
+ smp_mb(); /* Up to the caller! */
+ WRITE_ONCE(rsclp->len, v);
+ smp_mb(); /* Up to the caller! */
+ return ret;
+#endif
+}
+
/*
* Initialize an rcu_segcblist structure.
*/
@@ -56,8 +160,9 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
rsclp->head = NULL;
for (i = 0; i < RCU_CBLIST_NSEGS; i++)
rsclp->tails[i] = &rsclp->head;
- rsclp->len = 0;
+ rcu_segcblist_set_len(rsclp, 0);
rsclp->len_lazy = 0;
+ rsclp->enabled = 1;
}
/*
@@ -69,7 +174,16 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
- rsclp->tails[RCU_NEXT_TAIL] = NULL;
+ rsclp->enabled = 0;
+}
+
+/*
+ * Mark the specified rcu_segcblist structure as offloaded. This
+ * structure must be empty.
+ */
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
+{
+ rsclp->offloaded = 1;
}
/*
@@ -118,6 +232,18 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
}
/*
+ * Return false if there are no CBs awaiting grace periods, otherwise,
+ * return true and store the nearest waited-upon grace period into *lp.
+ */
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp)
+{
+ if (!rcu_segcblist_pend_cbs(rsclp))
+ return false;
+ *lp = rsclp->gp_seq[RCU_WAIT_TAIL];
+ return true;
+}
+
+/*
* Enqueue the specified callback onto the specified rcu_segcblist
* structure, updating accounting as needed. Note that the ->len
* field may be accessed locklessly, hence the WRITE_ONCE().
@@ -129,13 +255,13 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy)
{
- WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */
+ rcu_segcblist_inc_len(rsclp);
if (lazy)
rsclp->len_lazy++;
smp_mb(); /* Ensure counts are updated before callback is enqueued. */
rhp->next = NULL;
- *rsclp->tails[RCU_NEXT_TAIL] = rhp;
- rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
+ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
+ WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);
}
/*
@@ -155,7 +281,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
if (rcu_segcblist_n_cbs(rsclp) == 0)
return false;
- WRITE_ONCE(rsclp->len, rsclp->len + 1);
+ rcu_segcblist_inc_len(rsclp);
if (lazy)
rsclp->len_lazy++;
smp_mb(); /* Ensure counts are updated before callback is entrained. */
@@ -163,9 +289,9 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
if (rsclp->tails[i] != rsclp->tails[i - 1])
break;
- *rsclp->tails[i] = rhp;
+ WRITE_ONCE(*rsclp->tails[i], rhp);
for (; i <= RCU_NEXT_TAIL; i++)
- rsclp->tails[i] = &rhp->next;
+ WRITE_ONCE(rsclp->tails[i], &rhp->next);
return true;
}
@@ -182,9 +308,8 @@ void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
rclp->len_lazy += rsclp->len_lazy;
- rclp->len += rsclp->len;
rsclp->len_lazy = 0;
- WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */
+ rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
}
/*
@@ -200,12 +325,12 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
if (!rcu_segcblist_ready_cbs(rsclp))
return; /* Nothing to do. */
*rclp->tail = rsclp->head;
- rsclp->head = *rsclp->tails[RCU_DONE_TAIL];
- *rsclp->tails[RCU_DONE_TAIL] = NULL;
+ WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]);
+ WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
rclp->tail = rsclp->tails[RCU_DONE_TAIL];
for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
- rsclp->tails[i] = &rsclp->head;
+ WRITE_ONCE(rsclp->tails[i], &rsclp->head);
}
/*
@@ -224,9 +349,9 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
return; /* Nothing to do. */
*rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
- *rsclp->tails[RCU_DONE_TAIL] = NULL;
+ WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
- rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL];
+ WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
}
/*
@@ -237,8 +362,7 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
rsclp->len_lazy += rclp->len_lazy;
- /* ->len sampled locklessly. */
- WRITE_ONCE(rsclp->len, rsclp->len + rclp->len);
+ rcu_segcblist_add_len(rsclp, rclp->len);
rclp->len_lazy = 0;
rclp->len = 0;
}
@@ -255,10 +379,10 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
if (!rclp->head)
return; /* No callbacks to move. */
*rclp->tail = rsclp->head;
- rsclp->head = rclp->head;
+ WRITE_ONCE(rsclp->head, rclp->head);
for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
if (&rsclp->head == rsclp->tails[i])
- rsclp->tails[i] = rclp->tail;
+ WRITE_ONCE(rsclp->tails[i], rclp->tail);
else
break;
rclp->head = NULL;
@@ -274,8 +398,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
{
if (!rclp->head)
return; /* Nothing to do. */
- *rsclp->tails[RCU_NEXT_TAIL] = rclp->head;
- rsclp->tails[RCU_NEXT_TAIL] = rclp->tail;
+ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
+ WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
rclp->head = NULL;
rclp->tail = &rclp->head;
}
@@ -299,7 +423,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
break;
- rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i];
+ WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
}
/* If no callbacks moved, nothing more need be done. */
@@ -308,7 +432,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
/* Clean up tail pointers that might have been misordered above. */
for (j = RCU_WAIT_TAIL; j < i; j++)
- rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL];
+ WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
/*
* Callbacks moved, so clean up the misordered ->tails[] pointers
@@ -319,7 +443,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
break; /* No more callbacks. */
- rsclp->tails[j] = rsclp->tails[i];
+ WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
rsclp->gp_seq[j] = rsclp->gp_seq[i];
}
}
@@ -384,7 +508,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
* structure other than in the RCU_NEXT_TAIL segment.
*/
for (; i < RCU_NEXT_TAIL; i++) {
- rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL];
+ WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]);
rsclp->gp_seq[i] = seq;
}
return true;
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 71b64648464e..815c2fdd3fcc 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -9,6 +9,12 @@
#include <linux/rcu_segcblist.h>
+/* Return number of callbacks in the specified callback list. */
+static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
+{
+ return READ_ONCE(rclp->len);
+}
+
/*
* Account for the fact that a previously dequeued callback turned out
* to be marked as lazy.
@@ -19,6 +25,10 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
}
void rcu_cblist_init(struct rcu_cblist *rclp);
+void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
+void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
+ struct rcu_cblist *srclp,
+ struct rcu_head *rhp);
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
/*
@@ -36,13 +46,17 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
*/
static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
{
- return !rsclp->head;
+ return !READ_ONCE(rsclp->head);
}
/* Return number of callbacks in segmented callback list. */
static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return atomic_long_read(&rsclp->len);
+#else
return READ_ONCE(rsclp->len);
+#endif
}
/* Return number of lazy callbacks in segmented callback list. */
@@ -54,16 +68,22 @@ static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
/* Return number of lazy callbacks in segmented callback list. */
static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
{
- return rsclp->len - rsclp->len_lazy;
+ return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy;
}
/*
* Is the specified rcu_segcblist enabled, for example, not corresponding
- * to an offline or callback-offloaded CPU?
+ * to an offline CPU?
*/
static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
{
- return !!rsclp->tails[RCU_NEXT_TAIL];
+ return rsclp->enabled;
+}
+
+/* Is the specified rcu_segcblist offloaded? */
+static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
+{
+ return rsclp->offloaded;
}
/*
@@ -73,36 +93,18 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
*/
static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
{
- return !*rsclp->tails[seg];
-}
-
-/*
- * Interim function to return rcu_segcblist head pointer. Longer term, the
- * rcu_segcblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp)
-{
- return rsclp->head;
-}
-
-/*
- * Interim function to return rcu_segcblist head pointer. Longer term, the
- * rcu_segcblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
-{
- WARN_ON_ONCE(rcu_segcblist_empty(rsclp));
- return rsclp->tails[RCU_NEXT_TAIL];
+ return !READ_ONCE(*READ_ONCE(rsclp->tails[seg]));
}
+void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp);
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 7a6890b23c5f..5a879d073c1c 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable
static char *perf_type = "rcu";
module_param(perf_type, charp, 0444);
-MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)");
+MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)");
static int nrealreaders;
static int nrealwriters;
@@ -375,6 +375,14 @@ rcu_perf_writer(void *arg)
if (holdoff)
schedule_timeout_uninterruptible(holdoff * HZ);
+ /*
+ * Wait until rcu_end_inkernel_boot() is called for normal GP tests
+ * so that RCU is not always expedited for normal GP tests.
+ * The system_state test is approximate, but works well in practice.
+ */
+ while (!gp_exp && system_state != SYSTEM_RUNNING)
+ schedule_timeout_uninterruptible(1);
+
t = ktime_get_mono_fast_ns();
if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
t_rcu_perf_writer_started = t;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index fce4e7e6f502..3c9feca1eab1 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -161,6 +161,7 @@ static atomic_long_t n_rcu_torture_timers;
static long n_barrier_attempts;
static long n_barrier_successes; /* did rcu_barrier test succeed? */
static struct list_head rcu_torture_removed;
+static unsigned long shutdown_jiffies;
static int rcu_torture_writer_state;
#define RTWS_FIXED_DELAY 0
@@ -228,6 +229,15 @@ static u64 notrace rcu_trace_clock_local(void)
}
#endif /* #else #ifdef CONFIG_RCU_TRACE */
+/*
+ * Stop aggressive CPU-hog tests a bit before the end of the test in order
+ * to avoid interfering with test shutdown.
+ */
+static bool shutdown_time_arrived(void)
+{
+ return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ);
+}
+
static unsigned long boost_starttime; /* jiffies of next boost test start. */
static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
@@ -1713,12 +1723,14 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
}
// Give the scheduler a chance, even on nohz_full CPUs.
-static void rcu_torture_fwd_prog_cond_resched(void)
+static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
{
if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
- if (need_resched())
+ // Real call_rcu() floods hit userspace, so emulate that.
+ if (need_resched() || (iter & 0xfff))
schedule();
} else {
+ // No userspace emulation: CB invocation throttles call_rcu()
cond_resched();
}
}
@@ -1746,7 +1758,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
spin_unlock_irqrestore(&rcu_fwd_lock, flags);
kfree(rfcp);
freed++;
- rcu_torture_fwd_prog_cond_resched();
+ rcu_torture_fwd_prog_cond_resched(freed);
}
return freed;
}
@@ -1785,15 +1797,17 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
WRITE_ONCE(rcu_fwd_startat, jiffies);
stopat = rcu_fwd_startat + dur;
while (time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
idx = cur_ops->readlock();
udelay(10);
cur_ops->readunlock(idx);
if (!fwd_progress_need_resched || need_resched())
- rcu_torture_fwd_prog_cond_resched();
+ rcu_torture_fwd_prog_cond_resched(1);
}
(*tested_tries)++;
if (!time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
(*tested)++;
cver = READ_ONCE(rcu_torture_current_version) - cver;
@@ -1852,6 +1866,7 @@ static void rcu_torture_fwd_prog_cr(void)
gps = cur_ops->get_gp_seq();
rcu_launder_gp_seq_start = gps;
while (time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
rfcp = READ_ONCE(rcu_fwd_cb_head);
rfcpn = NULL;
@@ -1875,7 +1890,7 @@ static void rcu_torture_fwd_prog_cr(void)
rfcp->rfc_gps = 0;
}
cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
- rcu_torture_fwd_prog_cond_resched();
+ rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
}
stoppedat = jiffies;
n_launders_cb_snap = READ_ONCE(n_launders_cb);
@@ -1884,7 +1899,8 @@ static void rcu_torture_fwd_prog_cr(void)
cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
(void)rcu_torture_fwd_prog_cbfree();
- if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) {
+ if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) &&
+ !shutdown_time_arrived()) {
WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
__func__,
@@ -2160,6 +2176,7 @@ rcu_torture_cleanup(void)
return;
}
+ show_rcu_gp_kthreads();
rcu_torture_barrier_cleanup();
torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
torture_stop_kthread(rcu_torture_stall, stall_task);
@@ -2465,6 +2482,7 @@ rcu_torture_init(void)
goto unwind;
rcutor_hp = firsterr;
}
+ shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
if (firsterr)
goto unwind;
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index cf0e886314f2..5dffade2d7cd 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1279,8 +1279,9 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
c0 = l0 - u0;
c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld %1p)",
- cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist));
+ pr_cont(" %d(%ld,%ld %c)",
+ cpu, c0, c1,
+ "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
s0 += c0;
s1 += c1;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a14e5fbbea46..81105141b6a8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -56,6 +56,7 @@
#include <linux/smpboot.h>
#include <linux/jiffies.h>
#include <linux/sched/isolation.h>
+#include <linux/sched/clock.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -210,9 +211,9 @@ static long rcu_get_n_cbs_cpu(int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */
+ if (rcu_segcblist_is_enabled(&rdp->cblist))
return rcu_segcblist_n_cbs(&rdp->cblist);
- return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */
+ return 0;
}
void rcu_softirq_qs(void)
@@ -416,6 +417,12 @@ module_param(qlowmark, long, 0444);
static ulong jiffies_till_first_fqs = ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
static bool rcu_kick_kthreads;
+static int rcu_divisor = 7;
+module_param(rcu_divisor, int, 0644);
+
+/* Force an exit from rcu_do_batch() after 3 milliseconds. */
+static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
+module_param(rcu_resched_ns, long, 0644);
/*
* How long the grace period must be before we start recruiting
@@ -1251,6 +1258,7 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
unsigned long gp_seq_req;
bool ret = false;
+ rcu_lockdep_assert_cblist_protected(rdp);
raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
@@ -1292,7 +1300,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
unsigned long c;
bool needwake;
- lockdep_assert_irqs_disabled();
+ rcu_lockdep_assert_cblist_protected(rdp);
c = rcu_seq_snap(&rcu_state.gp_seq);
if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
/* Old request still live, so mark recent callbacks. */
@@ -1318,6 +1326,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
*/
static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
{
+ rcu_lockdep_assert_cblist_protected(rdp);
raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
@@ -1335,6 +1344,21 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
}
/*
+ * Move and classify callbacks, but only if doing so won't require
+ * that the RCU grace-period kthread be awakened.
+ */
+static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
+ !raw_spin_trylock_rcu_node(rnp))
+ return;
+ WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+/*
* Update CPU-local rcu_data state to record the beginnings and ends of
* grace periods. The caller must hold the ->lock of the leaf rcu_node
* structure corresponding to the current CPU, and must have irqs disabled.
@@ -1342,8 +1366,10 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
- bool ret;
+ bool ret = false;
bool need_gp;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
raw_lockdep_assert_held_rcu_node(rnp);
@@ -1353,10 +1379,12 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Handle the ends of any preceding grace periods first. */
if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
unlikely(READ_ONCE(rdp->gpwrap))) {
- ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */
+ if (!offloaded)
+ ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
} else {
- ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */
+ if (!offloaded)
+ ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
}
/* Now handle the beginnings of any new-to-this-CPU grace periods. */
@@ -1657,6 +1685,7 @@ static void rcu_gp_cleanup(void)
unsigned long gp_duration;
bool needgp = false;
unsigned long new_gp_seq;
+ bool offloaded;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root();
struct swait_queue_head *sq;
@@ -1722,7 +1751,9 @@ static void rcu_gp_cleanup(void)
needgp = true;
}
/* Advance CBs to reduce false positives below. */
- if (!rcu_accelerate_cbs(rnp, rdp) && needgp) {
+ offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
+ if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
rcu_state.gp_req_activity = jiffies;
trace_rcu_grace_period(rcu_state.name,
@@ -1881,7 +1912,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
struct rcu_node *rnp_p;
raw_lockdep_assert_held_rcu_node(rnp);
- if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) ||
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
rnp->qsmask != 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1916,7 +1947,9 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needwake;
+ bool needwake = false;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_node *rnp;
rnp = rdp->mynode;
@@ -1943,7 +1976,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
- needwake = rcu_accelerate_cbs(rnp, rdp);
+ if (!offloaded)
+ needwake = rcu_accelerate_cbs(rnp, rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
@@ -2077,9 +2111,12 @@ int rcutree_dead_cpu(unsigned int cpu)
static void rcu_do_batch(struct rcu_data *rdp)
{
unsigned long flags;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
long bl, count;
+ long pending, tlimit = 0;
/* If no callbacks are ready, just return. */
if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
@@ -2099,13 +2136,19 @@ static void rcu_do_batch(struct rcu_data *rdp)
* callback counts, as rcu_barrier() needs to be conservative.
*/
local_irq_save(flags);
+ rcu_nocb_lock(rdp);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
- bl = rdp->blimit;
+ pending = rcu_segcblist_n_cbs(&rdp->cblist);
+ bl = max(rdp->blimit, pending >> rcu_divisor);
+ if (unlikely(bl > 100))
+ tlimit = local_clock() + rcu_resched_ns;
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
- local_irq_restore(flags);
+ if (offloaded)
+ rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
rhp = rcu_cblist_dequeue(&rcl);
@@ -2117,13 +2160,29 @@ static void rcu_do_batch(struct rcu_data *rdp)
* Stop only if limit reached and CPU has something to do.
* Note: The rcl structure counts down from zero.
*/
- if (-rcl.len >= bl &&
+ if (-rcl.len >= bl && !offloaded &&
(need_resched() ||
(!is_idle_task(current) && !rcu_is_callbacks_kthread())))
break;
+ if (unlikely(tlimit)) {
+ /* only call local_clock() every 32 callbacks */
+ if (likely((-rcl.len & 31) || local_clock() < tlimit))
+ continue;
+ /* Exceeded the time limit, so leave. */
+ break;
+ }
+ if (offloaded) {
+ WARN_ON_ONCE(in_serving_softirq());
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ cond_resched_tasks_rcu_qs();
+ lockdep_assert_irqs_enabled();
+ local_bh_disable();
+ }
}
local_irq_save(flags);
+ rcu_nocb_lock(rdp);
count = -rcl.len;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
@@ -2149,12 +2208,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
* The following usually indicates a double call_rcu(). To track
* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
*/
- WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
+ WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ count != 0 && rcu_segcblist_empty(&rdp->cblist));
- local_irq_restore(flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
/* Re-invoke RCU core processing if there are callbacks remaining. */
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
invoke_rcu_core();
}
@@ -2205,7 +2266,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
- if (!IS_ENABLED(CONFIG_PREEMPT) ||
+ if (!IS_ENABLED(CONFIG_PREEMPTION) ||
rcu_preempt_blocked_readers_cgp(rnp)) {
/*
* No point in scanning bits because they
@@ -2280,6 +2341,8 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
@@ -2299,7 +2362,7 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist)) {
+ rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
local_irq_save(flags);
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
@@ -2309,7 +2372,7 @@ static __latent_entropy void rcu_core(void)
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
likely(READ_ONCE(rcu_scheduler_fully_active)))
rcu_do_batch(rdp);
@@ -2489,10 +2552,11 @@ static void rcu_leak_callback(struct rcu_head *rhp)
* is expected to specify a CPU.
*/
static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
+__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
{
unsigned long flags;
struct rcu_data *rdp;
+ bool was_alldone;
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
@@ -2514,28 +2578,18 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
rdp = this_cpu_ptr(&rcu_data);
/* Add the callback to our list. */
- if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
- int offline;
-
- if (cpu != -1)
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (likely(rdp->mynode)) {
- /* Post-boot, so this should be for a no-CBs CPU. */
- offline = !__call_rcu_nocb(rdp, head, lazy, flags);
- WARN_ON_ONCE(offline);
- /* Offline CPU, _call_rcu() illegal, leak callback. */
- local_irq_restore(flags);
- return;
- }
- /*
- * Very early boot, before rcu_init(). Initialize if needed
- * and then drop through to queue the callback.
- */
- WARN_ON_ONCE(cpu != -1);
+ if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
+ // This can trigger due to call_rcu() from offline CPU:
+ WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
WARN_ON_ONCE(!rcu_is_watching());
+ // Very early boot, before rcu_init(). Initialize if needed
+ // and then drop through to queue the callback.
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
}
+ if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+ return; // Enqueued onto ->nocb_bypass, so just leave.
+ /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
@@ -2548,8 +2602,13 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
rcu_segcblist_n_cbs(&rdp->cblist));
/* Go handle any RCU core processing required. */
- __call_rcu_core(rdp, head, flags);
- local_irq_restore(flags);
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
+ __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
+ } else {
+ __call_rcu_core(rdp, head, flags);
+ local_irq_restore(flags);
+ }
}
/**
@@ -2589,7 +2648,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func, -1, 0);
+ __call_rcu(head, func, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -2602,7 +2661,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
*/
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func, -1, 1);
+ __call_rcu(head, func, 1);
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
@@ -2622,7 +2681,7 @@ static int rcu_blocking_is_gp(void)
{
int ret;
- if (IS_ENABLED(CONFIG_PREEMPT))
+ if (IS_ENABLED(CONFIG_PREEMPTION))
return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
might_sleep(); /* Check for RCU read-side critical section. */
preempt_disable();
@@ -2735,6 +2794,10 @@ static int rcu_pending(void)
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rdp);
+ /* Does this CPU need a deferred NOCB wakeup? */
+ if (rcu_nocb_need_deferred_wakeup(rdp))
+ return 1;
+
/* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
if (rcu_nohz_full_cpu())
return 0;
@@ -2750,6 +2813,8 @@ static int rcu_pending(void)
/* Has RCU gone idle with this CPU needing another grace period? */
if (!rcu_gp_in_progress() &&
rcu_segcblist_is_enabled(&rdp->cblist) &&
+ (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
+ !rcu_segcblist_is_offloaded(&rdp->cblist)) &&
!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
@@ -2758,10 +2823,6 @@ static int rcu_pending(void)
unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
return 1;
- /* Does this CPU need a deferred NOCB wakeup? */
- if (rcu_nocb_need_deferred_wakeup(rdp))
- return 1;
-
/* nothing to do */
return 0;
}
@@ -2801,6 +2862,8 @@ static void rcu_barrier_func(void *unused)
rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
@@ -2808,6 +2871,7 @@ static void rcu_barrier_func(void *unused)
rcu_barrier_trace(TPS("IRQNQ"), -1,
rcu_state.barrier_sequence);
}
+ rcu_nocb_unlock(rdp);
}
/**
@@ -2858,22 +2922,11 @@ void rcu_barrier(void)
* corresponding CPU's preceding callbacks have been invoked.
*/
for_each_possible_cpu(cpu) {
- if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
- continue;
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_is_nocb_cpu(cpu)) {
- if (!rcu_nocb_cpu_needs_barrier(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCB"), cpu,
- rcu_state.barrier_sequence);
- } else {
- rcu_barrier_trace(TPS("OnlineNoCB"), cpu,
- rcu_state.barrier_sequence);
- smp_mb__before_atomic();
- atomic_inc(&rcu_state.barrier_cpu_count);
- __call_rcu(&rdp->barrier_head,
- rcu_barrier_callback, cpu, 0);
- }
- } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
+ if (!cpu_online(cpu) &&
+ !rcu_segcblist_is_offloaded(&rdp->cblist))
+ continue;
+ if (rcu_segcblist_n_cbs(&rdp->cblist)) {
rcu_barrier_trace(TPS("OnlineQ"), cpu,
rcu_state.barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
@@ -2958,7 +3011,8 @@ rcu_boot_init_percpu_data(int cpu)
* Initializes a CPU's per-CPU RCU data. Note that only one online or
* offline event can be happening at a given time. Note also that we can
* accept some slop in the rsp->gp_seq access due to the fact that this
- * CPU cannot possibly have any RCU callbacks in flight yet.
+ * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
+ * And any offloaded callbacks are being numbered elsewhere.
*/
int rcutree_prepare_cpu(unsigned int cpu)
{
@@ -2972,7 +3026,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->n_force_qs_snap = rcu_state.n_force_qs;
rdp->blimit = blimit;
if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
- !init_nocb_callback_list(rdp))
+ !rcu_segcblist_is_offloaded(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
rcu_dynticks_eqs_online();
@@ -3151,29 +3205,38 @@ void rcutree_migrate_callbacks(int cpu)
{
unsigned long flags;
struct rcu_data *my_rdp;
+ struct rcu_node *my_rnp;
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp_root = rcu_get_root();
bool needwake;
- if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
+ if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
+ rcu_segcblist_empty(&rdp->cblist))
return; /* No callbacks to migrate. */
local_irq_save(flags);
my_rdp = this_cpu_ptr(&rcu_data);
- if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
- local_irq_restore(flags);
- return;
- }
- raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ my_rnp = my_rdp->mynode;
+ rcu_nocb_lock(my_rdp); /* irqs already disabled. */
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+ raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
/* Leverage recent GPs and set GP for new callbacks. */
- needwake = rcu_advance_cbs(rnp_root, rdp) ||
- rcu_advance_cbs(rnp_root, my_rdp);
+ needwake = rcu_advance_cbs(my_rnp, rdp) ||
+ rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
+ rcu_segcblist_disable(&rdp->cblist);
WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
!rcu_segcblist_n_cbs(&my_rdp->cblist));
- raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
+ raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
+ __call_rcu_nocb_wake(my_rdp, true, flags);
+ } else {
+ rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
+ raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
+ }
if (needwake)
rcu_gp_kthread_wake();
+ lockdep_assert_irqs_enabled();
WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
!rcu_segcblist_empty(&rdp->cblist),
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
@@ -3234,13 +3297,13 @@ static int __init rcu_spawn_gp_kthread(void)
t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
return 0;
- rnp = rcu_get_root();
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- rcu_state.gp_kthread = t;
if (kthread_prio) {
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
+ rnp = rcu_get_root();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rcu_state.gp_kthread = t;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);
rcu_spawn_nocb_kthreads();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 7acaf3a62d39..c612f306fe89 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -194,29 +194,38 @@ struct rcu_data {
/* 5) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
- struct rcu_head *nocb_head; /* CBs waiting for kthread. */
- struct rcu_head **nocb_tail;
- atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
- atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
- struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
- struct rcu_head **nocb_follower_tail;
- struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
- struct task_struct *nocb_kthread;
+ struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
+ struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
+ atomic_t nocb_lock_contended; /* Contention experienced. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
-
- /* The following fields are used by the leader, hence own cacheline. */
- struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
- /* CBs waiting for GP. */
- struct rcu_head **nocb_gp_tail;
- bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
- struct rcu_data *nocb_next_follower;
- /* Next follower in wakeup chain. */
-
- /* The following fields are used by the follower, hence new cachline. */
- struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
- /* Leader CPU takes GP-end wakeups. */
+ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
+
+ /* The following fields are used by call_rcu, hence own cacheline. */
+ raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
+ struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */
+ unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */
+ unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */
+ int nocb_nobypass_count; /* # ->cblist enqueues at ^^^ time. */
+
+ /* The following fields are used by GP kthread, hence own cacheline. */
+ raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
+ struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */
+ u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */
+ u8 nocb_gp_bypass; /* Found a bypass on last scan? */
+ u8 nocb_gp_gp; /* GP to wait for on last scan? */
+ unsigned long nocb_gp_seq; /* If so, ->gp_seq to wait for. */
+ unsigned long nocb_gp_loops; /* # passes through wait code. */
+ struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
+ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
+ struct task_struct *nocb_cb_kthread;
+ struct rcu_data *nocb_next_cb_rdp;
+ /* Next rcu_data in wakeup chain. */
+
+ /* The following fields are used by CB kthread, hence new cacheline. */
+ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
+ /* GP rdp takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 6) RCU priority boosting. */
@@ -419,25 +428,39 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void rcu_preempt_deferred_qs(struct task_struct *t);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
-static bool rcu_nocb_cpu_needs_barrier(int cpu);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags);
-static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
- struct rcu_data *rdp,
- unsigned long flags);
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j);
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags);
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
static void __init rcu_spawn_nocb_kthreads(void);
+static void show_rcu_nocb_state(struct rcu_data *rdp);
+static void rcu_nocb_lock(struct rcu_data *rdp);
+static void rcu_nocb_unlock(struct rcu_data *rdp);
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags);
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
-static bool init_nocb_callback_list(struct rcu_data *rdp);
-static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp);
+#define rcu_nocb_lock_irqsave(rdp, flags) \
+do { \
+ if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
+ local_irq_save(flags); \
+ else \
+ raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \
+} while (0)
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+
static void rcu_bind_gp_kthread(void);
static bool rcu_nohz_full_cpu(void);
static void rcu_dynticks_task_enter(void);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index af7e7b9c86af..d632cd019597 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -781,7 +781,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
* other hand, if the CPU is not in an RCU read-side critical section,
* the IPI handler reports the quiescent state immediately.
*
- * Although this is a greate improvement over previous expedited
+ * Although this is a great improvement over previous expedited
* implementations, it is still unfriendly to real-time workloads, so is
* thus not recommended for any sort of common-case code. In fact, if
* you are using synchronize_rcu_expedited() in a loop, please restructure
@@ -792,6 +792,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
*/
void synchronize_rcu_expedited(void)
{
+ bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
struct rcu_exp_work rew;
struct rcu_node *rnp;
unsigned long s;
@@ -817,7 +818,7 @@ void synchronize_rcu_expedited(void)
return; /* Someone else did our work for us. */
/* Ensure that load happens before action based on it. */
- if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+ if (unlikely(boottime)) {
/* Direct call during scheduler init and early_initcalls(). */
rcu_exp_sel_wait_wake(s);
} else {
@@ -835,5 +836,8 @@ void synchronize_rcu_expedited(void)
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
+
+ if (likely(!boottime))
+ destroy_work_on_stack(&rew.rew_work);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index acb225023ed1..2defc7fe74c3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -288,7 +288,6 @@ void rcu_note_context_switch(bool preempt)
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp;
- barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
@@ -314,15 +313,6 @@ void rcu_note_context_switch(bool preempt)
? rnp->gp_seq
: rcu_seq_snap(&rnp->gp_seq));
rcu_preempt_ctxt_queue(rnp, rdp);
- } else if (t->rcu_read_lock_nesting < 0 &&
- t->rcu_read_unlock_special.s) {
-
- /*
- * Complete exit from RCU read-side critical section on
- * behalf of preempted instance of __rcu_read_unlock().
- */
- rcu_read_unlock_special(t);
- rcu_preempt_deferred_qs(t);
} else {
rcu_preempt_deferred_qs(t);
}
@@ -340,7 +330,6 @@ void rcu_note_context_switch(bool preempt)
if (rdp->exp_deferred_qs)
rcu_report_exp_rdp(rdp);
trace_rcu_utilization(TPS("End context switch"));
- barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -626,22 +615,18 @@ static void rcu_read_unlock_special(struct task_struct *t)
(rdp->grpmask & rnp->expmask) ||
tick_nohz_full_cpu(rdp->cpu);
// Need to defer quiescent state until everything is enabled.
- if ((exp || in_irq()) && irqs_were_disabled && use_softirq &&
- (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) {
+ if (irqs_were_disabled && use_softirq &&
+ (in_interrupt() ||
+ (exp && !t->rcu_read_unlock_special.b.deferred_qs))) {
// Using softirq, safe to awaken, and we get
// no help from enabling irqs, unlike bh/preempt.
raise_softirq_irqoff(RCU_SOFTIRQ);
- } else if (exp && irqs_were_disabled && !use_softirq &&
- !t->rcu_read_unlock_special.b.deferred_qs) {
- // Safe to awaken and we get no help from enabling
- // irqs, unlike bh/preempt.
- invoke_rcu_core();
} else {
// Enabling BH or preempt does reschedule, so...
// Also if no expediting or NO_HZ_FULL, slow is OK.
set_tsk_need_resched(current);
set_preempt_need_resched();
- if (IS_ENABLED(CONFIG_IRQ_WORK) &&
+ if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
!rdp->defer_qs_iw_pending && exp) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
@@ -828,11 +813,6 @@ static void rcu_qs(void)
* dyntick-idle quiescent state visible to other CPUs, which will in
* some cases serve for expedited as well as normal grace periods.
* Either way, register a lightweight quiescent state.
- *
- * The barrier() calls are redundant in the common case when this is
- * called externally, but just in case this is called from within this
- * file.
- *
*/
void rcu_all_qs(void)
{
@@ -847,14 +827,12 @@ void rcu_all_qs(void)
return;
}
this_cpu_write(rcu_data.rcu_urgent_qs, false);
- barrier(); /* Avoid RCU read-side critical sections leaking down. */
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
local_irq_save(flags);
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
rcu_qs();
- barrier(); /* Avoid RCU read-side critical sections leaking up. */
preempt_enable();
}
EXPORT_SYMBOL_GPL(rcu_all_qs);
@@ -864,7 +842,6 @@ EXPORT_SYMBOL_GPL(rcu_all_qs);
*/
void rcu_note_context_switch(bool preempt)
{
- barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
rcu_qs();
/* Load rcu_urgent_qs before other flags. */
@@ -877,7 +854,6 @@ void rcu_note_context_switch(bool preempt)
rcu_tasks_qs(current);
out:
trace_rcu_utilization(TPS("End context switch"));
- barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1134,7 +1110,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
* already exist. We only create this kthread for preemptible RCU.
* Returns zero if all is well, a negated errno otherwise.
*/
-static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
int rnp_index = rnp - rcu_get_root();
unsigned long flags;
@@ -1142,25 +1118,27 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
struct task_struct *t;
if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
- return 0;
+ return;
if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
- return 0;
+ return;
rcu_state.boost = 1;
+
if (rnp->boost_kthread_task != NULL)
- return 0;
+ return;
+
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
- if (IS_ERR(t))
- return PTR_ERR(t);
+ if (WARN_ON_ONCE(IS_ERR(t)))
+ return;
+
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
- return 0;
}
/*
@@ -1201,7 +1179,7 @@ static void __init rcu_spawn_boost_kthreads(void)
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp)
- (void)rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_one_boost_kthread(rnp);
}
static void rcu_prepare_kthreads(int cpu)
@@ -1211,7 +1189,7 @@ static void rcu_prepare_kthreads(int cpu)
/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
if (rcu_scheduler_fully_active)
- (void)rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_one_boost_kthread(rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1248,10 +1226,10 @@ static void rcu_prepare_kthreads(int cpu)
#if !defined(CONFIG_RCU_FAST_NO_HZ)
/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so. This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
+ * Check to see if any future non-offloaded RCU-related work will need
+ * to be done by the current CPU, even if none need be done immediately,
+ * returning 1 if so. This function is part of the RCU implementation;
+ * it is -not- an exported member of the RCU API.
*
* Because we not have RCU_FAST_NO_HZ, just check whether or not this
* CPU has RCU callbacks queued.
@@ -1259,7 +1237,8 @@ static void rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
*nextevt = KTIME_MAX;
- return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist);
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
+ !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist);
}
/*
@@ -1360,8 +1339,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
lockdep_assert_irqs_disabled();
- /* If no callbacks, RCU doesn't need the CPU. */
- if (rcu_segcblist_empty(&rdp->cblist)) {
+ /* If no non-offloaded callbacks, RCU doesn't need the CPU. */
+ if (rcu_segcblist_empty(&rdp->cblist) ||
+ rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) {
*nextevt = KTIME_MAX;
return 0;
}
@@ -1404,7 +1384,7 @@ static void rcu_prepare_for_idle(void)
int tne;
lockdep_assert_irqs_disabled();
- if (rcu_is_nocb_cpu(smp_processor_id()))
+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
return;
/* Handle nohz enablement switches conservatively. */
@@ -1453,8 +1433,10 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
lockdep_assert_irqs_disabled();
- if (rcu_is_nocb_cpu(smp_processor_id()))
+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
@@ -1469,10 +1451,10 @@ static void rcu_cleanup_after_idle(void)
* specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
* created that pull the callbacks from the corresponding CPU, wait for
* a grace period to elapse, and invoke the callbacks. These kthreads
- * are organized into leaders, which manage incoming callbacks, wait for
- * grace periods, and awaken followers, and the followers, which only
- * invoke callbacks. Each leader is its own follower. The no-CBs CPUs
- * do a wake_up() on their kthread when they insert a callback into any
+ * are organized into GP kthreads, which manage incoming callbacks, wait for
+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
+ * do a wake_up() on their GP kthread when they insert a callback into any
* empty list, unless the rcu_nocb_poll boot parameter has been specified,
* in which case each kthread actively polls its CPU. (Which isn't so great
* for energy efficiency, but which does reduce RCU's overhead on that CPU.)
@@ -1515,6 +1497,116 @@ static int __init parse_rcu_nocb_poll(char *arg)
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
/*
+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
+ * After all, the main point of bypassing is to avoid lock contention
+ * on ->nocb_lock, which only can happen at high call_rcu() rates.
+ */
+int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
+ * lock isn't immediately available, increment ->nocb_lock_contended to
+ * flag the contention.
+ */
+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
+ return;
+ atomic_inc(&rdp->nocb_lock_contended);
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ smp_mb__after_atomic(); /* atomic_inc() before lock. */
+ raw_spin_lock(&rdp->nocb_bypass_lock);
+ smp_mb__before_atomic(); /* atomic_dec() after lock. */
+ atomic_dec(&rdp->nocb_lock_contended);
+}
+
+/*
+ * Spinwait until the specified rcu_data structure's ->nocb_lock is
+ * not contended. Please note that this is extremely special-purpose,
+ * relying on the fact that at most two kthreads and one CPU contend for
+ * this lock, and also that the two kthreads are guaranteed to have frequent
+ * grace-period-duration time intervals between successive acquisitions
+ * of the lock. This allows us to use an extremely simple throttling
+ * mechanism, and further to apply it only to the CPU doing floods of
+ * call_rcu() invocations. Don't try this at home!
+ */
+static void rcu_nocb_wait_contended(struct rcu_data *rdp)
+{
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
+ cpu_relax();
+}
+
+/*
+ * Conditionally acquire the specified rcu_data structure's
+ * ->nocb_bypass_lock.
+ */
+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
+ */
+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
+ return;
+ raw_spin_lock(&rdp->nocb_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+ if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_lock);
+ }
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock and restore
+ * interrupts, but only if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (rcu_segcblist_is_offloaded(&rdp->cblist) &&
+ cpu_online(rdp->cpu))
+ lockdep_assert_held(&rdp->nocb_lock);
+}
+
+/*
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
@@ -1543,440 +1635,514 @@ bool rcu_is_nocb_cpu(int cpu)
}
/*
- * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock
+ * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
* and this function releases it.
*/
-static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
- unsigned long flags)
+static void wake_nocb_gp(struct rcu_data *rdp, bool force,
+ unsigned long flags)
__releases(rdp->nocb_lock)
{
- struct rcu_data *rdp_leader = rdp->nocb_leader;
+ bool needwake = false;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
lockdep_assert_held(&rdp->nocb_lock);
- if (!READ_ONCE(rdp_leader->nocb_kthread)) {
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("AlreadyAwake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
return;
}
- if (rdp_leader->nocb_leader_sleep || force) {
- /* Prior smp_mb__after_atomic() orders against prior enqueue. */
- WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
- del_timer(&rdp->nocb_timer);
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
- swake_up_one(&rdp_leader->nocb_wq);
- } else {
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ del_timer(&rdp->nocb_timer);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
+ needwake = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
}
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ if (needwake)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
}
/*
- * Kick the leader kthread for this NOCB group, but caller has not
- * acquired locks.
+ * Arrange to wake the GP kthread for this NOCB group at some future
+ * time when it is safe to do so.
*/
-static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
{
- unsigned long flags;
+ if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
+ mod_timer(&rdp->nocb_timer, jiffies + 1);
+ if (rdp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ struct rcu_cblist rcl;
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- __wake_nocb_leader(rdp, force, flags);
+ WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist));
+ rcu_lockdep_assert_cblist_protected(rdp);
+ lockdep_assert_held(&rdp->nocb_bypass_lock);
+ if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+ return false;
+ }
+ /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
+ if (rhp)
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ rcu_nocb_bypass_unlock(rdp);
+ return true;
}
/*
- * Arrange to wake the leader kthread for this NOCB group at some
- * future time when it is safe to do so.
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
*/
-static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
- const char *reason)
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
{
- unsigned long flags;
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
+ return true;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ return rcu_nocb_do_flush_bypass(rdp, rhp, j);
+}
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
- mod_timer(&rdp->nocb_timer, jiffies + 1);
- WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+/*
+ * If the ->nocb_bypass_lock is immediately available, flush the
+ * ->nocb_bypass queue into ->cblist.
+ */
+static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist) ||
+ !rcu_nocb_bypass_trylock(rdp))
+ return;
+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
}
-/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */
-static bool rcu_nocb_cpu_needs_barrier(int cpu)
+/*
+ * See whether it is appropriate to use the ->nocb_bypass list in order
+ * to control contention on ->nocb_lock. A limited number of direct
+ * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
+ * is non-empty, further callbacks must be placed into ->nocb_bypass,
+ * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
+ * back to direct use of ->cblist. However, ->nocb_bypass should not be
+ * used if ->cblist is empty, because otherwise callbacks can be stranded
+ * on ->nocb_bypass because we cannot count on the current CPU ever again
+ * invoking call_rcu(). The general rule is that if ->nocb_bypass is
+ * non-empty, the corresponding no-CBs grace-period kthread must not be
+ * in an indefinite sleep state.
+ *
+ * Finally, it is not permitted to use the bypass during early boot,
+ * as doing so would confuse the auto-initialization code. Besides
+ * which, there is no point in worrying about lock contention while
+ * there is only one CPU in operation.
+ */
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- unsigned long ret;
-#ifdef CONFIG_PROVE_RCU
- struct rcu_head *rhp;
-#endif /* #ifdef CONFIG_PROVE_RCU */
+ unsigned long c;
+ unsigned long cur_gp_seq;
+ unsigned long j = jiffies;
+ long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- /*
- * Check count of all no-CBs callbacks awaiting invocation.
- * There needs to be a barrier before this function is called,
- * but associated with a prior determination that no more
- * callbacks would be posted. In the worst case, the first
- * barrier in rcu_barrier() suffices (but the caller cannot
- * necessarily rely on this, not a substitute for the caller
- * getting the concurrency design right!). There must also be a
- * barrier between the following load and posting of a callback
- * (if a callback is in fact needed). This is associated with an
- * atomic_inc() in the caller.
- */
- ret = rcu_get_n_cbs_nocb_cpu(rdp);
-
-#ifdef CONFIG_PROVE_RCU
- rhp = READ_ONCE(rdp->nocb_head);
- if (!rhp)
- rhp = READ_ONCE(rdp->nocb_gp_head);
- if (!rhp)
- rhp = READ_ONCE(rdp->nocb_follower_head);
-
- /* Having no rcuo kthread but CBs after scheduler starts is bad! */
- if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
- rcu_scheduler_fully_active) {
- /* RCU callback enqueued before CPU first came online??? */
- pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
- cpu, rhp->func);
- WARN_ON_ONCE(1);
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false; /* Not offloaded, no bypassing. */
+ }
+ lockdep_assert_irqs_disabled();
+
+ // Don't use ->nocb_bypass during early boot.
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // If we have advanced to a new jiffy, reset counts to allow
+ // moving back from ->nocb_bypass to ->cblist.
+ if (j == rdp->nocb_nobypass_last) {
+ c = rdp->nocb_nobypass_count + 1;
+ } else {
+ WRITE_ONCE(rdp->nocb_nobypass_last, j);
+ c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
+ if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
+ nocb_nobypass_lim_per_jiffy))
+ c = 0;
+ else if (c > nocb_nobypass_lim_per_jiffy)
+ c = nocb_nobypass_lim_per_jiffy;
+ }
+ WRITE_ONCE(rdp->nocb_nobypass_count, c);
+
+ // If there hasn't yet been all that many ->cblist enqueues
+ // this jiffy, tell the caller to enqueue onto ->cblist. But flush
+ // ->nocb_bypass first.
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+
+ // If ->nocb_bypass has been used too long or is too full,
+ // flush ->nocb_bypass to ->cblist.
+ if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ ncbs >= qhimark) {
+ rcu_nocb_lock(rdp);
+ if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ return true; // Callback already enqueued.
}
-#endif /* #ifdef CONFIG_PROVE_RCU */
- return !!ret;
+ // We need to use the bypass.
+ rcu_nocb_wait_contended(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+ if (!ncbs) {
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
+ }
+ rcu_nocb_bypass_unlock(rdp);
+ smp_mb(); /* Order enqueue before wake. */
+ if (ncbs) {
+ local_irq_restore(flags);
+ } else {
+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQwake"));
+ __call_rcu_nocb_wake(rdp, true, flags);
+ } else {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQnoWake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+ }
+ return true; // Callback already enqueued.
}
/*
- * Enqueue the specified string of rcu_head structures onto the specified
- * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
- * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
- * counts are supplied by rhcount and rhcount_lazy.
+ * Awaken the no-CBs grace-period kthead if needed, either due to it
+ * legitimately being asleep or due to overload conditions.
*
* If warranted, also wake up the kthread servicing this CPUs queues.
*/
-static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
- struct rcu_head *rhp,
- struct rcu_head **rhtp,
- int rhcount, int rhcount_lazy,
- unsigned long flags)
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
{
- int len;
- struct rcu_head **old_rhpp;
+ unsigned long cur_gp_seq;
+ unsigned long j;
+ long len;
struct task_struct *t;
- /* Enqueue the callback on the nocb list and update counts. */
- atomic_long_add(rhcount, &rdp->nocb_q_count);
- /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
- old_rhpp = xchg(&rdp->nocb_tail, rhtp);
- WRITE_ONCE(*old_rhpp, rhp);
- atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
- smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
-
- /* If we are not being polled and there is a kthread, awaken it ... */
- t = READ_ONCE(rdp->nocb_kthread);
+ // If we are being polled or there is no kthread, just leave.
+ t = READ_ONCE(rdp->nocb_gp_kthread);
if (rcu_nocb_poll || !t) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeNotPoll"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
return;
}
- len = rcu_get_n_cbs_nocb_cpu(rdp);
- if (old_rhpp == &rdp->nocb_head) {
+ // Need to actually to a wakeup.
+ len = rcu_segcblist_n_cbs(&rdp->cblist);
+ if (was_alldone) {
+ rdp->qlen_last_fqs_check = len;
if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
- wake_nocb_leader(rdp, false);
+ wake_nocb_gp(rdp, false, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
- TPS("WakeEmptyIsDeferred"));
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
}
- rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
/* ... or if many callbacks queued. */
- if (!irqs_disabled_flags(flags)) {
- wake_nocb_leader(rdp, true);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeOvf"));
- } else {
- wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
+ rdp->qlen_last_fqs_check = len;
+ j = jiffies;
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
}
- rdp->qlen_last_fqs_check = LONG_MAX / 2;
+ smp_mb(); /* Enqueue before timer_pending(). */
+ if ((rdp->nocb_cb_sleep ||
+ !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
+ !timer_pending(&rdp->nocb_bypass_timer))
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
+ TPS("WakeOvfIsDeferred"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
} else {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
}
return;
}
-/*
- * This is a helper for __call_rcu(), which invokes this when the normal
- * callback queue is inoperable. If this is not a no-CBs CPU, this
- * function returns failure back to __call_rcu(), which can complain
- * appropriately.
- *
- * Otherwise, this function queues the callback where the corresponding
- * "rcuo" kthread can find it.
- */
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags)
+/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
+static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
{
+ unsigned long flags;
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
- if (!rcu_is_nocb_cpu(rdp->cpu))
- return false;
- __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
- if (__is_kfree_rcu_offset((unsigned long)rhp->func))
- trace_rcu_kfree_callback(rcu_state.name, rhp,
- (unsigned long)rhp->func,
- -atomic_long_read(&rdp->nocb_q_count_lazy),
- -rcu_get_n_cbs_nocb_cpu(rdp));
- else
- trace_rcu_callback(rcu_state.name, rhp,
- -atomic_long_read(&rdp->nocb_q_count_lazy),
- -rcu_get_n_cbs_nocb_cpu(rdp));
-
- /*
- * If called from an extended quiescent state with interrupts
- * disabled, invoke the RCU core in order to allow the idle-entry
- * deferred-wakeup check to function.
- */
- if (irqs_disabled_flags(flags) &&
- !rcu_is_watching() &&
- cpu_online(smp_processor_id()))
- invoke_rcu_core();
-
- return true;
-}
-
-/*
- * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
- * not a no-CBs CPU.
- */
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
- struct rcu_data *rdp,
- unsigned long flags)
-{
- lockdep_assert_irqs_disabled();
- if (!rcu_is_nocb_cpu(smp_processor_id()))
- return false; /* Not NOCBs CPU, caller must migrate CBs. */
- __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
- rcu_segcblist_tail(&rdp->cblist),
- rcu_segcblist_n_cbs(&rdp->cblist),
- rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
- rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_disable(&rdp->cblist);
- return true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ __call_rcu_nocb_wake(rdp, true, flags);
}
/*
- * If necessary, kick off a new grace period, and either way wait
- * for a subsequent grace period to complete.
+ * No-CBs GP kthreads come here to wait for additional callbacks to show up
+ * or for grace periods to end.
*/
-static void rcu_nocb_wait_gp(struct rcu_data *rdp)
+static void nocb_gp_wait(struct rcu_data *my_rdp)
{
- unsigned long c;
- bool d;
+ bool bypass = false;
+ long bypass_ncbs;
+ int __maybe_unused cpu = my_rdp->cpu;
+ unsigned long cur_gp_seq;
unsigned long flags;
+ bool gotcbs;
+ unsigned long j = jiffies;
+ bool needwait_gp = false; // This prevents actual uninitialized use.
bool needwake;
- struct rcu_node *rnp = rdp->mynode;
+ bool needwake_gp;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
- local_irq_save(flags);
- c = rcu_seq_snap(&rcu_state.gp_seq);
- if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
- local_irq_restore(flags);
- } else {
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- needwake = rcu_start_this_gp(rnp, rdp, c);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (needwake)
+ /*
+ * Each pass through the following loop checks for CBs and for the
+ * nearest grace period (if any) to wait for next. The CB kthreads
+ * and the global grace-period kthread are awakened if needed.
+ */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ if (bypass_ncbs &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
+ bypass_ncbs > 2 * qhimark)) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue; /* No callbacks here, try next. */
+ }
+ if (bypass_ncbs) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("Bypass"));
+ bypass = true;
+ }
+ rnp = rdp->mynode;
+ if (bypass) { // Avoid race with first bypass CB.
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup,
+ RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ }
+ // Advance callbacks if helpful and low contention.
+ needwake_gp = false;
+ if (!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL) ||
+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake_gp = rcu_advance_cbs(rnp, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
+ }
+ // Need to wait on some grace period?
+ WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL));
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+ if (!needwait_gp ||
+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
+ wait_gp_seq = cur_gp_seq;
+ needwait_gp = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("NeedWaitGP"));
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ needwake = rdp->nocb_cb_sleep;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ smp_mb(); /* CB invocation -after- GP end. */
+ } else {
+ needwake = false;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake) {
+ swake_up_one(&rdp->nocb_cb_wq);
+ gotcbs = true;
+ }
+ if (needwake_gp)
rcu_gp_kthread_wake();
}
- /*
- * Wait for the grace period. Do so interruptibly to avoid messing
- * up the load average.
- */
- trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
- for (;;) {
+ my_rdp->nocb_gp_bypass = bypass;
+ my_rdp->nocb_gp_gp = needwait_gp;
+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+ if (bypass && !rcu_nocb_poll) {
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ if (rcu_nocb_poll) {
+ /* Polling, so trace if first poll in the series. */
+ if (gotcbs)
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
+ schedule_timeout_interruptible(1);
+ } else if (!needwait_gp) {
+ /* Wait for callbacks to appear. */
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+ } else {
+ rnp = my_rdp->mynode;
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
swait_event_interruptible_exclusive(
- rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1],
- (d = rcu_seq_done(&rnp->gp_seq, c)));
- if (likely(d))
- break;
- WARN_ON(signal_pending(current));
- trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait"));
+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
}
- trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait"));
- smp_mb(); /* Ensure that CB invocation happens after GP end. */
+ if (!rcu_nocb_poll) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (bypass)
+ del_timer(&my_rdp->nocb_bypass_timer);
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ my_rdp->nocb_gp_seq = -1;
+ WARN_ON(signal_pending(current));
}
/*
- * Leaders come here to wait for additional callbacks to show up.
- * This function does not return until callbacks appear.
+ * No-CBs grace-period-wait kthread. There is one of these per group
+ * of CPUs, but only once at least one CPU in that group has come online
+ * at least once since boot. This kthread checks for newly posted
+ * callbacks from any of the CPUs it is responsible for, waits for a
+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
+ * that then have callback-invocation work to do.
*/
-static void nocb_leader_wait(struct rcu_data *my_rdp)
+static int rcu_nocb_gp_kthread(void *arg)
{
- bool firsttime = true;
- unsigned long flags;
- bool gotcbs;
- struct rcu_data *rdp;
- struct rcu_head **tail;
-
-wait_again:
-
- /* Wait for callbacks to appear. */
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep"));
- swait_event_interruptible_exclusive(my_rdp->nocb_wq,
- !READ_ONCE(my_rdp->nocb_leader_sleep));
- raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
- my_rdp->nocb_leader_sleep = true;
- WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
- } else if (firsttime) {
- firsttime = false; /* Don't drown trace log with "Poll"! */
- trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll"));
- }
-
- /*
- * Each pass through the following loop checks a follower for CBs.
- * We are our own first follower. Any CBs found are moved to
- * nocb_gp_head, where they await a grace period.
- */
- gotcbs = false;
- smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
- if (!rdp->nocb_gp_head)
- continue; /* No CBs here, try next follower. */
-
- /* Move callbacks to wait-for-GP list, which is empty. */
- WRITE_ONCE(rdp->nocb_head, NULL);
- rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
- gotcbs = true;
- }
-
- /* No callbacks? Sleep a bit if polling, and go retry. */
- if (unlikely(!gotcbs)) {
- WARN_ON(signal_pending(current));
- if (rcu_nocb_poll) {
- schedule_timeout_interruptible(1);
- } else {
- trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu,
- TPS("WokeEmpty"));
- }
- goto wait_again;
- }
+ struct rcu_data *rdp = arg;
- /* Wait for one grace period. */
- rcu_nocb_wait_gp(my_rdp);
-
- /* Each pass through the following loop wakes a follower, if needed. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (!rcu_nocb_poll &&
- READ_ONCE(rdp->nocb_head) &&
- READ_ONCE(my_rdp->nocb_leader_sleep)) {
- raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
- my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
- raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
- }
- if (!rdp->nocb_gp_head)
- continue; /* No CBs, so no need to wake follower. */
-
- /* Append callbacks to follower's "done" list. */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- tail = rdp->nocb_follower_tail;
- rdp->nocb_follower_tail = rdp->nocb_gp_tail;
- *tail = rdp->nocb_gp_head;
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
- /* List was empty, so wake up the follower. */
- swake_up_one(&rdp->nocb_wq);
- }
+ for (;;) {
+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+ nocb_gp_wait(rdp);
+ cond_resched_tasks_rcu_qs();
}
-
- /* If we (the leader) don't have CBs, go wait some more. */
- if (!my_rdp->nocb_follower_head)
- goto wait_again;
+ return 0;
}
/*
- * Followers come here to wait for additional callbacks to show up.
- * This function does not return until callbacks appear.
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
*/
-static void nocb_follower_wait(struct rcu_data *rdp)
+static void nocb_cb_wait(struct rcu_data *rdp)
{
- for (;;) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep"));
- swait_event_interruptible_exclusive(rdp->nocb_wq,
- READ_ONCE(rdp->nocb_follower_head));
- if (smp_load_acquire(&rdp->nocb_follower_head)) {
- /* ^^^ Ensure CB invocation follows _head test. */
- return;
- }
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool needwake_gp = false;
+ struct rcu_node *rnp = rdp->mynode;
+
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ local_bh_disable();
+ rcu_do_batch(rdp);
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ return;
+ }
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+ WRITE_ONCE(rdp->nocb_cb_sleep, true);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ !READ_ONCE(rdp->nocb_cb_sleep));
+ if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
+ /* ^^^ Ensure CB invocation follows _sleep test. */
+ return;
}
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
}
/*
- * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
- * callbacks queued by the corresponding no-CBs CPU, however, there is
- * an optional leader-follower relationship so that the grace-period
- * kthreads don't have to do quite so many wakeups.
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
*/
-static int rcu_nocb_kthread(void *arg)
+static int rcu_nocb_cb_kthread(void *arg)
{
- int c, cl;
- unsigned long flags;
- struct rcu_head *list;
- struct rcu_head *next;
- struct rcu_head **tail;
struct rcu_data *rdp = arg;
- /* Each pass through this loop invokes one batch of callbacks */
+ // Each pass through this loop does one callback batch, and,
+ // if there are no more ready callbacks, waits for them.
for (;;) {
- /* Wait for callbacks. */
- if (rdp->nocb_leader == rdp)
- nocb_leader_wait(rdp);
- else
- nocb_follower_wait(rdp);
-
- /* Pull the ready-to-invoke callbacks onto local list. */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- list = rdp->nocb_follower_head;
- rdp->nocb_follower_head = NULL;
- tail = rdp->nocb_follower_tail;
- rdp->nocb_follower_tail = &rdp->nocb_follower_head;
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- if (WARN_ON_ONCE(!list))
- continue;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty"));
-
- /* Each pass through the following loop invokes a callback. */
- trace_rcu_batch_start(rcu_state.name,
- atomic_long_read(&rdp->nocb_q_count_lazy),
- rcu_get_n_cbs_nocb_cpu(rdp), -1);
- c = cl = 0;
- while (list) {
- next = list->next;
- /* Wait for enqueuing to complete, if needed. */
- while (next == NULL && &list->next != tail) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WaitQueue"));
- schedule_timeout_interruptible(1);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WokeQueue"));
- next = list->next;
- }
- debug_rcu_head_unqueue(list);
- local_bh_disable();
- if (__rcu_reclaim(rcu_state.name, list))
- cl++;
- c++;
- local_bh_enable();
- cond_resched_tasks_rcu_qs();
- list = next;
- }
- trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1);
- smp_mb__before_atomic(); /* _add after CB invocation. */
- atomic_long_add(-c, &rdp->nocb_q_count);
- atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
+ nocb_cb_wait(rdp);
+ cond_resched_tasks_rcu_qs();
}
return 0;
}
@@ -1993,14 +2159,14 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
unsigned long flags;
int ndw;
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_nocb_need_deferred_wakeup(rdp)) {
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
return;
}
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
}
@@ -2027,6 +2193,7 @@ void __init rcu_init_nohz(void)
{
int cpu;
bool need_rcu_nocb_mask = false;
+ struct rcu_data *rdp;
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
@@ -2060,67 +2227,63 @@ void __init rcu_init_nohz(void)
if (rcu_nocb_poll)
pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
- for_each_cpu(cpu, rcu_nocb_mask)
- init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu));
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist);
+ }
rcu_organize_nocb_kthreads();
}
/* Initialize per-rcu_data variables for no-CBs CPUs. */
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
- rdp->nocb_tail = &rdp->nocb_head;
- init_swait_queue_head(&rdp->nocb_wq);
- rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ init_swait_queue_head(&rdp->nocb_cb_wq);
+ init_swait_queue_head(&rdp->nocb_gp_wq);
raw_spin_lock_init(&rdp->nocb_lock);
+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
+ raw_spin_lock_init(&rdp->nocb_gp_lock);
timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+ timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
+ rcu_cblist_init(&rdp->nocb_bypass);
}
/*
* If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread, spawn it. If the CPUs are brought online out of order,
- * this can require re-organizing the leader-follower relationships.
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
*/
static void rcu_spawn_one_nocb_kthread(int cpu)
{
- struct rcu_data *rdp;
- struct rcu_data *rdp_last;
- struct rcu_data *rdp_old_leader;
- struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp_gp;
struct task_struct *t;
/*
* If this isn't a no-CBs CPU or if it already has an rcuo kthread,
* then nothing to do.
*/
- if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
+ if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
return;
- /* If we didn't spawn the leader first, reorganize! */
- rdp_old_leader = rdp_spawn->nocb_leader;
- if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
- rdp_last = NULL;
- rdp = rdp_old_leader;
- do {
- rdp->nocb_leader = rdp_spawn;
- if (rdp_last && rdp != rdp_spawn)
- rdp_last->nocb_next_follower = rdp;
- if (rdp == rdp_spawn) {
- rdp = rdp->nocb_next_follower;
- } else {
- rdp_last = rdp;
- rdp = rdp->nocb_next_follower;
- rdp_last->nocb_next_follower = NULL;
- }
- } while (rdp);
- rdp_spawn->nocb_next_follower = rdp_old_leader;
+ /* If we didn't spawn the GP kthread first, reorganize! */
+ rdp_gp = rdp->nocb_gp_rdp;
+ if (!rdp_gp->nocb_gp_kthread) {
+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+ "rcuog/%d", rdp_gp->cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
+ return;
+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
}
/* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_kthread, rdp_spawn,
+ t = kthread_run(rcu_nocb_cb_kthread, rdp,
"rcuo%c/%d", rcu_state.abbr, cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__))
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
return;
- WRITE_ONCE(rdp_spawn->nocb_kthread, t);
+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
}
/*
@@ -2147,27 +2310,28 @@ static void __init rcu_spawn_nocb_kthreads(void)
rcu_spawn_cpu_nocb_kthread(cpu);
}
-/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
-static int rcu_nocb_leader_stride = -1;
-module_param(rcu_nocb_leader_stride, int, 0444);
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
/*
- * Initialize leader-follower relationships for all no-CBs CPU.
+ * Initialize GP-CB relationships for all no-CBs CPU.
*/
static void __init rcu_organize_nocb_kthreads(void)
{
int cpu;
- int ls = rcu_nocb_leader_stride;
- int nl = 0; /* Next leader. */
+ bool firsttime = true;
+ int ls = rcu_nocb_gp_stride;
+ int nl = 0; /* Next GP kthread. */
struct rcu_data *rdp;
- struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
+ struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
struct rcu_data *rdp_prev = NULL;
if (!cpumask_available(rcu_nocb_mask))
return;
if (ls == -1) {
- ls = int_sqrt(nr_cpu_ids);
- rcu_nocb_leader_stride = ls;
+ ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
+ rcu_nocb_gp_stride = ls;
}
/*
@@ -2178,39 +2342,24 @@ static void __init rcu_organize_nocb_kthreads(void)
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->cpu >= nl) {
- /* New leader, set up for followers & next leader. */
+ /* New GP kthread, set up for CBs & next GP. */
nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_leader = rdp;
- rdp_leader = rdp;
+ rdp->nocb_gp_rdp = rdp;
+ rdp_gp = rdp;
+ if (!firsttime && dump_tree)
+ pr_cont("\n");
+ firsttime = false;
+ pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu);
} else {
- /* Another follower, link to previous leader. */
- rdp->nocb_leader = rdp_leader;
- rdp_prev->nocb_next_follower = rdp;
+ /* Another CB kthread, link to previous GP kthread. */
+ rdp->nocb_gp_rdp = rdp_gp;
+ rdp_prev->nocb_next_cb_rdp = rdp;
+ pr_alert(" %d", cpu);
}
rdp_prev = rdp;
}
}
-/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
-static bool init_nocb_callback_list(struct rcu_data *rdp)
-{
- if (!rcu_is_nocb_cpu(rdp->cpu))
- return false;
-
- /* If there are early-boot callbacks, move them to nocb lists. */
- if (!rcu_segcblist_empty(&rdp->cblist)) {
- rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
- rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
- atomic_long_set(&rdp->nocb_q_count,
- rcu_segcblist_n_cbs(&rdp->cblist));
- atomic_long_set(&rdp->nocb_q_count_lazy,
- rcu_segcblist_n_lazy_cbs(&rdp->cblist));
- rcu_segcblist_init(&rdp->cblist);
- }
- rcu_segcblist_disable(&rdp->cblist);
- return true;
-}
-
/*
* Bind the current task to the offloaded CPUs. If there are no offloaded
* CPUs, leave the task unbound. Splat if the bind attempt fails.
@@ -2223,20 +2372,101 @@ void rcu_bind_current_to_nocb(void)
EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
/*
- * Return the number of RCU callbacks still queued from the specified
- * CPU, which must be a nocbs CPU.
+ * Dump out nocb grace-period kthread state for the specified rcu_data
+ * structure.
*/
-static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp)
+static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
{
- return atomic_long_read(&rdp->nocb_q_count);
+ struct rcu_node *rnp = rdp->mynode;
+
+ pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
+ rdp->cpu,
+ "kK"[!!rdp->nocb_gp_kthread],
+ "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[timer_pending(&rdp->nocb_timer)],
+ "bB"[timer_pending(&rdp->nocb_bypass_timer)],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[swait_active(&rdp->nocb_gp_wq)],
+ ".W"[swait_active(&rnp->nocb_gp_wq[0])],
+ ".W"[swait_active(&rnp->nocb_gp_wq[1])],
+ ".B"[!!rdp->nocb_gp_bypass],
+ ".G"[!!rdp->nocb_gp_gp],
+ (long)rdp->nocb_gp_seq,
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
+}
+
+/* Dump out nocb kthread state for the specified rcu_data structure. */
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *rsclp = &rdp->cblist;
+ bool waslocked;
+ bool wastimer;
+ bool wassleep;
+
+ if (rdp->nocb_gp_rdp == rdp)
+ show_rcu_nocb_gp_state(rdp);
+
+ pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
+ rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ "kK"[!!rdp->nocb_cb_kthread],
+ "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
+ "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
+ "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
+ "sS"[!!rdp->nocb_cb_sleep],
+ ".W"[swait_active(&rdp->nocb_cb_wq)],
+ jiffies - rdp->nocb_bypass_first,
+ jiffies - rdp->nocb_nobypass_last,
+ rdp->nocb_nobypass_count,
+ ".D"[rcu_segcblist_ready_cbs(rsclp)],
+ ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
+ ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
+ ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ rcu_segcblist_n_cbs(&rdp->cblist));
+
+ /* It is OK for GP kthreads to have GP state. */
+ if (rdp->nocb_gp_rdp == rdp)
+ return;
+
+ waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
+ wastimer = timer_pending(&rdp->nocb_timer);
+ wassleep = swait_active(&rdp->nocb_gp_wq);
+ if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep &&
+ !waslocked && !wastimer && !wassleep)
+ return; /* Nothing untowards. */
+
+ pr_info(" !!! %c%c%c%c %c\n",
+ "lL"[waslocked],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[wastimer],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[wassleep]);
}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static bool rcu_nocb_cpu_needs_barrier(int cpu)
+/* No ->nocb_lock to acquire. */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
{
- WARN_ON_ONCE(1); /* Should be dead code. */
- return false;
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ local_irq_restore(flags);
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
}
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
@@ -2252,19 +2482,24 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags)
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
{
- return false;
+ return true;
}
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
- struct rcu_data *rdp,
- unsigned long flags)
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
{
return false;
}
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags)
+{
+ WARN_ON_ONCE(1); /* Should be dead code! */
+}
+
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}
@@ -2286,14 +2521,8 @@ static void __init rcu_spawn_nocb_kthreads(void)
{
}
-static bool init_nocb_callback_list(struct rcu_data *rdp)
-{
- return false;
-}
-
-static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp)
+static void show_rcu_nocb_state(struct rcu_data *rdp)
{
- return 0;
}
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 065183391f75..c0b8c458d8a6 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp)
//
// Printing RCU CPU stall warnings
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* Dump detailed information for all tasks blocking the current RCU
@@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
return ndetected;
}
-#else /* #ifdef CONFIG_PREEMPT */
+#else /* #ifdef CONFIG_PREEMPTION */
/*
* Because preemptible RCU does not exist, we never have to check for
@@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
{
return 0;
}
-#endif /* #else #ifdef CONFIG_PREEMPT */
+#endif /* #else #ifdef CONFIG_PREEMPTION */
/*
* Dump stacks of all tasks running on stalled CPUs. First try using
@@ -527,6 +527,8 @@ static void check_cpu_stall(struct rcu_data *rdp)
/* We haven't checked in, so go dump stack. */
print_cpu_stall();
+ if (rcu_cpu_stall_ftrace_dump)
+ rcu_ftrace_dump(DUMP_ALL);
} else if (rcu_gp_in_progress() &&
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
@@ -534,6 +536,8 @@ static void check_cpu_stall(struct rcu_data *rdp)
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(gs2);
+ if (rcu_cpu_stall_ftrace_dump)
+ rcu_ftrace_dump(DUMP_ALL);
}
}
@@ -585,6 +589,11 @@ void show_rcu_gp_kthreads(void)
cpu, (long)rdp->gp_seq_needed);
}
}
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
+ show_rcu_nocb_state(rdp);
+ }
/* sched_show_task(rcu_state.gp_kthread); */
}
EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 61df2bf08563..1861103662db 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -61,9 +61,15 @@ module_param(rcu_normal_after_boot, int, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
- * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
+ * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section?
+ * @ret: Best guess answer if lockdep cannot be relied on
*
- * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
+ * Returns true if lockdep must be ignored, in which case *ret contains
+ * the best guess described below. Otherwise returns false, in which
+ * case *ret tells the caller nothing and the caller should instead
+ * consult lockdep.
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an
* RCU-sched read-side critical section. In absence of
* CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
* critical section unless it can prove otherwise. Note that disabling
@@ -75,35 +81,45 @@ module_param(rcu_normal_after_boot, int, 0);
* Check debug_lockdep_rcu_enabled() to prevent false positives during boot
* and while lockdep is disabled.
*
- * Note that if the CPU is in the idle loop from an RCU point of
- * view (ie: that we are in the section between rcu_idle_enter() and
- * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
- * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs
- * that are in such a section, considering these as in extended quiescent
- * state, so such a CPU is effectively never in an RCU read-side critical
- * section regardless of what RCU primitives it invokes. This state of
- * affairs is required --- we need to keep an RCU-free window in idle
- * where the CPU may possibly enter into low power mode. This way we can
- * notice an extended quiescent state to other CPUs that started a grace
- * period. Otherwise we would delay any grace period as long as we run in
- * the idle task.
+ * Note that if the CPU is in the idle loop from an RCU point of view (ie:
+ * that we are in the section between rcu_idle_enter() and rcu_idle_exit())
+ * then rcu_read_lock_held() sets *ret to false even if the CPU did an
+ * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are
+ * in such a section, considering these as in extended quiescent state,
+ * so such a CPU is effectively never in an RCU read-side critical section
+ * regardless of what RCU primitives it invokes. This state of affairs is
+ * required --- we need to keep an RCU-free window in idle where the CPU may
+ * possibly enter into low power mode. This way we can notice an extended
+ * quiescent state to other CPUs that started a grace period. Otherwise
+ * we would delay any grace period as long as we run in the idle task.
*
- * Similarly, we avoid claiming an SRCU read lock held if the current
+ * Similarly, we avoid claiming an RCU read lock held if the current
* CPU is offline.
*/
+static bool rcu_read_lock_held_common(bool *ret)
+{
+ if (!debug_lockdep_rcu_enabled()) {
+ *ret = 1;
+ return true;
+ }
+ if (!rcu_is_watching()) {
+ *ret = 0;
+ return true;
+ }
+ if (!rcu_lockdep_current_cpu_online()) {
+ *ret = 0;
+ return true;
+ }
+ return false;
+}
+
int rcu_read_lock_sched_held(void)
{
- int lockdep_opinion = 0;
+ bool ret;
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
- if (debug_locks)
- lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
- return lockdep_opinion || !preemptible();
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ return lock_is_held(&rcu_sched_lock_map) || !preemptible();
}
EXPORT_SYMBOL(rcu_read_lock_sched_held);
#endif
@@ -136,8 +152,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
*/
bool rcu_gp_is_expedited(void)
{
- return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
- rcu_scheduler_active == RCU_SCHEDULER_INIT;
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
@@ -261,12 +276,10 @@ NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);
*/
int rcu_read_lock_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return lock_is_held(&rcu_lock_map);
}
EXPORT_SYMBOL_GPL(rcu_read_lock_held);
@@ -288,16 +301,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held);
*/
int rcu_read_lock_bh_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
+int rcu_read_lock_any_held(void)
+{
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ if (lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map))
+ return 1;
+ return !preemptible();
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_any_held);
+
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/**
@@ -437,6 +462,8 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
#endif
#ifdef CONFIG_RCU_STALL_COMMON
+int rcu_cpu_stall_ftrace_dump __read_mostly;
+module_param(rcu_cpu_stall_ftrace_dump, int, 0644);
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
module_param(rcu_cpu_stall_suppress, int, 0644);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b037f195473..06961b997ed6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
/* Max allowed minimum utilization */
unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
@@ -798,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
-static inline unsigned int uclamp_none(int clamp_id)
+static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@@ -814,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
}
static inline unsigned int
-uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
/*
@@ -830,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
return uclamp_none(UCLAMP_MIN);
}
-static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
/* Reset max-clamp retention only on idle exit */
@@ -841,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
}
static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
- unsigned int clamp_value)
+enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
int bucket_id = UCLAMP_BUCKETS - 1;
@@ -861,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ struct uclamp_se uc_max;
+
+ /*
+ * Tasks in autogroups or root task group will be
+ * restricted by system defaults.
+ */
+ if (task_group_is_autogroup(task_group(p)))
+ return uc_req;
+ if (task_group(p) == &root_task_group)
+ return uc_req;
+
+ uc_max = task_group(p)->uclamp[clamp_id];
+ if (uc_req.value > uc_max.value || !uc_req.user_defined)
+ return uc_max;
+#endif
+
+ return uc_req;
+}
+
/*
* The effective clamp bucket index of a task depends on, by increasing
* priority:
* - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks not either in the root
+ * group or in an autogroup
* - the system default clamp value, defined by the sysadmin
*/
static inline struct uclamp_se
-uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
- struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
struct uclamp_se uc_max = uclamp_default[clamp_id];
/* System default restrictions always apply */
@@ -880,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
return uc_req;
}
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
@@ -904,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
* for each bucket when all its RUNNABLE tasks require the same clamp.
*/
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
- unsigned int clamp_id)
+ enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -942,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
* enforce the expected state and warn.
*/
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
- unsigned int clamp_id)
+ enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -981,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -996,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -1005,15 +1043,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
uclamp_rq_dec_id(rq, p, clamp_id);
}
+static inline void
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ /*
+ * Lock the task and the rq where the task is (or was) queued.
+ *
+ * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+ * price to pay to safely serialize util_{min,max} updates with
+ * enqueues, dequeues and migration operations.
+ * This is the same locking schema used by __set_cpus_allowed_ptr().
+ */
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Setting the clamp bucket is serialized by task_rq_lock().
+ * If the task is not yet RUNNABLE and its task_struct is not
+ * affecting a valid clamp bucket, the next time it's enqueued,
+ * it will already see the updated clamp bucket value.
+ */
+ if (!p->uclamp[clamp_id].active) {
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+ }
+
+ task_rq_unlock(rq, p, &rf);
+}
+
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+ unsigned int clamps)
+{
+ enum uclamp_id clamp_id;
+ struct css_task_iter it;
+ struct task_struct *p;
+
+ css_task_iter_start(css, 0, &it);
+ while ((p = css_task_iter_next(&it))) {
+ for_each_clamp_id(clamp_id) {
+ if ((0x1 << clamp_id) & clamps)
+ uclamp_update_active(p, clamp_id);
+ }
+ }
+ css_task_iter_end(&it);
+}
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+static void uclamp_update_root_tg(void)
+{
+ struct task_group *tg = &root_task_group;
+
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+ sysctl_sched_uclamp_util_min, false);
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+ sysctl_sched_uclamp_util_max, false);
+
+ rcu_read_lock();
+ cpu_util_update_eff(&root_task_group.css);
+ rcu_read_unlock();
+}
+#else
+static void uclamp_update_root_tg(void) { }
+#endif
+
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
+ bool update_root_tg = false;
int old_min, old_max;
- static DEFINE_MUTEX(mutex);
int result;
- mutex_lock(&mutex);
+ mutex_lock(&uclamp_mutex);
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
@@ -1032,23 +1137,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
if (old_min != sysctl_sched_uclamp_util_min) {
uclamp_se_set(&uclamp_default[UCLAMP_MIN],
sysctl_sched_uclamp_util_min, false);
+ update_root_tg = true;
}
if (old_max != sysctl_sched_uclamp_util_max) {
uclamp_se_set(&uclamp_default[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
+ update_root_tg = true;
}
+ if (update_root_tg)
+ uclamp_update_root_tg();
+
/*
- * Updating all the RUNNABLE task is expensive, keep it simple and do
- * just a lazy update at each next enqueue time.
+ * We update all RUNNABLE tasks only when task groups are in use.
+ * Otherwise, keep it simple and do just a lazy update at each next
+ * task enqueue time.
*/
+
goto done;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
done:
- mutex_unlock(&mutex);
+ mutex_unlock(&uclamp_mutex);
return result;
}
@@ -1075,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
/*
* On scheduling class change, reset to default clamps for tasks
@@ -1112,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p,
static void uclamp_fork(struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
@@ -1134,9 +1246,11 @@ static void uclamp_fork(struct task_struct *p)
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
int cpu;
+ mutex_init(&uclamp_mutex);
+
for_each_possible_cpu(cpu) {
memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
cpu_rq(cpu)->uclamp_flags = 0;
@@ -1149,8 +1263,13 @@ static void __init init_uclamp(void)
/* System defaults allow max clamp values for both indexes */
uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
- for_each_clamp_id(clamp_id)
+ for_each_clamp_id(clamp_id) {
uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ root_task_group.uclamp_req[clamp_id] = uc_max;
+ root_task_group.uclamp[clamp_id] = uc_max;
+#endif
+ }
}
#else /* CONFIG_UCLAMP_TASK */
@@ -1494,7 +1613,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
}
/*
@@ -3214,12 +3333,8 @@ static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
- struct mm_struct *mm, *oldmm;
-
prepare_task_switch(rq, prev, next);
- mm = next->mm;
- oldmm = prev->active_mm;
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
@@ -3228,22 +3343,37 @@ context_switch(struct rq *rq, struct task_struct *prev,
arch_start_context_switch(prev);
/*
- * If mm is non-NULL, we pass through switch_mm(). If mm is
- * NULL, we will pass through mmdrop() in finish_task_switch().
- * Both of these contain the full memory barrier required by
- * membarrier after storing to rq->curr, before returning to
- * user-space.
+ * kernel -> kernel lazy + transfer active
+ * user -> kernel lazy + mmgrab() active
+ *
+ * kernel -> user switch + mmdrop() active
+ * user -> user switch
*/
- if (!mm) {
- next->active_mm = oldmm;
- mmgrab(oldmm);
- enter_lazy_tlb(oldmm, next);
- } else
- switch_mm_irqs_off(oldmm, mm, next);
+ if (!next->mm) { // to kernel
+ enter_lazy_tlb(prev->active_mm, next);
+
+ next->active_mm = prev->active_mm;
+ if (prev->mm) // from user
+ mmgrab(prev->active_mm);
+ else
+ prev->active_mm = NULL;
+ } else { // to user
+ /*
+ * sys_membarrier() requires an smp_mb() between setting
+ * rq->curr and returning to userspace.
+ *
+ * The below provides this either through switch_mm(), or in
+ * case 'prev->active_mm == next->mm' through
+ * finish_task_switch()'s mmdrop().
+ */
- if (!prev->mm) {
- prev->active_mm = NULL;
- rq->prev_mm = oldmm;
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+ if (!prev->mm) { // from kernel
+ /* will mmdrop() in finish_task_switch(). */
+ rq->prev_mm = prev->active_mm;
+ prev->active_mm = NULL;
+ }
}
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@ -3486,8 +3616,36 @@ void scheduler_tick(void)
struct tick_work {
int cpu;
+ atomic_t state;
struct delayed_work work;
};
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE 0
+#define TICK_SCHED_REMOTE_OFFLINING 1
+#define TICK_SCHED_REMOTE_RUNNING 2
+
+/*
+ * State diagram for ->state:
+ *
+ *
+ * TICK_SCHED_REMOTE_OFFLINE
+ * | ^
+ * | |
+ * | | sched_tick_remote()
+ * | |
+ * | |
+ * +--TICK_SCHED_REMOTE_OFFLINING
+ * | ^
+ * | |
+ * sched_tick_start() | | sched_tick_stop()
+ * | |
+ * V |
+ * TICK_SCHED_REMOTE_RUNNING
+ *
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
+ */
static struct tick_work __percpu *tick_work_cpu;
@@ -3500,6 +3658,7 @@ static void sched_tick_remote(struct work_struct *work)
struct task_struct *curr;
struct rq_flags rf;
u64 delta;
+ int os;
/*
* Handle the tick only if it appears the remote CPU is running in full
@@ -3513,7 +3672,7 @@ static void sched_tick_remote(struct work_struct *work)
rq_lock_irq(rq, &rf);
curr = rq->curr;
- if (is_idle_task(curr))
+ if (is_idle_task(curr) || cpu_is_offline(cpu))
goto out_unlock;
update_rq_clock(rq);
@@ -3533,13 +3692,18 @@ out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
- * to keep scheduler internal stats reasonably up to date.
+ * to keep scheduler internal stats reasonably up to date. But
+ * first update state to reflect hotplug activity if required.
*/
- queue_delayed_work(system_unbound_wq, dwork, HZ);
+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+ if (os == TICK_SCHED_REMOTE_RUNNING)
+ queue_delayed_work(system_unbound_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
{
+ int os;
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
@@ -3548,15 +3712,20 @@ static void sched_tick_start(int cpu)
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- twork->cpu = cpu;
- INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
- queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+ if (os == TICK_SCHED_REMOTE_OFFLINE) {
+ twork->cpu = cpu;
+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
static void sched_tick_stop(int cpu)
{
struct tick_work *twork;
+ int os;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
return;
@@ -3564,7 +3733,10 @@ static void sched_tick_stop(int cpu)
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- cancel_delayed_work_sync(&twork->work);
+ /* There cannot be competing actions, but don't rely on stop-machine. */
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+ /* Don't cancel, as this would mess up the state machine. */
}
#endif /* CONFIG_HOTPLUG_CPU */
@@ -3572,7 +3744,6 @@ int __init sched_tick_offload_init(void)
{
tick_work_cpu = alloc_percpu(struct tick_work);
BUG_ON(!tick_work_cpu);
-
return 0;
}
@@ -3581,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
#endif
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_TRACE_PREEMPT_TOGGLE))
/*
* If the value passed in is equal to the current preempt count
@@ -3739,7 +3910,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
- goto again;
+ goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
@@ -3748,14 +3919,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return p;
}
-again:
+restart:
+ /*
+ * Ensure that we put DL/RT tasks before the pick loop, such that they
+ * can PULL higher prio tasks when we lower the RQ 'priority'.
+ */
+ prev->sched_class->put_prev_task(rq, prev, rf);
+ if (!rq->nr_running)
+ newidle_balance(rq, rf);
+
for_each_class(class) {
- p = class->pick_next_task(rq, prev, rf);
- if (p) {
- if (unlikely(p == RETRY_TASK))
- goto again;
+ p = class->pick_next_task(rq, NULL, NULL);
+ if (p)
return p;
- }
}
/* The idle class should always have a runnable task: */
@@ -3782,7 +3958,7 @@ again:
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
* called on the nearest possible occasion:
*
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
*
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
@@ -3791,7 +3967,7 @@ again:
* - in IRQ context, return from interrupt-handler to
* preemptible context
*
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
* then at the next:
*
* - cond_resched() call
@@ -3904,7 +4080,7 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state || tsk_is_pi_blocked(tsk))
+ if (!tsk->state)
return;
/*
@@ -3920,6 +4096,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
preempt_enable_no_resched();
}
+ if (tsk_is_pi_blocked(tsk))
+ return;
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
@@ -4033,7 +4212,7 @@ static void __sched notrace preempt_schedule_common(void)
} while (need_resched());
}
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* this is the entry point to schedule() from in-kernel preemption
* off of preempt_enable. Kernel preemptions off return from interrupt
@@ -4105,7 +4284,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
/*
* this is the entry point to schedule() from kernel preemption
@@ -4273,7 +4452,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (queued)
enqueue_task(rq, p, queue_flag);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -4340,7 +4519,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_curr(rq);
}
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -4657,6 +4836,9 @@ recheck:
return retval;
}
+ if (pi)
+ cpuset_read_lock();
+
/*
* Make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
@@ -4671,8 +4853,8 @@ recheck:
* Changing the policy of the stop threads its a very bad idea:
*/
if (p == rq->stop) {
- task_rq_unlock(rq, p, &rf);
- return -EINVAL;
+ retval = -EINVAL;
+ goto unlock;
}
/*
@@ -4690,8 +4872,8 @@ recheck:
goto change;
p->sched_reset_on_fork = reset_on_fork;
- task_rq_unlock(rq, p, &rf);
- return 0;
+ retval = 0;
+ goto unlock;
}
change:
@@ -4704,8 +4886,8 @@ change:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
+ retval = -EPERM;
+ goto unlock;
}
#endif
#ifdef CONFIG_SMP
@@ -4720,8 +4902,8 @@ change:
*/
if (!cpumask_subset(span, p->cpus_ptr) ||
rq->rd->dl_bw.bw == 0) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
+ retval = -EPERM;
+ goto unlock;
}
}
#endif
@@ -4731,6 +4913,8 @@ change:
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
goto recheck;
}
@@ -4740,8 +4924,8 @@ change:
* is available.
*/
if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
- task_rq_unlock(rq, p, &rf);
- return -EBUSY;
+ retval = -EBUSY;
+ goto unlock;
}
p->sched_reset_on_fork = reset_on_fork;
@@ -4783,7 +4967,7 @@ change:
enqueue_task(rq, p, queue_flags);
}
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
@@ -4791,14 +4975,22 @@ change:
preempt_disable();
task_rq_unlock(rq, p, &rf);
- if (pi)
+ if (pi) {
+ cpuset_read_unlock();
rt_mutex_adjust_pi(p);
+ }
/* Run balance callbacks after we've adjusted the PI chain: */
balance_callback(rq);
preempt_enable();
return 0;
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
+ return retval;
}
static int _sched_setscheduler(struct task_struct *p, int policy,
@@ -4882,10 +5074,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setscheduler(p, policy, &lparam);
+ if (likely(p))
+ get_task_struct(p);
rcu_read_unlock();
+ if (likely(p)) {
+ retval = sched_setscheduler(p, policy, &lparam);
+ put_task_struct(p);
+ }
+
return retval;
}
@@ -5102,37 +5299,40 @@ out_unlock:
return retval;
}
-static int sched_read_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr,
- unsigned int usize)
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ struct sched_attr *kattr,
+ unsigned int usize)
{
- int ret;
+ unsigned int ksize = sizeof(*kattr);
if (!access_ok(uattr, usize))
return -EFAULT;
/*
- * If we're handed a smaller struct than we know of,
- * ensure all the unknown bits are 0 - i.e. old
- * user-space does not get uncomplete information.
+ * sched_getattr() ABI forwards and backwards compatibility:
+ *
+ * If usize == ksize then we just copy everything to user-space and all is good.
+ *
+ * If usize < ksize then we only copy as much as user-space has space for,
+ * this keeps ABI compatibility as well. We skip the rest.
+ *
+ * If usize > ksize then user-space is using a newer version of the ABI,
+ * which part the kernel doesn't know about. Just ignore it - tooling can
+ * detect the kernel's knowledge of attributes from the attr->size value
+ * which is set to ksize in this case.
*/
- if (usize < sizeof(*attr)) {
- unsigned char *addr;
- unsigned char *end;
-
- addr = (void *)attr + usize;
- end = (void *)attr + sizeof(*attr);
-
- for (; addr < end; addr++) {
- if (*addr)
- return -EFBIG;
- }
-
- attr->size = usize;
- }
+ kattr->size = min(usize, ksize);
- ret = copy_to_user(uattr, attr, attr->size);
- if (ret)
+ if (copy_to_user(uattr, kattr, kattr->size))
return -EFAULT;
return 0;
@@ -5142,20 +5342,18 @@ static int sched_read_attr(struct sched_attr __user *uattr,
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
+ * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
* @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, size, unsigned int, flags)
+ unsigned int, usize, unsigned int, flags)
{
- struct sched_attr attr = {
- .size = sizeof(struct sched_attr),
- };
+ struct sched_attr kattr = { };
struct task_struct *p;
int retval;
- if (!uattr || pid < 0 || size > PAGE_SIZE ||
- size < SCHED_ATTR_SIZE_VER0 || flags)
+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
rcu_read_lock();
@@ -5168,25 +5366,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
if (retval)
goto out_unlock;
- attr.sched_policy = p->policy;
+ kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
if (task_has_dl_policy(p))
- __getparam_dl(p, &attr);
+ __getparam_dl(p, &kattr);
else if (task_has_rt_policy(p))
- attr.sched_priority = p->rt_priority;
+ kattr.sched_priority = p->rt_priority;
else
- attr.sched_nice = task_nice(p);
+ kattr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
- attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
rcu_read_unlock();
- retval = sched_read_attr(uattr, &attr, size);
- return retval;
+ return sched_attr_copy_to_user(uattr, &kattr, usize);
out_unlock:
rcu_read_unlock();
@@ -5416,7 +5613,7 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
int __sched _cond_resched(void)
{
if (should_resched(0)) {
@@ -5433,7 +5630,7 @@ EXPORT_SYMBOL(_cond_resched);
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
* call schedule, and on return reacquire the lock.
*
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
@@ -5972,7 +6169,7 @@ void sched_setnuma(struct task_struct *p, int nid)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -6012,21 +6209,22 @@ static void calc_load_migrate(struct rq *rq)
atomic_long_add(delta, &calc_load_tasks);
}
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+static struct task_struct *__pick_migrate_task(struct rq *rq)
{
-}
+ const struct sched_class *class;
+ struct task_struct *next;
-static const struct sched_class fake_sched_class = {
- .put_prev_task = put_prev_task_fake,
-};
+ for_each_class(class) {
+ next = class->pick_next_task(rq, NULL, NULL);
+ if (next) {
+ next->sched_class->put_prev_task(rq, next, NULL);
+ return next;
+ }
+ }
-static struct task_struct fake_task = {
- /*
- * Avoid pull_{rt,dl}_task()
- */
- .prio = MAX_PRIO + 1,
- .sched_class = &fake_sched_class,
-};
+ /* The idle class should always have a runnable task */
+ BUG();
+}
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
@@ -6069,12 +6267,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
if (rq->nr_running == 1)
break;
- /*
- * pick_next_task() assumes pinned rq->lock:
- */
- next = pick_next_task(rq, &fake_task, rf);
- BUG_ON(!next);
- put_prev_task(rq, next);
+ next = __pick_migrate_task(rq);
/*
* Rules for changing task_struct::cpus_mask are holding
@@ -6371,19 +6564,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
void __init sched_init(void)
{
- unsigned long alloc_size = 0, ptr;
+ unsigned long ptr = 0;
int i;
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
- if (alloc_size) {
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+ if (ptr) {
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.se = (struct sched_entity **)ptr;
@@ -6702,7 +6895,7 @@ struct task_struct *curr_task(int cpu)
#ifdef CONFIG_IA64
/**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
* @cpu: the processor in question.
* @p: the task pointer to set.
*
@@ -6727,6 +6920,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+ struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ enum uclamp_id clamp_id;
+
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&tg->uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
+ tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+ }
+#endif
+}
+
static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
@@ -6750,6 +6957,8 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ alloc_uclamp_sched_group(tg, parent);
+
return tg;
err:
@@ -6853,7 +7062,7 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, queue_flags);
if (running)
- set_curr_task(rq, tsk);
+ set_next_task(rq, tsk);
task_rq_unlock(rq, tsk, &rf);
}
@@ -6936,10 +7145,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#else
- /* We don't support RT-tasks being in separate groups */
- if (task->sched_class != &fair_sched_class)
- return -EINVAL;
#endif
/*
* Serialize against wake_up_new_task() such that if its
@@ -6970,6 +7175,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
sched_move_task(task);
}
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *top_css = css;
+ struct uclamp_se *uc_parent = NULL;
+ struct uclamp_se *uc_se = NULL;
+ unsigned int eff[UCLAMP_CNT];
+ enum uclamp_id clamp_id;
+ unsigned int clamps;
+
+ css_for_each_descendant_pre(css, top_css) {
+ uc_parent = css_tg(css)->parent
+ ? css_tg(css)->parent->uclamp : NULL;
+
+ for_each_clamp_id(clamp_id) {
+ /* Assume effective clamps matches requested clamps */
+ eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+ /* Cap effective clamps with parent's effective clamps */
+ if (uc_parent &&
+ eff[clamp_id] > uc_parent[clamp_id].value) {
+ eff[clamp_id] = uc_parent[clamp_id].value;
+ }
+ }
+ /* Ensure protection is always capped by limit */
+ eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+ /* Propagate most restrictive effective clamps */
+ clamps = 0x0;
+ uc_se = css_tg(css)->uclamp;
+ for_each_clamp_id(clamp_id) {
+ if (eff[clamp_id] == uc_se[clamp_id].value)
+ continue;
+ uc_se[clamp_id].value = eff[clamp_id];
+ uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+ clamps |= (0x1 << clamp_id);
+ }
+ if (!clamps) {
+ css = css_rightmost_descendant(css);
+ continue;
+ }
+
+ /* Immediately update descendants RUNNABLE tasks */
+ uclamp_update_active_tasks(css, clamps);
+ }
+}
+
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+struct uclamp_request {
+#define UCLAMP_PERCENT_SHIFT 2
+#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
+ s64 percent;
+ u64 util;
+ int ret;
+};
+
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+ struct uclamp_request req = {
+ .percent = UCLAMP_PERCENT_SCALE,
+ .util = SCHED_CAPACITY_SCALE,
+ .ret = 0,
+ };
+
+ buf = strim(buf);
+ if (strcmp(buf, "max")) {
+ req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+ &req.percent);
+ if (req.ret)
+ return req;
+ if (req.percent > UCLAMP_PERCENT_SCALE) {
+ req.ret = -ERANGE;
+ return req;
+ }
+
+ req.util = req.percent << SCHED_CAPACITY_SHIFT;
+ req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+ }
+
+ return req;
+}
+
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off,
+ enum uclamp_id clamp_id)
+{
+ struct uclamp_request req;
+ struct task_group *tg;
+
+ req = capacity_from_percent(buf);
+ if (req.ret)
+ return req.ret;
+
+ mutex_lock(&uclamp_mutex);
+ rcu_read_lock();
+
+ tg = css_tg(of_css(of));
+ if (tg->uclamp_req[clamp_id].value != req.util)
+ uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+
+ /*
+ * Because of not recoverable conversion rounding we keep track of the
+ * exact requested value
+ */
+ tg->uclamp_pct[clamp_id] = req.percent;
+
+ /* Update effective clamps to track the most restrictive value */
+ cpu_util_update_eff(of_css(of));
+
+ rcu_read_unlock();
+ mutex_unlock(&uclamp_mutex);
+
+ return nbytes;
+}
+
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+}
+
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+}
+
+static inline void cpu_uclamp_print(struct seq_file *sf,
+ enum uclamp_id clamp_id)
+{
+ struct task_group *tg;
+ u64 util_clamp;
+ u64 percent;
+ u32 rem;
+
+ rcu_read_lock();
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
+ rcu_read_unlock();
+
+ if (util_clamp == SCHED_CAPACITY_SCALE) {
+ seq_puts(sf, "max\n");
+ return;
+ }
+
+ percent = tg->uclamp_pct[clamp_id];
+ percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+ seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+ cpu_uclamp_print(sf, UCLAMP_MIN);
+ return 0;
+}
+
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+ cpu_uclamp_print(sf, UCLAMP_MAX);
+ return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
@@ -7315,6 +7692,20 @@ static struct cftype cpu_legacy_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
+ },
+#endif
{ } /* Terminate */
};
@@ -7482,6 +7873,20 @@ static struct cftype cpu_files[] = {
.write = cpu_max_write,
},
#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
+ },
+#endif
{ } /* terminate */
};
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 636ca6f88c8e..fdce9cfaca05 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -40,6 +40,7 @@ struct sugov_policy {
struct task_struct *thread;
bool work_in_progress;
+ bool limits_changed;
bool need_freq_update;
};
@@ -89,8 +90,11 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
- if (unlikely(sg_policy->need_freq_update))
+ if (unlikely(sg_policy->limits_changed)) {
+ sg_policy->limits_changed = false;
+ sg_policy->need_freq_update = true;
return true;
+ }
delta_ns = time - sg_policy->last_freq_update_time;
@@ -259,9 +263,9 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
* irq metric. Because IRQ/steal time is hidden from the task clock we
* need to scale the task numbers:
*
- * 1 - irq
- * U' = irq + ------- * U
- * max
+ * max - irq
+ * U' = irq + --------- * U
+ * max
*/
util = scale_irq_capacity(util, irq, max);
util += irq;
@@ -437,7 +441,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
- sg_policy->need_freq_update = true;
+ sg_policy->limits_changed = true;
}
static void sugov_update_single(struct update_util_data *hook, u64 time,
@@ -457,7 +461,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (!sugov_should_update_freq(sg_policy, time))
return;
- busy = sugov_cpu_is_busy(sg_cpu);
+ /* Limits may have changed, don't skip frequency update */
+ busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
@@ -831,6 +836,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->last_freq_update_time = 0;
sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
+ sg_policy->limits_changed = false;
sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
@@ -879,7 +885,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
mutex_unlock(&sg_policy->work_lock);
}
- sg_policy->need_freq_update = true;
+ sg_policy->limits_changed = true;
}
struct cpufreq_governor schedutil_gov = {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ef5b9f6b1d42..39dc9f74f289 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
{
struct rq *later_rq = NULL;
+ struct dl_bw *dl_b;
later_rq = find_lock_later_rq(p, rq);
if (!later_rq) {
@@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
double_lock_balance(rq, later_rq);
}
+ if (p->dl.dl_non_contending || p->dl.dl_throttled) {
+ /*
+ * Inactive timer is armed (or callback is running, but
+ * waiting for us to release rq locks). In any case, when it
+ * will fire (or continue), it will see running_bw of this
+ * task migrated to later_rq (and correctly handle it).
+ */
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
+
+ add_rq_bw(&p->dl, &later_rq->dl);
+ add_running_bw(&p->dl, &later_rq->dl);
+ } else {
+ sub_rq_bw(&p->dl, &rq->dl);
+ add_rq_bw(&p->dl, &later_rq->dl);
+ }
+
+ /*
+ * And we finally need to fixup root_domain(s) bandwidth accounting,
+ * since p is still hanging out in the old (now moved to default) root
+ * domain.
+ */
+ dl_b = &rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
+
+ dl_b = &later_rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
+
set_task_cpu(p, later_rq->cpu);
double_unlock_balance(later_rq, rq);
@@ -1694,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p)
{
p->se.exec_start = rq_clock_task(rq);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
+
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, p);
+
+ if (rq->curr->sched_class != &dl_sched_class)
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+ deadline_queue_push_tasks(rq);
}
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1720,64 +1761,42 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *p;
struct dl_rq *dl_rq;
- dl_rq = &rq->dl;
+ WARN_ON_ONCE(prev || rf);
- if (need_pull_dl_task(rq, prev)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we're
- * being very careful to re-start the picking loop.
- */
- rq_unpin_lock(rq, rf);
- pull_dl_task(rq);
- rq_repin_lock(rq, rf);
- /*
- * pull_dl_task() can drop (and re-acquire) rq->lock; this
- * means a stop task can slip in, in which case we need to
- * re-start task selection.
- */
- if (rq->stop && task_on_rq_queued(rq->stop))
- return RETRY_TASK;
- }
-
- /*
- * When prev is DL, we may throttle it in put_prev_task().
- * So, we update time before we check for dl_nr_running.
- */
- if (prev->sched_class == &dl_sched_class)
- update_curr_dl(rq);
+ dl_rq = &rq->dl;
if (unlikely(!dl_rq->dl_nr_running))
return NULL;
- put_prev_task(rq, prev);
-
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
- set_next_task(rq, p);
-
- if (hrtick_enabled(rq))
- start_hrtick_dl(rq, p);
-
- deadline_queue_push_tasks(rq);
-
- if (rq->curr->sched_class != &dl_sched_class)
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ set_next_task_dl(rq, p);
return p;
}
-static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
+
+ if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ }
}
/*
@@ -1811,11 +1830,6 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-static void set_curr_task_dl(struct rq *rq)
-{
- set_next_task(rq, rq->curr);
-}
-
#ifdef CONFIG_SMP
/* Only try algorithms three times */
@@ -2088,17 +2102,13 @@ retry:
}
deactivate_task(rq, next_task, 0);
- sub_running_bw(&next_task->dl, &rq->dl);
- sub_rq_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
- add_rq_bw(&next_task->dl, &later_rq->dl);
/*
* Update the later_rq clock here, because the clock is used
* by the cpufreq_update_util() inside __add_running_bw().
*/
update_rq_clock(later_rq);
- add_running_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
ret = 1;
@@ -2186,11 +2196,7 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
- sub_running_bw(&p->dl, &src_rq->dl);
- sub_rq_bw(&p->dl, &src_rq->dl);
set_task_cpu(p, this_cpu);
- add_rq_bw(&p->dl, &this_rq->dl);
- add_running_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
@@ -2283,6 +2289,36 @@ void __init init_sched_dl_class(void)
GFP_KERNEL, cpu_to_node(i));
}
+void dl_add_task_root_domain(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+ struct dl_bw *dl_b;
+
+ rq = task_rq_lock(p, &rf);
+ if (!dl_task(p))
+ goto unlock;
+
+ dl_b = &rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+
+ __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+
+ raw_spin_unlock(&dl_b->lock);
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+}
+
+void dl_clear_root_domain(struct root_domain *rd)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+ rd->dl_bw.total_bw = 0;
+ raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+}
+
#endif /* CONFIG_SMP */
static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@ -2403,6 +2439,7 @@ const struct sched_class dl_sched_class = {
.pick_next_task = pick_next_task_dl,
.put_prev_task = put_prev_task_dl,
+ .set_next_task = set_next_task_dl,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
@@ -2413,7 +2450,6 @@ const struct sched_class dl_sched_class = {
.task_woken = task_woken_dl,
#endif
- .set_curr_task = set_curr_task_dl,
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 036be95a87e9..d4bbf68c3161 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu)
}
/*
- * The margin used when comparing utilization with CPU capacity:
- * util * margin < capacity * 1024
+ * The margin used when comparing utilization with CPU capacity.
*
* (default: ~20%)
*/
-static unsigned int capacity_margin = 1280;
+#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
+
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -1086,6 +1086,21 @@ struct numa_group {
unsigned long faults[0];
};
+/*
+ * For functions that can be called in multiple contexts that permit reading
+ * ->numa_group (see struct task_struct for locking rules).
+ */
+static struct numa_group *deref_task_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_check(p->numa_group, p == current ||
+ (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+}
+
+static struct numa_group *deref_curr_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_protected(p->numa_group, p == current);
+}
+
static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);
@@ -1129,10 +1144,12 @@ static unsigned int task_scan_start(struct task_struct *p)
{
unsigned long smin = task_scan_min(p);
unsigned long period = smin;
+ struct numa_group *ng;
/* Scale the maximum scan period with the amount of shared memory. */
- if (p->numa_group) {
- struct numa_group *ng = p->numa_group;
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
@@ -1140,6 +1157,7 @@ static unsigned int task_scan_start(struct task_struct *p)
period *= shared + 1;
period /= private + shared + 1;
}
+ rcu_read_unlock();
return max(smin, period);
}
@@ -1148,13 +1166,14 @@ static unsigned int task_scan_max(struct task_struct *p)
{
unsigned long smin = task_scan_min(p);
unsigned long smax;
+ struct numa_group *ng;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
/* Scale the maximum scan period with the amount of shared memory. */
- if (p->numa_group) {
- struct numa_group *ng = p->numa_group;
+ ng = deref_curr_numa_group(p);
+ if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
unsigned long period = smax;
@@ -1169,47 +1188,6 @@ static unsigned int task_scan_max(struct task_struct *p)
return max(smin, smax);
}
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
-{
- int mm_users = 0;
- struct mm_struct *mm = p->mm;
-
- if (mm) {
- mm_users = atomic_read(&mm->mm_users);
- if (mm_users == 1) {
- mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
- mm->numa_scan_seq = 0;
- }
- }
- p->node_stamp = 0;
- p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
- p->numa_work.next = &p->numa_work;
- p->numa_faults = NULL;
- p->numa_group = NULL;
- p->last_task_numa_placement = 0;
- p->last_sum_exec_runtime = 0;
-
- /* New address space, reset the preferred nid */
- if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = NUMA_NO_NODE;
- return;
- }
-
- /*
- * New thread, keep existing numa_preferred_nid which should be copied
- * already by arch_dup_task_struct but stagger when scans start.
- */
- if (mm) {
- unsigned int delay;
-
- delay = min_t(unsigned int, task_scan_max(current),
- current->numa_scan_period * mm_users * NSEC_PER_MSEC);
- delay += 2 * TICK_NSEC;
- p->node_stamp = delay;
- }
-}
-
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
@@ -1233,7 +1211,16 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
pid_t task_numa_group_id(struct task_struct *p)
{
- return p->numa_group ? p->numa_group->gid : 0;
+ struct numa_group *ng;
+ pid_t gid = 0;
+
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng)
+ gid = ng->gid;
+ rcu_read_unlock();
+
+ return gid;
}
/*
@@ -1258,11 +1245,13 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
static inline unsigned long group_faults(struct task_struct *p, int nid)
{
- if (!p->numa_group)
+ struct numa_group *ng = deref_task_numa_group(p);
+
+ if (!ng)
return 0;
- return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
- p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
+ return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
@@ -1400,12 +1389,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
static inline unsigned long group_weight(struct task_struct *p, int nid,
int dist)
{
+ struct numa_group *ng = deref_task_numa_group(p);
unsigned long faults, total_faults;
- if (!p->numa_group)
+ if (!ng)
return 0;
- total_faults = p->numa_group->total_faults;
+ total_faults = ng->total_faults;
if (!total_faults)
return 0;
@@ -1419,7 +1409,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
- struct numa_group *ng = p->numa_group;
+ struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
@@ -1600,13 +1590,14 @@ static bool load_too_imbalanced(long src_load, long dst_load,
static void task_numa_compare(struct task_numa_env *env,
long taskimp, long groupimp, bool maymove)
{
+ struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ long imp = p_ng ? groupimp : taskimp;
struct task_struct *cur;
long src_load, dst_load;
- long load;
- long imp = env->p->numa_group ? groupimp : taskimp;
- long moveimp = imp;
int dist = env->dist;
+ long moveimp = imp;
+ long load;
if (READ_ONCE(dst_rq->numa_migrate_on))
return;
@@ -1645,21 +1636,22 @@ static void task_numa_compare(struct task_numa_env *env,
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
- if (cur->numa_group == env->p->numa_group) {
+ cur_ng = rcu_dereference(cur->numa_group);
+ if (cur_ng == p_ng) {
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
*/
- if (cur->numa_group)
+ if (cur_ng)
imp -= imp / 16;
} else {
/*
* Compare the group weights. If a task is all by itself
* (not part of a group), use the task weight instead.
*/
- if (cur->numa_group && env->p->numa_group)
+ if (cur_ng && p_ng)
imp += group_weight(cur, env->src_nid, dist) -
group_weight(cur, env->dst_nid, dist);
else
@@ -1757,11 +1749,12 @@ static int task_numa_migrate(struct task_struct *p)
.best_imp = 0,
.best_cpu = -1,
};
+ unsigned long taskweight, groupweight;
struct sched_domain *sd;
+ long taskimp, groupimp;
+ struct numa_group *ng;
struct rq *best_rq;
- unsigned long taskweight, groupweight;
int nid, ret, dist;
- long taskimp, groupimp;
/*
* Pick the lowest SD_NUMA domain, as that would have the smallest
@@ -1807,7 +1800,8 @@ static int task_numa_migrate(struct task_struct *p)
* multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations.
*/
- if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
+ ng = deref_curr_numa_group(p);
+ if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -1840,7 +1834,7 @@ static int task_numa_migrate(struct task_struct *p)
* A task that migrated to a second choice node will be better off
* trying for a better one later. Do not set the preferred node here.
*/
- if (p->numa_group) {
+ if (ng) {
if (env.best_cpu == -1)
nid = env.src_nid;
else
@@ -2135,6 +2129,7 @@ static void task_numa_placement(struct task_struct *p)
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
+ struct numa_group *ng;
/*
* The p->mm->numa_scan_seq field gets updated without
@@ -2152,8 +2147,9 @@ static void task_numa_placement(struct task_struct *p)
runtime = numa_get_avg_runtime(p, &period);
/* If the task is part of a group prevent parallel updates to group stats */
- if (p->numa_group) {
- group_lock = &p->numa_group->lock;
+ ng = deref_curr_numa_group(p);
+ if (ng) {
+ group_lock = &ng->lock;
spin_lock_irq(group_lock);
}
@@ -2194,7 +2190,7 @@ static void task_numa_placement(struct task_struct *p)
p->numa_faults[cpu_idx] += f_diff;
faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
- if (p->numa_group) {
+ if (ng) {
/*
* safe because we can only change our own group
*
@@ -2202,14 +2198,14 @@ static void task_numa_placement(struct task_struct *p)
* nid and priv in a specific region because it
* is at the beginning of the numa_faults array.
*/
- p->numa_group->faults[mem_idx] += diff;
- p->numa_group->faults_cpu[mem_idx] += f_diff;
- p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[mem_idx];
+ ng->faults[mem_idx] += diff;
+ ng->faults_cpu[mem_idx] += f_diff;
+ ng->total_faults += diff;
+ group_faults += ng->faults[mem_idx];
}
}
- if (!p->numa_group) {
+ if (!ng) {
if (faults > max_faults) {
max_faults = faults;
max_nid = nid;
@@ -2220,8 +2216,8 @@ static void task_numa_placement(struct task_struct *p)
}
}
- if (p->numa_group) {
- numa_group_count_active_nodes(p->numa_group);
+ if (ng) {
+ numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_nid);
}
@@ -2255,7 +2251,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
int cpu = cpupid_to_cpu(cpupid);
int i;
- if (unlikely(!p->numa_group)) {
+ if (unlikely(!deref_curr_numa_group(p))) {
unsigned int size = sizeof(struct numa_group) +
4*nr_node_ids*sizeof(unsigned long);
@@ -2291,7 +2287,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!grp)
goto no_join;
- my_grp = p->numa_group;
+ my_grp = deref_curr_numa_group(p);
if (grp == my_grp)
goto no_join;
@@ -2353,13 +2349,24 @@ no_join:
return;
}
-void task_numa_free(struct task_struct *p)
+/*
+ * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
{
- struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults;
+ /* safe: p either is current or is being freed by current */
+ struct numa_group *grp = rcu_dereference_raw(p->numa_group);
+ unsigned long *numa_faults = p->numa_faults;
unsigned long flags;
int i;
+ if (!numa_faults)
+ return;
+
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
@@ -2372,8 +2379,14 @@ void task_numa_free(struct task_struct *p)
put_numa_group(grp);
}
- p->numa_faults = NULL;
- kfree(numa_faults);
+ if (final) {
+ p->numa_faults = NULL;
+ kfree(numa_faults);
+ } else {
+ p->total_numa_faults = 0;
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ numa_faults[i] = 0;
+ }
}
/*
@@ -2426,7 +2439,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
- ng = p->numa_group;
+ ng = deref_curr_numa_group(p);
if (!priv && !local && ng && ng->active_nodes > 1 &&
numa_is_active_node(cpu_node, ng) &&
numa_is_active_node(mem_node, ng))
@@ -2469,7 +2482,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
*/
-void task_numa_work(struct callback_head *work)
+static void task_numa_work(struct callback_head *work)
{
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
@@ -2482,7 +2495,7 @@ void task_numa_work(struct callback_head *work)
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
- work->next = work; /* protect against double add */
+ work->next = work;
/*
* Who cares about NUMA placement when they're dying.
*
@@ -2611,6 +2624,50 @@ out:
}
}
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+ int mm_users = 0;
+ struct mm_struct *mm = p->mm;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1) {
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ mm->numa_scan_seq = 0;
+ }
+ }
+ p->node_stamp = 0;
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ /* Protect against double add, see task_tick_numa and task_numa_work */
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+ RCU_INIT_POINTER(p->numa_group, NULL);
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ init_task_work(&p->numa_work, task_numa_work);
+
+ /* New address space, reset the preferred nid */
+ if (!(clone_flags & CLONE_VM)) {
+ p->numa_preferred_nid = NUMA_NO_NODE;
+ return;
+ }
+
+ /*
+ * New thread, keep existing numa_preferred_nid which should be copied
+ * already by arch_dup_task_struct but stagger when scans start.
+ */
+ if (mm) {
+ unsigned int delay;
+
+ delay = min_t(unsigned int, task_scan_max(current),
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+ delay += 2 * TICK_NSEC;
+ p->node_stamp = delay;
+ }
+}
+
/*
* Drive the periodic memory faults..
*/
@@ -2639,10 +2696,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ if (!time_before(jiffies, curr->mm->numa_next_scan))
task_work_add(curr, work, true);
- }
}
}
@@ -3635,8 +3690,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
-
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
@@ -3753,7 +3806,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
- return capacity * 1024 > task_util_est(p) * capacity_margin;
+ return fits_capacity(task_util_est(p), capacity);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4316,8 +4369,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
now = sched_clock_cpu(smp_processor_id());
cfs_b->runtime = cfs_b->quota;
- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4339,8 +4390,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount, expires;
- int expires_seq;
+ u64 amount = 0, min_amount;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4357,65 +4407,23 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- expires_seq = cfs_b->expires_seq;
- expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
- /*
- * we may have advanced our local expiration to account for allowed
- * spread between our sched_clock and the one on which runtime was
- * issued.
- */
- if (cfs_rq->expires_seq != expires_seq) {
- cfs_rq->expires_seq = expires_seq;
- cfs_rq->runtime_expires = expires;
- }
return cfs_rq->runtime_remaining > 0;
}
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
- /* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
- return;
-
- if (cfs_rq->runtime_remaining < 0)
- return;
-
- /*
- * If the local deadline has passed we have to consider the
- * possibility that our sched_clock is 'fast' and the global deadline
- * has not truly expired.
- *
- * Fortunately we can check determine whether this the case by checking
- * whether the global deadline(cfs_b->expires_seq) has advanced.
- */
- if (cfs_rq->expires_seq == cfs_b->expires_seq) {
- /* extend local deadline, drift is bounded above by 2 ticks */
- cfs_rq->runtime_expires += TICK_NSEC;
- } else {
- /* global deadline is ahead, expiration has passed */
- cfs_rq->runtime_remaining = 0;
- }
-}
-
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
- expire_cfs_rq_runtime(cfs_rq);
if (likely(cfs_rq->runtime_remaining > 0))
return;
+ if (cfs_rq->throttled)
+ return;
/*
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
@@ -4500,7 +4508,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, dequeue = 1;
+ long task_delta, idle_task_delta, dequeue = 1;
bool empty;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4511,6 +4519,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rcu_read_unlock();
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
@@ -4520,6 +4529,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
if (qcfs_rq->load.weight)
dequeue = 0;
@@ -4559,7 +4569,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
int enqueue = 1;
- long task_delta;
+ long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -4579,6 +4589,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
return;
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
if (se->on_rq)
enqueue = 0;
@@ -4587,6 +4598,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (enqueue)
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
cfs_rq->h_nr_running += task_delta;
+ cfs_rq->idle_h_nr_running += idle_task_delta;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4602,8 +4614,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
resched_curr(rq);
}
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
- u64 remaining, u64 expires)
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
{
struct cfs_rq *cfs_rq;
u64 runtime;
@@ -4619,13 +4630,15 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
if (!cfs_rq_throttled(cfs_rq))
goto next;
+ /* By the above check, this should never be true */
+ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+
runtime = -cfs_rq->runtime_remaining + 1;
if (runtime > remaining)
runtime = remaining;
remaining -= runtime;
cfs_rq->runtime_remaining += runtime;
- cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
@@ -4650,7 +4663,7 @@ next:
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
- u64 runtime, runtime_expires;
+ u64 runtime;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -4678,8 +4691,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- runtime_expires = cfs_b->runtime_expires;
-
/*
* This check is repeated as we are holding onto the new bandwidth while
* we unthrottle. This can potentially race with an unthrottled group
@@ -4692,8 +4703,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
cfs_b->distribute_running = 1;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- runtime = distribute_cfs_runtime(cfs_b, runtime,
- runtime_expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
@@ -4775,8 +4785,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
return;
raw_spin_lock(&cfs_b->lock);
- if (cfs_b->quota != RUNTIME_INF &&
- cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */
@@ -4809,7 +4818,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
unsigned long flags;
- u64 expires;
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@ -4827,7 +4835,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- expires = cfs_b->runtime_expires;
if (runtime)
cfs_b->distribute_running = 1;
@@ -4836,11 +4843,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- if (expires == cfs_b->runtime_expires)
- lsub_positive(&cfs_b->runtime, runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
@@ -4997,8 +5003,6 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->period_active = 1;
overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
- cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
@@ -5176,7 +5180,7 @@ static inline unsigned long cpu_util(int cpu);
static inline bool cpu_overutilized(int cpu)
{
- return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+ return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
}
static inline void update_overutilized_status(struct rq *rq)
@@ -5200,6 +5204,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ int idle_h_nr_running = task_has_idle_policy(p);
/*
* The code below (indirectly) updates schedutil which looks at
@@ -5232,6 +5237,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
flags = ENQUEUE_WAKEUP;
}
@@ -5239,6 +5245,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5300,6 +5307,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+ int idle_h_nr_running = task_has_idle_policy(p);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5314,6 +5322,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -5333,6 +5342,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5366,6 +5376,15 @@ static struct {
#endif /* CONFIG_NO_HZ_COMMON */
+/* CPU only has SCHED_IDLE tasks enqueued */
+static int sched_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ rq->nr_running);
+}
+
static unsigned long cpu_runnable_load(struct rq *rq)
{
return cfs_rq_runnable_load_avg(&rq->cfs);
@@ -5688,7 +5707,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
- int shallowest_idle_cpu = -1;
+ int shallowest_idle_cpu = -1, si_cpu = -1;
int i;
/* Check if we have any choice: */
@@ -5719,7 +5738,12 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else if (shallowest_idle_cpu == -1) {
+ } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
+ if (sched_idle_cpu(i)) {
+ si_cpu = i;
+ continue;
+ }
+
load = cpu_runnable_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
@@ -5728,7 +5752,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
}
}
- return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+ if (shallowest_idle_cpu != -1)
+ return shallowest_idle_cpu;
+ if (si_cpu != -1)
+ return si_cpu;
+ return least_loaded_cpu;
}
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5881,7 +5909,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
*/
static int select_idle_smt(struct task_struct *p, int target)
{
- int cpu;
+ int cpu, si_cpu = -1;
if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5891,9 +5919,11 @@ static int select_idle_smt(struct task_struct *p, int target)
continue;
if (available_idle_cpu(cpu))
return cpu;
+ if (si_cpu == -1 && sched_idle_cpu(cpu))
+ si_cpu = cpu;
}
- return -1;
+ return si_cpu;
}
#else /* CONFIG_SCHED_SMT */
@@ -5921,8 +5951,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
- int cpu, nr = INT_MAX;
int this = smp_processor_id();
+ int cpu, nr = INT_MAX, si_cpu = -1;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5950,11 +5980,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
if (!--nr)
- return -1;
+ return si_cpu;
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
break;
+ if (si_cpu == -1 && sched_idle_cpu(cpu))
+ si_cpu = cpu;
}
time = cpu_clock(this) - time;
@@ -5973,13 +6005,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
- if (available_idle_cpu(target))
+ if (available_idle_cpu(target) || sched_idle_cpu(target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
- if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
+ if (prev != target && cpus_share_cache(prev, target) &&
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)))
return prev;
/* Check a recently used CPU as a potential idle candidate: */
@@ -5987,7 +6020,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- available_idle_cpu(recent_used_cpu) &&
+ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
/*
* Replace recent_used_cpu with prev as it is a potential
@@ -6223,69 +6256,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
}
/*
- * compute_energy(): Estimates the energy that would be consumed if @p was
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
* to compute what would be the energy if we decided to actually migrate that
* task.
*/
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
- unsigned int max_util, util_cfs, cpu_util, cpu_cap;
- unsigned long sum_util, energy = 0;
- struct task_struct *tsk;
+ struct cpumask *pd_mask = perf_domain_span(pd);
+ unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+ unsigned long max_util = 0, sum_util = 0;
int cpu;
- for (; pd; pd = pd->next) {
- struct cpumask *pd_mask = perf_domain_span(pd);
+ /*
+ * The capacity state of CPUs of the current rd can be driven by CPUs
+ * of another rd if they belong to the same pd. So, account for the
+ * utilization of these CPUs too by masking pd with cpu_online_mask
+ * instead of the rd span.
+ *
+ * If an entire pd is outside of the current rd, it will not appear in
+ * its pd list and will not be accounted by compute_energy().
+ */
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+ unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
+ struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
/*
- * The energy model mandates all the CPUs of a performance
- * domain have the same capacity.
+ * Busy time computation: utilization clamping is not
+ * required since the ratio (sum_util / cpu_capacity)
+ * is already enough to scale the EM reported power
+ * consumption at the (eventually clamped) cpu_capacity.
*/
- cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
- max_util = sum_util = 0;
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ ENERGY_UTIL, NULL);
/*
- * The capacity state of CPUs of the current rd can be driven by
- * CPUs of another rd if they belong to the same performance
- * domain. So, account for the utilization of these CPUs too
- * by masking pd with cpu_online_mask instead of the rd span.
- *
- * If an entire performance domain is outside of the current rd,
- * it will not appear in its pd list and will not be accounted
- * by compute_energy().
+ * Performance domain frequency: utilization clamping
+ * must be considered since it affects the selection
+ * of the performance domain frequency.
+ * NOTE: in case RT tasks are running, by default the
+ * FREQUENCY_UTIL's utilization can be max OPP.
*/
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- util_cfs = cpu_util_next(cpu, p, dst_cpu);
-
- /*
- * Busy time computation: utilization clamping is not
- * required since the ratio (sum_util / cpu_capacity)
- * is already enough to scale the EM reported power
- * consumption at the (eventually clamped) cpu_capacity.
- */
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- ENERGY_UTIL, NULL);
-
- /*
- * Performance domain frequency: utilization clamping
- * must be considered since it affects the selection
- * of the performance domain frequency.
- * NOTE: in case RT tasks are running, by default the
- * FREQUENCY_UTIL's utilization can be max OPP.
- */
- tsk = cpu == dst_cpu ? p : NULL;
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
- }
-
- energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, cpu_util);
}
- return energy;
+ return em_pd_energy(pd->em_pd, max_util, sum_util);
}
/*
@@ -6327,21 +6346,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* other use-cases too. So, until someone finds a better way to solve this,
* let's keep things simple by re-using the existing slow path.
*/
-
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
- unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+ unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ unsigned long cpu_cap, util, base_energy = 0;
int cpu, best_energy_cpu = prev_cpu;
- struct perf_domain *head, *pd;
- unsigned long cpu_cap, util;
struct sched_domain *sd;
+ struct perf_domain *pd;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
if (!pd || READ_ONCE(rd->overutilized))
goto fail;
- head = pd;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6358,9 +6375,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
goto unlock;
for (; pd; pd = pd->next) {
- unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+ unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
+ /* Compute the 'base' energy of the pd, without @p */
+ base_energy_pd = compute_energy(p, -1, pd);
+ base_energy += base_energy_pd;
+
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
@@ -6368,14 +6390,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
/* Skip CPUs that will be overutilized. */
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
- if (cpu_cap * 1024 < util * capacity_margin)
+ if (!fits_capacity(util, cpu_cap))
continue;
/* Always use prev_cpu as a candidate. */
if (cpu == prev_cpu) {
- prev_energy = compute_energy(p, prev_cpu, head);
- best_energy = min(best_energy, prev_energy);
- continue;
+ prev_delta = compute_energy(p, prev_cpu, pd);
+ prev_delta -= base_energy_pd;
+ best_delta = min(best_delta, prev_delta);
}
/*
@@ -6391,9 +6413,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
/* Evaluate the energy impact of using this CPU. */
if (max_spare_cap_cpu >= 0) {
- cur_energy = compute_energy(p, max_spare_cap_cpu, head);
- if (cur_energy < best_energy) {
- best_energy = cur_energy;
+ cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+ cur_delta -= base_energy_pd;
+ if (cur_delta < best_delta) {
+ best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
}
}
@@ -6405,10 +6428,10 @@ unlock:
* Pick the best CPU if prev_cpu cannot be used, or if it saves at
* least 6% of the energy used by prev_cpu.
*/
- if (prev_energy == ULONG_MAX)
+ if (prev_delta == ULONG_MAX)
return best_energy_cpu;
- if ((prev_energy - best_energy) > (prev_energy >> 4))
+ if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
return best_energy_cpu;
return prev_cpu;
@@ -6742,7 +6765,7 @@ again:
goto idle;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (prev->sched_class != &fair_sched_class)
+ if (!prev || prev->sched_class != &fair_sched_class)
goto simple;
/*
@@ -6819,8 +6842,8 @@ again:
goto done;
simple:
#endif
-
- put_prev_task(rq, prev);
+ if (prev)
+ put_prev_task(rq, prev);
do {
se = pick_next_entity(cfs_rq, NULL);
@@ -6848,11 +6871,13 @@ done: __maybe_unused;
return p;
idle:
- update_misfit_status(NULL, rq);
- new_tasks = idle_balance(rq, rf);
+ if (!rf)
+ return NULL;
+
+ new_tasks = newidle_balance(rq, rf);
/*
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * Because newidle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
@@ -6874,7 +6899,7 @@ idle:
/*
* Account for a descheduled task:
*/
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
@@ -7376,7 +7401,7 @@ static int detach_tasks(struct lb_env *env)
detached++;
env->imbalance -= load;
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is detached to minimize
@@ -7923,8 +7948,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
static inline bool
group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
- return sg->sgc->min_capacity * capacity_margin <
- ref->sgc->min_capacity * 1024;
+ return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
}
/*
@@ -7934,8 +7958,7 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
static inline bool
group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
- return sg->sgc->max_capacity * capacity_margin <
- ref->sgc->max_capacity * 1024;
+ return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
}
static inline enum
@@ -8993,9 +9016,10 @@ more_balance:
out_balanced:
/*
* We reach balance although we may have faced some affinity
- * constraints. Clear the imbalance flag if it was set.
+ * constraints. Clear the imbalance flag only if other tasks got
+ * a chance to move and fix the imbalance.
*/
- if (sd_parent) {
+ if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
@@ -9016,10 +9040,10 @@ out_one_pinned:
ld_moved = 0;
/*
- * idle_balance() disregards balance intervals, so we could repeatedly
- * reach this code, which would lead to balance_interval skyrocketting
- * in a short amount of time. Skip the balance_interval increase logic
- * to avoid that.
+ * newidle_balance() disregards balance intervals, so we could
+ * repeatedly reach this code, which would lead to balance_interval
+ * skyrocketting in a short amount of time. Skip the balance_interval
+ * increase logic to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
goto out;
@@ -9729,7 +9753,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -9737,6 +9761,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
int pulled_task = 0;
u64 curr_cost = 0;
+ update_misfit_status(NULL, this_rq);
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -10121,9 +10146,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
-static void set_curr_task_fair(struct rq *rq)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p)
{
- struct sched_entity *se = &rq->curr->se;
+ struct sched_entity *se = &p->se;
+
+#ifdef CONFIG_SMP
+ if (task_on_rq_queued(p)) {
+ /*
+ * Move the next running task to the front of the list, so our
+ * cfs_tasks list becomes MRU one.
+ */
+ list_move(&se->group_node, &rq->cfs_tasks);
+ }
+#endif
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -10241,18 +10276,18 @@ err:
void online_fair_sched_group(struct task_group *tg)
{
struct sched_entity *se;
+ struct rq_flags rf;
struct rq *rq;
int i;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
se = tg->se[i];
-
- raw_spin_lock_irq(&rq->lock);
+ rq_lock_irq(rq, &rf);
update_rq_clock(rq);
attach_entity_cfs_rq(se);
sync_throttle(tg, i);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
}
}
@@ -10394,7 +10429,9 @@ const struct sched_class fair_sched_class = {
.check_preempt_curr = check_preempt_wakeup,
.pick_next_task = pick_next_task_fair,
+
.put_prev_task = put_prev_task_fair,
+ .set_next_task = set_next_task_fair,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
@@ -10407,7 +10444,6 @@ const struct sched_class fair_sched_class = {
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
@@ -10444,18 +10480,22 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
{
int node;
unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
+ struct numa_group *ng;
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
for_each_online_node(node) {
if (p->numa_faults) {
tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
}
- if (p->numa_group) {
- gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
- gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ if (ng) {
+ gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
+ gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
}
print_numa_stats(m, node, tsf, tpf, gsf, gpf);
}
+ rcu_read_unlock();
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80940939b733..8bfeb6395bdd 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -241,13 +241,14 @@ static void do_idle(void)
check_pgt_cache();
rmb();
+ local_irq_disable();
+
if (cpu_is_offline(cpu)) {
- tick_nohz_idle_stop_tick_protected();
+ tick_nohz_idle_stop_tick();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
- local_irq_disable();
arch_cpu_idle_enter();
/*
@@ -374,14 +375,27 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_curr(rq);
}
-static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+}
+
+static void set_next_task_idle(struct rq *rq, struct task_struct *next)
{
- put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
+}
+
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *next = rq->idle;
- return rq->idle;
+ if (prev)
+ put_prev_task(rq, prev);
+
+ set_next_task_idle(rq, next);
+
+ return next;
}
/*
@@ -397,10 +411,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
raw_spin_lock_irq(&rq->lock);
}
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
-{
-}
-
/*
* scheduler tick hitting a task of our scheduling class.
*
@@ -413,10 +423,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void set_curr_task_idle(struct rq *rq)
-{
-}
-
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
@@ -451,13 +457,13 @@ const struct sched_class idle_sched_class = {
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
+ .set_next_task = set_next_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index ccb28085b114..9fcb2a695a41 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -22,9 +22,17 @@ EXPORT_SYMBOL_GPL(housekeeping_enabled);
int housekeeping_any_cpu(enum hk_flags flags)
{
- if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
+ int cpu;
+
+ if (static_branch_unlikely(&housekeeping_overridden)) {
+ if (housekeeping_flags & flags) {
+ cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+ }
+ }
return smp_processor_id();
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7acc632c3b82..517e3719027e 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1051,7 +1051,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
if (!rcu_access_pointer(group->poll_kworker)) {
struct sched_param param = {
- .sched_priority = MAX_RT_PRIO - 1,
+ .sched_priority = 1,
};
struct kthread_worker *kworker;
@@ -1061,7 +1061,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
mutex_unlock(&group->trigger_lock);
return ERR_CAST(kworker);
}
- sched_setscheduler(kworker->task, SCHED_FIFO, &param);
+ sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
kthread_init_delayed_work(&group->poll_work,
psi_poll_work);
rcu_assign_pointer(group->poll_kworker, kworker);
@@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref)
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
if (kworker_to_destroy) {
+ /*
+ * After the RCU grace period has expired, the worker
+ * can no longer be found through group->poll_kworker.
+ * But it might have been already scheduled before
+ * that - deschedule it cleanly before destroying it.
+ */
kthread_cancel_delayed_work_sync(&group->poll_work);
+ atomic_set(&group->poll_scheduled, 0);
+
kthread_destroy_worker(kworker_to_destroy);
}
kfree(t);
@@ -1190,7 +1198,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
- buf_size = min(nbytes, (sizeof(buf) - 1));
+ buf_size = min(nbytes, sizeof(buf));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a532558a5176..858c4cc6f99b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1498,12 +1498,22 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
{
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
+
+ /*
+ * If prev task was rt, put_prev_task() has already updated the
+ * utilization. We only care of the case where we start to schedule a
+ * rt task
+ */
+ if (rq->curr->sched_class != &rt_sched_class)
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+ rt_queue_push_tasks(rq);
}
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -1543,56 +1553,19 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
- if (need_pull_rt_task(rq, prev)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we're
- * being very careful to re-start the picking loop.
- */
- rq_unpin_lock(rq, rf);
- pull_rt_task(rq);
- rq_repin_lock(rq, rf);
- /*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a dl or stop task can slip in, in which case we need
- * to re-start task selection.
- */
- if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
- rq->dl.dl_nr_running))
- return RETRY_TASK;
- }
-
- /*
- * We may dequeue prev's rt_rq in put_prev_task().
- * So, we update time before rt_queued check.
- */
- if (prev->sched_class == &rt_sched_class)
- update_curr_rt(rq);
+ WARN_ON_ONCE(prev || rf);
if (!rt_rq->rt_queued)
return NULL;
- put_prev_task(rq, prev);
-
p = _pick_next_task_rt(rq);
- set_next_task(rq, p);
-
- rt_queue_push_tasks(rq);
-
- /*
- * If prev task was rt, put_prev_task() has already updated the
- * utilization. We only care of the case where we start to schedule a
- * rt task
- */
- if (rq->curr->sched_class != &rt_sched_class)
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ set_next_task_rt(rq, p);
return p;
}
-static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
update_curr_rt(rq);
@@ -1604,6 +1577,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
*/
if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_rt_task(rq);
+ rq_repin_lock(rq, rf);
+ }
}
#ifdef CONFIG_SMP
@@ -2354,11 +2339,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
}
}
-static void set_curr_task_rt(struct rq *rq)
-{
- set_next_task(rq, rq->curr);
-}
-
static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
/*
@@ -2380,6 +2360,7 @@ const struct sched_class rt_sched_class = {
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
+ .set_next_task = set_next_task_rt,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt,
@@ -2391,7 +2372,6 @@ const struct sched_class rt_sched_class = {
.switched_from = switched_from_rt,
#endif
- .set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
.get_rr_interval = get_rr_interval_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 802b1f3405f2..b3cb895d14a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -335,8 +335,6 @@ struct cfs_bandwidth {
u64 quota;
u64 runtime;
s64 hierarchical_quota;
- u64 runtime_expires;
- int expires_seq;
u8 idle;
u8 period_active;
@@ -393,6 +391,16 @@ struct task_group {
#endif
struct cfs_bandwidth cfs_bandwidth;
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ /* The two decimal precision [%] value requested from user-space */
+ unsigned int uclamp_pct[UCLAMP_CNT];
+ /* Clamp values requested for a task group */
+ struct uclamp_se uclamp_req[UCLAMP_CNT];
+ /* Effective clamp values used for a task group */
+ struct uclamp_se uclamp[UCLAMP_CNT];
+#endif
+
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -483,7 +491,8 @@ struct cfs_rq {
struct load_weight load;
unsigned long runnable_weight;
unsigned int nr_running;
- unsigned int h_nr_running;
+ unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int idle_h_nr_running; /* SCHED_IDLE */
u64 exec_clock;
u64 min_vruntime;
@@ -556,8 +565,6 @@ struct cfs_rq {
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
- int expires_seq;
- u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock;
@@ -777,9 +784,6 @@ struct root_domain {
struct perf_domain __rcu *pd;
};
-extern struct root_domain def_root_domain;
-extern struct mutex sched_domains_mutex;
-
extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
@@ -1261,16 +1265,18 @@ enum numa_topology_type {
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
-#endif
-
-#ifdef CONFIG_NUMA
extern void sched_init_numa(void);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
#else
static inline void sched_init_numa(void) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ return nr_cpu_ids;
+}
#endif
#ifdef CONFIG_NUMA_BALANCING
@@ -1449,10 +1455,14 @@ static inline void unregister_sched_domain_sysctl(void)
}
#endif
+extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
#else
static inline void sched_ttwu_pending(void) { }
+static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -1700,17 +1710,21 @@ struct sched_class {
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
/*
- * It is the responsibility of the pick_next_task() method that will
- * return the next task to call put_prev_task() on the @prev task or
- * something equivalent.
+ * Both @prev and @rf are optional and may be NULL, in which case the
+ * caller must already have invoked put_prev_task(rq, prev, rf).
+ *
+ * Otherwise it is the responsibility of the pick_next_task() to call
+ * put_prev_task() on the @prev task or something equivalent, IFF it
+ * returns a next task.
*
- * May return RETRY_TASK when it finds a higher prio class has runnable
- * tasks.
+ * In that case (@rf != NULL) it may return RETRY_TASK when it finds a
+ * higher prio class has runnable tasks.
*/
struct task_struct * (*pick_next_task)(struct rq *rq,
struct task_struct *prev,
struct rq_flags *rf);
- void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf);
+ void (*set_next_task)(struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
@@ -1725,7 +1739,6 @@ struct sched_class {
void (*rq_offline)(struct rq *rq);
#endif
- void (*set_curr_task)(struct rq *rq);
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
void (*task_fork)(struct task_struct *p);
void (*task_dead)(struct task_struct *p);
@@ -1755,12 +1768,14 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- prev->sched_class->put_prev_task(rq, prev);
+ WARN_ON_ONCE(rq->curr != prev);
+ prev->sched_class->put_prev_task(rq, prev, NULL);
}
-static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
+static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- curr->sched_class->set_curr_task(rq);
+ WARN_ON_ONCE(rq->curr != next);
+ next->sched_class->set_next_task(rq, next);
}
#ifdef CONFIG_SMP
@@ -1943,7 +1958,7 @@ unsigned long arch_scale_freq_capacity(int cpu)
#endif
#ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1995,7 +2010,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
return ret;
}
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
@@ -2266,7 +2281,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static __always_inline
unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index aa0de240fb41..ba683fe81a6e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -157,9 +157,10 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
unsigned long long now = rq_clock(rq), delta = 0;
- if (unlikely(sched_info_on()))
+ if (sched_info_on()) {
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
+ }
sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
@@ -192,7 +193,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
*/
static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
- if (unlikely(sched_info_on())) {
+ if (sched_info_on()) {
if (!t->sched_info.last_queued)
t->sched_info.last_queued = rq_clock(rq);
}
@@ -239,7 +240,7 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
static inline void
sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
- if (unlikely(sched_info_on()))
+ if (sched_info_on())
__sched_info_switch(rq, prev, next);
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index c183b790ca54..7e1cee4e65b2 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,17 +23,22 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
+static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
+{
+ stop->se.exec_start = rq_clock_task(rq);
+}
+
static struct task_struct *
pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *stop = rq->stop;
+ WARN_ON_ONCE(prev || rf);
+
if (!stop || !task_on_rq_queued(stop))
return NULL;
- put_prev_task(rq, prev);
-
- stop->se.exec_start = rq_clock_task(rq);
+ set_next_task_stop(rq, stop);
return stop;
}
@@ -55,7 +60,7 @@ static void yield_task_stop(struct rq *rq)
BUG(); /* the stop task should never yield, its pointless. */
}
-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *curr = rq->curr;
u64 delta_exec;
@@ -86,13 +91,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void set_curr_task_stop(struct rq *rq)
-{
- struct task_struct *stop = rq->stop;
-
- stop->se.exec_start = rq_clock_task(rq);
-}
-
static void switched_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
@@ -128,13 +126,13 @@ const struct sched_class stop_sched_class = {
.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,
+ .set_next_task = set_next_task_stop,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_stop,
.task_tick = task_tick_stop,
.get_rr_interval = get_rr_interval_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f751ce0b783e..b5667a273bf6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1284,6 +1284,7 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
#endif
/*
@@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
- if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
sd->flags &= ~(SD_BALANCE_EXEC |
SD_BALANCE_FORK |
SD_WAKE_AFFINE);
@@ -1724,6 +1725,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
}
}
+/*
+ * sched_numa_find_closest() - given the NUMA topology, find the cpu
+ * closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: cpu to be close to
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ int i, j = cpu_to_node(cpu);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ }
+ return nr_cpu_ids;
+}
+
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -2149,16 +2170,16 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
* ndoms_new == 0 is a special case for destroying existing domains,
* and it will not create the default domain.
*
- * Call with hotplug lock held
+ * Call with hotplug lock and sched_domains_mutex held
*/
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
+void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
{
bool __maybe_unused has_eas = false;
int i, j, n;
int new_topology;
- mutex_lock(&sched_domains_mutex);
+ lockdep_assert_held(&sched_domains_mutex);
/* Always unregister in case we don't destroy any domains: */
unregister_sched_domain_sysctl();
@@ -2183,8 +2204,19 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_cur[i], doms_new[j]) &&
- dattrs_equal(dattr_cur, i, dattr_new, j))
+ dattrs_equal(dattr_cur, i, dattr_new, j)) {
+ struct root_domain *rd;
+
+ /*
+ * This domain won't be destroyed and as such
+ * its dl_bw->total_bw needs to be cleared. It
+ * will be recomputed in function
+ * update_tasks_root_domain().
+ */
+ rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
+ dl_clear_root_domain(rd);
goto match1;
+ }
}
/* No match - a current sched domain not in new doms_new[] */
detach_destroy_domains(doms_cur[i]);
@@ -2241,6 +2273,15 @@ match3:
ndoms_cur = ndoms_new;
register_sched_domain_sysctl();
+}
+/*
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ mutex_lock(&sched_domains_mutex);
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
mutex_unlock(&sched_domains_mutex);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 91b789dd6e72..c4da1ef56fdf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -90,6 +90,11 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
handler == SIG_DFL && !(force && sig_kernel_only(sig)))
return true;
+ /* Only allow kernel generated signals to this kthread */
+ if (unlikely((t->flags & PF_KTHREAD) &&
+ (handler == SIG_KTHREAD_KERNEL) && !force))
+ return true;
+
return sig_handler_ignored(handler, sig);
}
@@ -349,7 +354,7 @@ void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
* @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
* Group stop states are cleared and the group stop count is consumed if
* %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
- * stop, the appropriate %SIGNAL_* flags are set.
+ * stop, the appropriate `SIGNAL_*` flags are set.
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
@@ -1885,6 +1890,7 @@ static void do_notify_pidfd(struct task_struct *task)
{
struct pid *pid;
+ WARN_ON(task->exit_state == 0);
pid = task_pid(task);
wake_up_all(&pid->wait_pidfd);
}
@@ -3672,8 +3678,11 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
static struct pid *pidfd_to_pid(const struct file *file)
{
- if (file->f_op == &pidfd_fops)
- return file->private_data;
+ struct pid *pid;
+
+ pid = pidfd_pid(file);
+ if (!IS_ERR(pid))
+ return pid;
return tgid_pidfd_to_pid(file);
}
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index f5440abb7532..6d1f68b7e528 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -20,7 +20,7 @@
* @nr_entries: Number of entries in the storage array
* @spaces: Number of leading spaces to print
*/
-void stack_trace_print(unsigned long *entries, unsigned int nr_entries,
+void stack_trace_print(const unsigned long *entries, unsigned int nr_entries,
int spaces)
{
unsigned int i;
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(stack_trace_print);
*
* Return: Number of bytes printed.
*/
-int stack_trace_snprint(char *buf, size_t size, unsigned long *entries,
+int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries,
unsigned int nr_entries, int spaces)
{
unsigned int generated, i, total = 0;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b4f83f7bdf86..c7031a22aa7b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -383,6 +383,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
*/
preempt_disable();
stop_cpus_in_progress = true;
+ barrier();
for_each_cpu(cpu, cpumask) {
work = &per_cpu(cpu_stopper.stop_work, cpu);
work->fn = fn;
@@ -391,6 +392,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
if (cpu_stop_queue_work(cpu, work))
queued = true;
}
+ barrier();
stop_cpus_in_progress = false;
preempt_enable();
diff --git a/kernel/sys.c b/kernel/sys.c
index 2969304c29fe..d605fe5e58a5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -103,12 +103,6 @@
#ifndef SET_TSC_CTL
# define SET_TSC_CTL(a) (-EINVAL)
#endif
-#ifndef MPX_ENABLE_MANAGEMENT
-# define MPX_ENABLE_MANAGEMENT() (-EINVAL)
-#endif
-#ifndef MPX_DISABLE_MANAGEMENT
-# define MPX_DISABLE_MANAGEMENT() (-EINVAL)
-#endif
#ifndef GET_FP_MODE
# define GET_FP_MODE(a) (-EINVAL)
#endif
@@ -124,6 +118,12 @@
#ifndef PAC_RESET_KEYS
# define PAC_RESET_KEYS(a, b) (-EINVAL)
#endif
+#ifndef SET_TAGGED_ADDR_CTRL
+# define SET_TAGGED_ADDR_CTRL(a) (-EINVAL)
+#endif
+#ifndef GET_TAGGED_ADDR_CTRL
+# define GET_TAGGED_ADDR_CTRL() (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -2456,15 +2456,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
up_write(&me->mm->mmap_sem);
break;
case PR_MPX_ENABLE_MANAGEMENT:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = MPX_ENABLE_MANAGEMENT();
- break;
case PR_MPX_DISABLE_MANAGEMENT:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = MPX_DISABLE_MANAGEMENT();
- break;
+ /* No longer implemented: */
+ return -EINVAL;
case PR_SET_FP_MODE:
error = SET_FP_MODE(me, arg2);
break;
@@ -2492,6 +2486,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = PAC_RESET_KEYS(me, arg2);
break;
+ case PR_SET_TAGGED_ADDR_CTRL:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = SET_TAGGED_ADDR_CTRL(arg2);
+ break;
+ case PR_GET_TAGGED_ADDR_CTRL:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = GET_TAGGED_ADDR_CTRL();
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 57518efc3810..b7d75a9e8ccf 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -672,7 +672,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
enum alarmtimer_type type;
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
@@ -790,7 +790,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
int ret = 0;
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (flags & ~TIMER_ABSTIME)
return -EINVAL;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d911c8470149..ca69290bee2a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -146,6 +146,11 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
tk->offs_boot = ktime_add(tk->offs_boot, delta);
+ /*
+ * Timespec representation for VDSO update to avoid 64bit division
+ * on every update.
+ */
+ tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
}
/*
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 8cf3596a4ce6..4bc37ac3bb05 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -17,7 +17,7 @@ static inline void update_vdso_data(struct vdso_data *vdata,
struct timekeeper *tk)
{
struct vdso_timestamp *vdso_ts;
- u64 nsec;
+ u64 nsec, sec;
vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
@@ -45,23 +45,27 @@ static inline void update_vdso_data(struct vdso_data *vdata,
}
vdso_ts->nsec = nsec;
- /* CLOCK_MONOTONIC_RAW */
- vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
- vdso_ts->sec = tk->raw_sec;
- vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
+ /* Copy MONOTONIC time for BOOTTIME */
+ sec = vdso_ts->sec;
+ /* Add the boot offset */
+ sec += tk->monotonic_to_boot.tv_sec;
+ nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift;
/* CLOCK_BOOTTIME */
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
- vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsec = tk->tkr_mono.xtime_nsec;
- nsec += ((u64)(tk->wall_to_monotonic.tv_nsec +
- ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift);
+ vdso_ts->sec = sec;
+
while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
vdso_ts->sec++;
}
vdso_ts->nsec = nsec;
+ /* CLOCK_MONOTONIC_RAW */
+ vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
+ vdso_ts->sec = tk->raw_sec;
+ vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
+
/* CLOCK_TAI */
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
diff --git a/kernel/torture.c b/kernel/torture.c
index a8d9bdfba7c3..7c13f5558b71 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -263,7 +263,6 @@ static void torture_onoff_cleanup(void)
onoff_task = NULL;
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
}
-EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
/*
* Print online/offline testing statistics.
@@ -449,7 +448,6 @@ static void torture_shuffle_cleanup(void)
}
shuffler_task = NULL;
}
-EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
/*
* Variables for auto-shutdown. This allows "lights out" torture runs
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 98da8998c25c..6a64d7772870 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -146,7 +146,7 @@ config FUNCTION_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select GLOB
- select TASKS_RCU if PREEMPT
+ select TASKS_RCU if PREEMPTION
help
Enable the kernel to trace every kernel function. This is done
by using a compiler feature to insert a small, 5-byte No-Operation
@@ -179,7 +179,7 @@ config TRACE_PREEMPT_TOGGLE
config PREEMPTIRQ_EVENTS
bool "Enable trace events for preempt and irq disable/enable"
select TRACE_IRQFLAGS
- select TRACE_PREEMPT_TOGGLE if PREEMPT
+ select TRACE_PREEMPT_TOGGLE if PREEMPTION
select GENERIC_TRACER
default n
help
@@ -214,7 +214,7 @@ config PREEMPT_TRACER
bool "Preemption-off Latency Tracer"
default n
depends on !ARCH_USES_GETTIMEOFFSET
- depends on PREEMPT
+ depends on PREEMPTION
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eca34503f178..356b848c697a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2814,7 +2814,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
* synchornize_rcu_tasks() will wait for those tasks to
* execute and either schedule voluntarily or enter user space.
*/
- if (IS_ENABLED(CONFIG_PREEMPT))
+ if (IS_ENABLED(CONFIG_PREEMPTION))
synchronize_rcu_tasks();
free_ops:
@@ -3095,6 +3095,14 @@ t_probe_next(struct seq_file *m, loff_t *pos)
hnd = &iter->probe_entry->hlist;
hash = iter->probe->ops.func_hash->filter_hash;
+
+ /*
+ * A probe being registered may temporarily have an empty hash
+ * and it's at the end of the func_probes list.
+ */
+ if (!hash || hash == EMPTY_HASH)
+ return NULL;
+
size = 1 << hash->size_bits;
retry:
@@ -4320,12 +4328,21 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
mutex_unlock(&ftrace_lock);
+ /*
+ * Note, there's a small window here that the func_hash->filter_hash
+ * may be NULL or empty. Need to be carefule when reading the loop.
+ */
mutex_lock(&probe->ops.func_hash->regex_lock);
orig_hash = &probe->ops.func_hash->filter_hash;
old_hash = *orig_hash;
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
+ if (!hash) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
ret = ftrace_match_records(hash, glob, strlen(glob));
/* Nothing found? */
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 0515a2096f90..0456e0a3dab1 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -6,22 +6,22 @@
/*
* Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw_notrace() is that elements removed from this list
+ * can use rcu_dereference_raw_check() is that elements removed from this list
* are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
+ * mechanism. The rcu_dereference_raw_check() calls are needed to handle
* concurrent insertions into the ftrace_global_list.
*
* Silly Alpha and silly pointer-speculation compiler optimizations!
*/
#define do_for_each_ftrace_op(op, list) \
- op = rcu_dereference_raw_notrace(list); \
+ op = rcu_dereference_raw_check(list); \
do
/*
* Optimized for just a single item in the list (as that is the normal case).
*/
#define while_for_each_ftrace_op(op) \
- while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
+ while (likely(op = rcu_dereference_raw_check((op)->next)) && \
unlikely((op) != &ftrace_list_end))
extern struct ftrace_ops __rcu *ftrace_ops_list;
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 0564f6db0561..09b0b49f346e 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -267,7 +267,7 @@ static void ring_buffer_producer(void)
if (consumer && !(cnt % wakeup_interval))
wake_up_process(consumer);
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
/*
* If we are a non preempt kernel, the 10 second run will
* stop everything while it runs. Instead, we will call
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 525a97fbbc60..947ba433865f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1567,9 +1567,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
/**
* update_max_tr_single - only copy one trace over, and reset the rest
- * @tr - tracer
- * @tsk - task with the latency
- * @cpu - the cpu of the buffer to copy.
+ * @tr: tracer
+ * @tsk: task with the latency
+ * @cpu: the cpu of the buffer to copy.
*
* Flip the trace of a single CPU buffer between the @tr and the max_tr.
*/
@@ -1767,7 +1767,7 @@ static void __init apply_trace_boot_options(void);
/**
* register_tracer - register a tracer with the ftrace system.
- * @type - the plugin for the tracer
+ * @type: the plugin for the tracer
*
* Register a new plugin tracer.
*/
@@ -2230,9 +2230,9 @@ static bool tracing_record_taskinfo_skip(int flags)
/**
* tracing_record_taskinfo - record the task info of a task
*
- * @task - task to record
- * @flags - TRACE_RECORD_CMDLINE for recording comm
- * - TRACE_RECORD_TGID for recording tgid
+ * @task: task to record
+ * @flags: TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
*/
void tracing_record_taskinfo(struct task_struct *task, int flags)
{
@@ -2258,10 +2258,10 @@ void tracing_record_taskinfo(struct task_struct *task, int flags)
/**
* tracing_record_taskinfo_sched_switch - record task info for sched_switch
*
- * @prev - previous task during sched_switch
- * @next - next task during sched_switch
- * @flags - TRACE_RECORD_CMDLINE for recording comm
- * TRACE_RECORD_TGID for recording tgid
+ * @prev: previous task during sched_switch
+ * @next: next task during sched_switch
+ * @flags: TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
*/
void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
struct task_struct *next, int flags)
@@ -2642,10 +2642,10 @@ static void ftrace_exports(struct ring_buffer_event *event)
preempt_disable_notrace();
- export = rcu_dereference_raw_notrace(ftrace_exports_list);
+ export = rcu_dereference_raw_check(ftrace_exports_list);
while (export) {
trace_process_export(export, event);
- export = rcu_dereference_raw_notrace(export->next);
+ export = rcu_dereference_raw_check(export->next);
}
preempt_enable_notrace();
@@ -3072,7 +3072,9 @@ static void trace_printk_start_stop_comm(int enabled)
/**
* trace_vbprintk - write binary msg to tracing buffer
- *
+ * @ip: The address of the caller
+ * @fmt: The string format to write to the buffer
+ * @args: Arguments for @fmt
*/
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c7506bc81b75..b89cdfe20bc1 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -255,12 +255,12 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
/*
- * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+ * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
* preemption (adding one to the preempt_count). Since we are
* interested in the preempt_count at the time the tracepoint was
* hit, we need to subtract one to offset the increment.
*/
- if (IS_ENABLED(CONFIG_PREEMPT))
+ if (IS_ENABLED(CONFIG_PREEMPTION))
fbuffer->pc--;
fbuffer->trace_file = trace_file;
@@ -787,7 +787,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
return ret;
}
-static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
+int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
char *event = NULL, *sub = NULL, *match;
int ret;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 69ebf3c2f1b5..78af97163147 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -137,6 +137,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
return 0;
+ /*
+ * Do not trace a function if it's filtered by set_graph_notrace.
+ * Make the index of ret stack negative to indicate that it should
+ * ignore further functions. But it needs its own ret stack entry
+ * to recover the original index in order to continue tracing after
+ * returning from the function.
+ */
if (ftrace_graph_notrace_addr(trace->func)) {
trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
/*
@@ -156,16 +163,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return 0;
/*
- * Do not trace a function if it's filtered by set_graph_notrace.
- * Make the index of ret stack negative to indicate that it should
- * ignore further functions. But it needs its own ret stack entry
- * to recover the original index in order to continue tracing after
- * returning from the function.
- */
- if (ftrace_graph_notrace_addr(trace->func))
- return 1;
-
- /*
* Stop here if tracing_threshold is set. We only write function return
* events to the ring buffer.
*/
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index dbef0d135075..fb6bfbc5bf86 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -895,7 +895,8 @@ void trace_probe_cleanup(struct trace_probe *tp)
for (i = 0; i < tp->nr_args; i++)
traceprobe_free_probe_arg(&tp->args[i]);
- kfree(call->class->system);
+ if (call->class)
+ kfree(call->class->system);
kfree(call->name);
kfree(call->print_fmt);
}
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 743b2b520d34..5e43b9664eca 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -579,8 +579,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
else
tracing_dl = 0;
- wakeup_task = p;
- get_task_struct(wakeup_task);
+ wakeup_task = get_task_struct(p);
local_save_flags(flags);