Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit_tree.c              |   2
-rw-r--r--  kernel/bpf/helpers.c             |  15
-rw-r--r--  kernel/bpf/local_storage.c       |   5
-rw-r--r--  kernel/bpf/verifier.c            |  40
-rw-r--r--  kernel/cgroup/cpuset.c           |  95
-rw-r--r--  kernel/cpu_pm.c                  |  50
-rw-r--r--  kernel/cred.c                    |  41
-rw-r--r--  kernel/dma/debug.c               |   7
-rw-r--r--  kernel/fork.c                    |   7
-rw-r--r--  kernel/irq/timings.c             |   2
-rw-r--r--  kernel/kthread.c                 |  33
-rw-r--r--  kernel/locking/lockdep.c         |  16
-rw-r--r--  kernel/locking/mutex.c           |  15
-rw-r--r--  kernel/pid_namespace.c           |   3
-rw-r--r--  kernel/power/energy_model.c      |   4
-rw-r--r--  kernel/rcu/srcutiny.c            |  77
-rw-r--r--  kernel/rcu/srcutree.c            | 127
-rw-r--r--  kernel/rcu/tree.c                |   4
-rw-r--r--  kernel/rcu/tree_plugin.h         |   9
-rw-r--r--  kernel/rcu/tree_stall.h          |  34
-rw-r--r--  kernel/sched/core.c              |  25
-rw-r--r--  kernel/sched/deadline.c          |   8
-rw-r--r--  kernel/sched/fair.c              |   4
-rw-r--r--  kernel/sched/sched.h             |   2
-rw-r--r--  kernel/static_call.c             |   4
-rw-r--r--  kernel/sys.c                     |  12
-rw-r--r--  kernel/time/hrtimer.c            |  92
-rw-r--r--  kernel/time/tick-internal.h      |   3
-rw-r--r--  kernel/trace/trace_events_hist.c |   2
-rw-r--r--  kernel/tracepoint.c              |  81
-rw-r--r--  kernel/ucount.c                  |  40
-rw-r--r--  kernel/user_namespace.c          |   3
-rw-r--r--  kernel/workqueue.c               |  12
33 files changed, 574 insertions(+), 300 deletions(-)
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 6c91902f4f45..39241207ec04 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -593,7 +593,6 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
spin_lock(&hash_lock);
}
spin_unlock(&hash_lock);
- put_tree(victim);
}
/*
@@ -602,6 +601,7 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
static void prune_one(struct audit_tree *victim)
{
prune_tree_chunks(victim, false);
+ put_tree(victim);
}
/* trim the uncommitted chunks from tree */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f7e99bb8c3b6..0efe7c7bfe5e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -372,8 +372,8 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
};
#ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
- bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
+ bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
{
@@ -382,10 +382,17 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
* verifier checks that its value is correct.
*/
enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage;
+ struct bpf_cgroup_storage *storage = NULL;
void *ptr;
+ int i;
- storage = this_cpu_read(bpf_cgroup_storage[stype]);
+ for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
+ if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+ continue;
+
+ storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
+ break;
+ }
if (stype == BPF_CGROUP_STORAGE_SHARED)
ptr = &READ_ONCE(storage->buf)->data[0];
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 571bb351ed3b..b139247d2dd3 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -9,10 +9,11 @@
#include <linux/slab.h>
#include <uapi/linux/btf.h>
-DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
-
#ifdef CONFIG_CGROUP_BPF
+DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
+ bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
+
#include "../cgroup/cgroup-internal.h"
#define LOCAL_STORAGE_CREATE_FLAG_MASK \
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ce1e9193365f..cba1f86e75cd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4693,8 +4693,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_RINGBUF:
if (func_id != BPF_FUNC_ringbuf_output &&
func_id != BPF_FUNC_ringbuf_reserve &&
- func_id != BPF_FUNC_ringbuf_submit &&
- func_id != BPF_FUNC_ringbuf_discard &&
func_id != BPF_FUNC_ringbuf_query)
goto error;
break;
@@ -4798,6 +4796,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
+ case BPF_FUNC_ringbuf_output:
+ case BPF_FUNC_ringbuf_reserve:
+ case BPF_FUNC_ringbuf_query:
+ if (map->map_type != BPF_MAP_TYPE_RINGBUF)
+ goto error;
+ break;
case BPF_FUNC_get_stackid:
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
@@ -10452,10 +10456,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
* insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
-static int adjust_insn_aux_data(struct bpf_verifier_env *env,
- struct bpf_prog *new_prog, u32 off, u32 cnt)
+static void adjust_insn_aux_data(struct bpf_verifier_env *env,
+ struct bpf_insn_aux_data *new_data,
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
{
- struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ struct bpf_insn_aux_data *old_data = env->insn_aux_data;
struct bpf_insn *insn = new_prog->insnsi;
u32 old_seen = old_data[off].seen;
u32 prog_len;
@@ -10468,12 +10473,9 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
if (cnt == 1)
- return 0;
+ return;
prog_len = new_prog->len;
- new_data = vzalloc(array_size(prog_len,
- sizeof(struct bpf_insn_aux_data)));
- if (!new_data)
- return -ENOMEM;
+
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
@@ -10484,7 +10486,6 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
}
env->insn_aux_data = new_data;
vfree(old_data);
- return 0;
}
static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
@@ -10519,6 +10520,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
const struct bpf_insn *patch, u32 len)
{
struct bpf_prog *new_prog;
+ struct bpf_insn_aux_data *new_data = NULL;
+
+ if (len > 1) {
+ new_data = vzalloc(array_size(env->prog->len + len - 1,
+ sizeof(struct bpf_insn_aux_data)));
+ if (!new_data)
+ return NULL;
+ }
new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
if (IS_ERR(new_prog)) {
@@ -10526,10 +10535,10 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
verbose(env,
"insn %d cannot be patched due to 16-bit range\n",
env->insn_aux_data[off].orig_idx);
+ vfree(new_data);
return NULL;
}
- if (adjust_insn_aux_data(env, new_prog, off, len))
- return NULL;
+ adjust_insn_aux_data(env, new_data, new_prog, off, len);
adjust_subprog_starts(env, off, len);
adjust_poke_descs(new_prog, off, len);
return new_prog;
@@ -10705,6 +10714,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
if (aux_data[i].seen)
continue;
memcpy(insn + i, &trap, sizeof(trap));
+ aux_data[i].zext_dst = false;
}
}
@@ -11028,6 +11038,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (is_narrower_load && size < target_size) {
u8 shift = bpf_ctx_narrow_access_offset(
off, size, size_default) * 8;
+ if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
+ verbose(env, "bpf verifier narrow ctx load misconfigured\n");
+ return -EINVAL;
+ }
if (ctx_field_size <= 4) {
if (shift)
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 53c70c470a38..1999fcec45c7 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1114,7 +1114,7 @@ enum subparts_cmd {
* cpus_allowed can be granted or an error code will be returned.
*
* For partcmd_disable, the cpuset is being transofrmed from a partition
- * root back to a non-partition root. any CPUs in cpus_allowed that are in
+ * root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
* into parent's effective_cpus. 0 should always be returned.
*
@@ -1148,6 +1148,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
struct cpuset *parent = parent_cs(cpuset);
int adding; /* Moving cpus from effective_cpus to subparts_cpus */
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
+ int new_prs;
bool part_error = false; /* Partition error? */
percpu_rwsem_assert_held(&cpuset_rwsem);
@@ -1183,6 +1184,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* A cpumask update cannot make parent's effective_cpus become empty.
*/
adding = deleting = false;
+ new_prs = cpuset->partition_root_state;
if (cmd == partcmd_enable) {
cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
adding = true;
@@ -1225,7 +1227,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
/*
* partcmd_update w/o newmask:
*
- * addmask = cpus_allowed & parent->effectiveb_cpus
+ * addmask = cpus_allowed & parent->effective_cpus
*
* Note that parent's subparts_cpus may have been
* pre-shrunk in case there is a change in the cpu list.
@@ -1247,11 +1249,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
switch (cpuset->partition_root_state) {
case PRS_ENABLED:
if (part_error)
- cpuset->partition_root_state = PRS_ERROR;
+ new_prs = PRS_ERROR;
break;
case PRS_ERROR:
if (!part_error)
- cpuset->partition_root_state = PRS_ENABLED;
+ new_prs = PRS_ENABLED;
break;
}
/*
@@ -1260,10 +1262,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
part_error = (prev_prs == PRS_ERROR);
}
- if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
+ if (!part_error && (new_prs == PRS_ERROR))
return 0; /* Nothing need to be done */
- if (cpuset->partition_root_state == PRS_ERROR) {
+ if (new_prs == PRS_ERROR) {
/*
* Remove all its cpus from parent's subparts_cpus.
*/
@@ -1272,7 +1274,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
parent->subparts_cpus);
}
- if (!adding && !deleting)
+ if (!adding && !deleting && (new_prs == cpuset->partition_root_state))
return 0;
/*
@@ -1299,6 +1301,9 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
}
parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+
+ if (cpuset->partition_root_state != new_prs)
+ cpuset->partition_root_state = new_prs;
spin_unlock_irq(&callback_lock);
return cmd == partcmd_update;
@@ -1321,6 +1326,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
bool need_rebuild_sched_domains = false;
+ int new_prs;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
@@ -1360,17 +1366,18 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
* update_tasks_cpumask() again for tasks in the parent
* cpuset if the parent's subparts_cpus changes.
*/
- if ((cp != cs) && cp->partition_root_state) {
+ new_prs = cp->partition_root_state;
+ if ((cp != cs) && new_prs) {
switch (parent->partition_root_state) {
case PRS_DISABLED:
/*
* If parent is not a partition root or an
- * invalid partition root, clear the state
- * state and the CS_CPU_EXCLUSIVE flag.
+ * invalid partition root, clear its state
+ * and its CS_CPU_EXCLUSIVE flag.
*/
WARN_ON_ONCE(cp->partition_root_state
!= PRS_ERROR);
- cp->partition_root_state = 0;
+ new_prs = PRS_DISABLED;
/*
* clear_bit() is an atomic operation and
@@ -1391,11 +1398,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
/*
* When parent is invalid, it has to be too.
*/
- cp->partition_root_state = PRS_ERROR;
- if (cp->nr_subparts_cpus) {
- cp->nr_subparts_cpus = 0;
- cpumask_clear(cp->subparts_cpus);
- }
+ new_prs = PRS_ERROR;
break;
}
}
@@ -1407,8 +1410,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- if (cp->nr_subparts_cpus &&
- (cp->partition_root_state != PRS_ENABLED)) {
+ if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
cp->nr_subparts_cpus = 0;
cpumask_clear(cp->subparts_cpus);
} else if (cp->nr_subparts_cpus) {
@@ -1435,6 +1437,10 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
= cpumask_weight(cp->subparts_cpus);
}
}
+
+ if (new_prs != cp->partition_root_state)
+ cp->partition_root_state = new_prs;
+
spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
@@ -1937,34 +1943,32 @@ out:
/*
* update_prstate - update partititon_root_state
- * cs: the cpuset to update
- * val: 0 - disabled, 1 - enabled
+ * cs: the cpuset to update
+ * new_prs: new partition root state
*
* Call with cpuset_mutex held.
*/
-static int update_prstate(struct cpuset *cs, int val)
+static int update_prstate(struct cpuset *cs, int new_prs)
{
- int err;
+ int err, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
- struct tmpmasks tmp;
+ struct tmpmasks tmpmask;
- if ((val != 0) && (val != 1))
- return -EINVAL;
- if (val == cs->partition_root_state)
+ if (old_prs == new_prs)
return 0;
/*
* Cannot force a partial or invalid partition root to a full
* partition root.
*/
- if (val && cs->partition_root_state)
+ if (new_prs && (old_prs == PRS_ERROR))
return -EINVAL;
- if (alloc_cpumasks(NULL, &tmp))
+ if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
err = -EINVAL;
- if (!cs->partition_root_state) {
+ if (!old_prs) {
/*
* Turning on partition root requires setting the
* CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
@@ -1978,31 +1982,27 @@ static int update_prstate(struct cpuset *cs, int val)
goto out;
err = update_parent_subparts_cpumask(cs, partcmd_enable,
- NULL, &tmp);
+ NULL, &tmpmask);
if (err) {
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
goto out;
}
- cs->partition_root_state = PRS_ENABLED;
} else {
/*
* Turning off partition root will clear the
* CS_CPU_EXCLUSIVE bit.
*/
- if (cs->partition_root_state == PRS_ERROR) {
- cs->partition_root_state = 0;
+ if (old_prs == PRS_ERROR) {
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
err = 0;
goto out;
}
err = update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, &tmp);
+ NULL, &tmpmask);
if (err)
goto out;
- cs->partition_root_state = 0;
-
/* Turning off CS_CPU_EXCLUSIVE will not return error */
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
}
@@ -2015,11 +2015,17 @@ static int update_prstate(struct cpuset *cs, int val)
update_tasks_cpumask(parent);
if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmp);
+ update_sibling_cpumasks(parent, cs, &tmpmask);
rebuild_sched_domains_locked();
out:
- free_cpumasks(NULL, &tmp);
+ if (!err) {
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = new_prs;
+ spin_unlock_irq(&callback_lock);
+ }
+
+ free_cpumasks(NULL, &tmpmask);
return err;
}
@@ -3060,7 +3066,7 @@ retry:
goto retry;
}
- parent = parent_cs(cs);
+ parent = parent_cs(cs);
compute_effective_cpumask(&new_cpus, cs, parent);
nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
@@ -3082,8 +3088,10 @@ retry:
if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
(parent->partition_root_state == PRS_ERROR))) {
if (cs->nr_subparts_cpus) {
+ spin_lock_irq(&callback_lock);
cs->nr_subparts_cpus = 0;
cpumask_clear(cs->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
compute_effective_cpumask(&new_cpus, cs, parent);
}
@@ -3097,7 +3105,9 @@ retry:
cpumask_empty(&new_cpus)) {
update_parent_subparts_cpumask(cs, partcmd_disable,
NULL, tmp);
+ spin_lock_irq(&callback_lock);
cs->partition_root_state = PRS_ERROR;
+ spin_unlock_irq(&callback_lock);
}
cpuset_force_rebuild();
}
@@ -3168,6 +3178,13 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
+ /*
+ * In the rare case that hotplug removes all the cpus in subparts_cpus,
+ * we assume that cpus are updated.
+ */
+ if (!cpus_updated && top_cpuset.nr_subparts_cpus)
+ cpus_updated = true;
+
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
spin_lock_irq(&callback_lock);
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index f7e1d0eccdbc..246efc74e3f3 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -13,19 +13,32 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
-static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+/*
+ * atomic_notifiers use a spinlock_t, which can block under PREEMPT_RT.
+ * Notifications for cpu_pm will be issued by the idle task itself, which can
+ * never block, IOW it requires using a raw_spinlock_t.
+ */
+static struct {
+ struct raw_notifier_head chain;
+ raw_spinlock_t lock;
+} cpu_pm_notifier = {
+ .chain = RAW_NOTIFIER_INIT(cpu_pm_notifier.chain),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(cpu_pm_notifier.lock),
+};
static int cpu_pm_notify(enum cpu_pm_event event)
{
int ret;
/*
- * atomic_notifier_call_chain has a RCU read critical section, which
- * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
- * RCU know this.
+ * This introduces an RCU read critical section, which could be
+ * dysfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know
+ * this.
*/
rcu_irq_enter_irqson();
- ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);
+ rcu_read_lock();
+ ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
+ rcu_read_unlock();
rcu_irq_exit_irqson();
return notifier_to_errno(ret);
@@ -33,10 +46,13 @@ static int cpu_pm_notify(enum cpu_pm_event event)
static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down)
{
+ unsigned long flags;
int ret;
rcu_irq_enter_irqson();
- ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL);
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
rcu_irq_exit_irqson();
return notifier_to_errno(ret);
@@ -49,12 +65,17 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
* Add a driver to a list of drivers that are notified about
* CPU and CPU cluster low power entry and exit.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_register.
+ * This function has the same return conditions as raw_notifier_chain_register.
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_register(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -64,12 +85,17 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
*
* Remove a driver from the CPU PM notifier list.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_unregister.
+ * This function has the same return conditions as raw_notifier_chain_unregister.
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_unregister(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
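
As background for the conversion above, a minimal illustrative sketch (not part of the patch) of a driver-side user of this API; the my_* names are hypothetical, while cpu_pm_register_notifier(), cpu_pm_unregister_notifier() and the CPU_PM_* events are the kernel interfaces kept by this change. Since the chain is now a raw notifier invoked from the idle path, the callback must not sleep:

#include <linux/cpu_pm.h>
#include <linux/notifier.h>

/* Hypothetical callback: save/restore per-CPU hardware state, non-sleeping. */
static int my_cpu_pm_notify(struct notifier_block *nb,
			    unsigned long action, void *data)
{
	switch (action) {
	case CPU_PM_ENTER:		/* CPU about to enter a low-power state */
		/* save per-CPU context here */
		break;
	case CPU_PM_ENTER_FAILED:	/* low-power entry was aborted */
	case CPU_PM_EXIT:		/* CPU left the low-power state */
		/* restore per-CPU context here */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_pm_nb = {
	.notifier_call = my_cpu_pm_notify,
};

static int my_driver_init(void)
{
	return cpu_pm_register_notifier(&my_cpu_pm_nb);
}

static void my_driver_exit(void)
{
	cpu_pm_unregister_notifier(&my_cpu_pm_nb);
}
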
diff --git a/kernel/cred.c b/kernel/cred.c
index 098213d4a39c..421b1149c651 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -60,7 +60,6 @@ struct cred init_cred = {
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
- .ucounts = &init_ucounts,
};
static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -120,8 +119,6 @@ static void put_cred_rcu(struct rcu_head *rcu)
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
- if (cred->ucounts)
- put_ucounts(cred->ucounts);
put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -225,7 +222,6 @@ struct cred *cred_alloc_blank(void)
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
#endif
- new->ucounts = get_ucounts(&init_ucounts);
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
@@ -288,11 +284,6 @@ struct cred *prepare_creds(void)
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
-
- new->ucounts = get_ucounts(new->ucounts);
- if (!new->ucounts)
- goto error;
-
validate_creds(new);
return new;
@@ -372,9 +363,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
- ret = set_cred_ucounts(new);
- if (ret < 0)
- goto error_put;
}
#ifdef CONFIG_KEYS
@@ -665,31 +653,6 @@ int cred_fscmp(const struct cred *a, const struct cred *b)
}
EXPORT_SYMBOL(cred_fscmp);
-int set_cred_ucounts(struct cred *new)
-{
- struct task_struct *task = current;
- const struct cred *old = task->real_cred;
- struct ucounts *old_ucounts = new->ucounts;
-
- if (new->user == old->user && new->user_ns == old->user_ns)
- return 0;
-
- /*
- * This optimization is needed because alloc_ucounts() uses locks
- * for table lookups.
- */
- if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
- return 0;
-
- if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid)))
- return -EAGAIN;
-
- if (old_ucounts)
- put_ucounts(old_ucounts);
-
- return 0;
-}
-
/*
* initialise the credentials stuff
*/
@@ -756,10 +719,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
- new->ucounts = get_ucounts(new->ucounts);
- if (!new->ucounts)
- goto error;
-
put_cred(old);
validate_creds(new);
return new;
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 14de1271463f..445754529917 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -794,7 +794,7 @@ static int dump_show(struct seq_file *seq, void *v)
}
DEFINE_SHOW_ATTRIBUTE(dump);
-static void dma_debug_fs_init(void)
+static int __init dma_debug_fs_init(void)
{
struct dentry *dentry = debugfs_create_dir("dma-api", NULL);
@@ -807,7 +807,10 @@ static void dma_debug_fs_init(void)
debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries);
debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops);
debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops);
+
+ return 0;
}
+core_initcall_sync(dma_debug_fs_init);
static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry)
{
@@ -892,8 +895,6 @@ static int dma_debug_init(void)
spin_lock_init(&dma_entry_hash[i].lock);
}
- dma_debug_fs_init();
-
nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES);
for (i = 0; i < nr_pages; ++i)
dma_debug_create_entries(GFP_KERNEL);
diff --git a/kernel/fork.c b/kernel/fork.c
index 096945ef49ad..3f96400a0ac6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1037,6 +1037,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->pmd_huge_pte = NULL;
#endif
mm_init_uprobes_state(mm);
+ hugetlb_count_init(mm);
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -2960,12 +2961,6 @@ int ksys_unshare(unsigned long unshare_flags)
if (err)
goto bad_unshare_cleanup_cred;
- if (new_cred) {
- err = set_cred_ucounts(new_cred);
- if (err)
- goto bad_unshare_cleanup_cred;
- }
-
if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 6990490fa67b..1f981162648a 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -799,12 +799,14 @@ static int __init irq_timings_test_irqs(struct timings_intervals *ti)
__irq_timings_store(irq, irqs, ti->intervals[i]);
if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) {
+ ret = -EBADSLT;
pr_err("Failed to store in the circular buffer\n");
goto out;
}
}
if (irqs->count != ti->count) {
+ ret = -ERANGE;
pr_err("Count differs\n");
goto out;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9825cf89c614..508fe5278285 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -84,6 +84,25 @@ static inline struct kthread *to_kthread(struct task_struct *k)
return (__force void *)k->set_child_tid;
}
+/*
+ * Variant of to_kthread() that doesn't assume @p is a kthread.
+ *
+ * Per construction; when:
+ *
+ * (p->flags & PF_KTHREAD) && p->set_child_tid
+ *
+ * the task is both a kthread and struct kthread is persistent. However
+ * PF_KTHREAD on its own is not, kernel_thread() can exec() (See umh.c and
+ * begin_new_exec()).
+ */
+static inline struct kthread *__to_kthread(struct task_struct *p)
+{
+ void *kthread = (__force void *)p->set_child_tid;
+ if (kthread && !(p->flags & PF_KTHREAD))
+ kthread = NULL;
+ return kthread;
+}
+
void free_kthread_struct(struct task_struct *k)
{
struct kthread *kthread;
@@ -168,8 +187,9 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
*/
void *kthread_func(struct task_struct *task)
{
- if (task->flags & PF_KTHREAD)
- return to_kthread(task)->threadfn;
+ struct kthread *kthread = __to_kthread(task);
+ if (kthread)
+ return kthread->threadfn;
return NULL;
}
EXPORT_SYMBOL_GPL(kthread_func);
@@ -199,10 +219,11 @@ EXPORT_SYMBOL_GPL(kthread_data);
*/
void *kthread_probe_data(struct task_struct *task)
{
- struct kthread *kthread = to_kthread(task);
+ struct kthread *kthread = __to_kthread(task);
void *data = NULL;
- copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
+ if (kthread)
+ copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
return data;
}
@@ -514,9 +535,9 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu)
set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
}
-bool kthread_is_per_cpu(struct task_struct *k)
+bool kthread_is_per_cpu(struct task_struct *p)
{
- struct kthread *kthread = to_kthread(k);
+ struct kthread *kthread = __to_kthread(p);
if (!kthread)
return false;
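
As an aside, a minimal sketch (not part of the patch) of the safer probing pattern these helpers enable; my_inspect_task() is hypothetical, while kthread_probe_data() and kthread_is_per_cpu() are the helpers changed above:

#include <linux/kthread.h>
#include <linux/sched.h>

/* Hypothetical: inspect a task that may or may not (still) be a kthread. */
static void my_inspect_task(struct task_struct *p)
{
	/* NULL for user tasks and for kernel threads that have since exec()ed. */
	void *data = kthread_probe_data(p);

	if (kthread_is_per_cpu(p))
		pr_info("%s: per-cpu kthread, data=%p\n", p->comm, data);
}
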
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 8ae9d7abebc0..5184f6896815 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1293,6 +1293,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
class->name_version = count_matching_names(class);
class->wait_type_inner = lock->wait_type_inner;
class->wait_type_outer = lock->wait_type_outer;
+ class->lock_type = lock->lock_type;
/*
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
@@ -4621,9 +4622,9 @@ print_lock_invalid_wait_context(struct task_struct *curr,
*/
static int check_wait_context(struct task_struct *curr, struct held_lock *next)
{
- short next_inner = hlock_class(next)->wait_type_inner;
- short next_outer = hlock_class(next)->wait_type_outer;
- short curr_inner;
+ u8 next_inner = hlock_class(next)->wait_type_inner;
+ u8 next_outer = hlock_class(next)->wait_type_outer;
+ u8 curr_inner;
int depth;
if (!next_inner || next->trylock)
@@ -4646,7 +4647,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
- short prev_inner = hlock_class(prev)->wait_type_inner;
+ u8 prev_inner = hlock_class(prev)->wait_type_inner;
if (prev_inner) {
/*
@@ -4695,9 +4696,9 @@ static inline int check_wait_context(struct task_struct *curr,
/*
* Initialize a lock instance's lock-class mapping info:
*/
-void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
+void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass,
- short inner, short outer)
+ u8 inner, u8 outer, u8 lock_type)
{
int i;
@@ -4720,6 +4721,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
lock->wait_type_outer = outer;
lock->wait_type_inner = inner;
+ lock->lock_type = lock_type;
/*
* No key, no joy, we need to hash something.
@@ -4754,7 +4756,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
raw_local_irq_restore(flags);
}
}
-EXPORT_SYMBOL_GPL(lockdep_init_map_waits);
+EXPORT_SYMBOL_GPL(lockdep_init_map_type);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 15ac7c4bb111..86061901636c 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -938,7 +938,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
struct mutex_waiter waiter;
- bool first = false;
struct ww_mutex *ww;
int ret;
@@ -1017,6 +1016,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
set_current_state(state);
for (;;) {
+ bool first;
+
/*
* Once we hold wait_lock, we're serialized against
* mutex_unlock() handing the lock off to us, do a trylock
@@ -1045,15 +1046,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
spin_unlock(&lock->wait_lock);
schedule_preempt_disabled();
- /*
- * ww_mutex needs to always recheck its position since its waiter
- * list is not FIFO ordered.
- */
- if (ww_ctx || !first) {
- first = __mutex_waiter_is_first(lock, &waiter);
- if (first)
- __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
- }
+ first = __mutex_waiter_is_first(lock, &waiter);
+ if (first)
+ __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
set_current_state(state);
/*
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 9de21803a8ae..ef8733e2a476 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -51,7 +51,8 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
mutex_lock(&pid_caches_mutex);
/* Name collision forces to do allocation under mutex. */
if (!*pkc)
- *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0);
+ *pkc = kmem_cache_create(name, len, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0);
mutex_unlock(&pid_caches_mutex);
/* current can fail, but someone else can succeed. */
return READ_ONCE(*pkc);
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 994ca8353543..be381eb6116a 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -157,7 +157,9 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
/* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
for (i = 0; i < nr_states; i++) {
- table[i].cost = div64_u64(fmax * table[i].power,
+ unsigned long power_res = em_scale_power(table[i].power);
+
+ table[i].cost = div64_u64(fmax * power_res,
table[i].frequency);
}
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 6208c1dae5c9..26344dc6483b 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -34,6 +34,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp)
ssp->srcu_gp_running = false;
ssp->srcu_gp_waiting = false;
ssp->srcu_idx = 0;
+ ssp->srcu_idx_max = 0;
INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
INIT_LIST_HEAD(&ssp->srcu_work.entry);
return 0;
@@ -84,6 +85,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
WARN_ON(ssp->srcu_gp_waiting);
WARN_ON(ssp->srcu_cb_head);
WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
+ WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max);
+ WARN_ON(ssp->srcu_idx & 0x1);
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
@@ -114,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head))
+ if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
return; /* Already running or nothing to do. */
/* Remove recently arrived callbacks and wait for readers. */
@@ -124,11 +127,12 @@ void srcu_drive_gp(struct work_struct *wp)
ssp->srcu_cb_head = NULL;
ssp->srcu_cb_tail = &ssp->srcu_cb_head;
local_irq_enable();
- idx = ssp->srcu_idx;
- WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx);
+ idx = (ssp->srcu_idx & 0x2) / 2;
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
/* Invoke the callbacks we removed above. */
while (lh) {
@@ -146,11 +150,27 @@ void srcu_drive_gp(struct work_struct *wp)
* straighten that out.
*/
WRITE_ONCE(ssp->srcu_gp_running, false);
- if (READ_ONCE(ssp->srcu_cb_head))
+ if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
+static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
+{
+ unsigned short cookie;
+
+ cookie = get_state_synchronize_srcu(ssp);
+ if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+ return;
+ WRITE_ONCE(ssp->srcu_idx_max, cookie);
+ if (!READ_ONCE(ssp->srcu_gp_running)) {
+ if (likely(srcu_init_done))
+ schedule_work(&ssp->srcu_work);
+ else if (list_empty(&ssp->srcu_work.entry))
+ list_add(&ssp->srcu_work.entry, &srcu_boot_list);
+ }
+}
+
/*
* Enqueue an SRCU callback on the specified srcu_struct structure,
* initiating grace-period processing if it is not already running.
@@ -166,12 +186,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
*ssp->srcu_cb_tail = rhp;
ssp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
- if (!READ_ONCE(ssp->srcu_gp_running)) {
- if (likely(srcu_init_done))
- schedule_work(&ssp->srcu_work);
- else if (list_empty(&ssp->srcu_work.entry))
- list_add(&ssp->srcu_work.entry, &srcu_boot_list);
- }
+ srcu_gp_start_if_needed(ssp);
}
EXPORT_SYMBOL_GPL(call_srcu);
@@ -190,6 +205,48 @@ void synchronize_srcu(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+/*
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret;
+
+ barrier();
+ ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
+ barrier();
+ return ret & USHRT_MAX;
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/*
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ *
+ * The difference between this and get_state_synchronize_srcu() is that
+ * this function ensures that the poll_state_synchronize_srcu() will
+ * eventually return the value true.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret = get_state_synchronize_srcu(ssp);
+
+ srcu_gp_start_if_needed(ssp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/*
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie);
+
+ barrier();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
/* Lockdep diagnostics. */
void __init rcu_scheduler_starting(void)
{
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 68ceac387844..b8821665c435 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -809,6 +809,46 @@ static void srcu_leak_callback(struct rcu_head *rhp)
}
/*
+ * Start an SRCU grace period, and also queue the callback if non-NULL.
+ */
+static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
+ struct rcu_head *rhp, bool do_norm)
+{
+ unsigned long flags;
+ int idx;
+ bool needexp = false;
+ bool needgp = false;
+ unsigned long s;
+ struct srcu_data *sdp;
+
+ check_init_srcu_struct(ssp);
+ idx = srcu_read_lock(ssp);
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_rcu_node(sdp, flags);
+ if (rhp)
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&ssp->srcu_gp_seq));
+ s = rcu_seq_snap(&ssp->srcu_gp_seq);
+ (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
+ if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
+ sdp->srcu_gp_seq_needed = s;
+ needgp = true;
+ }
+ if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
+ sdp->srcu_gp_seq_needed_exp = s;
+ needexp = true;
+ }
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ if (needgp)
+ srcu_funnel_gp_start(ssp, sdp, s, do_norm);
+ else if (needexp)
+ srcu_funnel_exp_start(ssp, sdp->mynode, s);
+ srcu_read_unlock(ssp, idx);
+ return s;
+}
+
+/*
* Enqueue an SRCU callback on the srcu_data structure associated with
* the current CPU and the specified srcu_struct structure, initiating
* grace-period processing if it is not already running.
@@ -839,14 +879,6 @@ static void srcu_leak_callback(struct rcu_head *rhp)
static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func, bool do_norm)
{
- unsigned long flags;
- int idx;
- bool needexp = false;
- bool needgp = false;
- unsigned long s;
- struct srcu_data *sdp;
-
- check_init_srcu_struct(ssp);
if (debug_rcu_head_queue(rhp)) {
/* Probable double call_srcu(), so leak the callback. */
WRITE_ONCE(rhp->func, srcu_leak_callback);
@@ -854,28 +886,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
return;
}
rhp->func = func;
- idx = srcu_read_lock(ssp);
- sdp = raw_cpu_ptr(ssp->sda);
- spin_lock_irqsave_rcu_node(sdp, flags);
- rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
- rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
- s = rcu_seq_snap(&ssp->srcu_gp_seq);
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
- if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
- sdp->srcu_gp_seq_needed = s;
- needgp = true;
- }
- if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
- sdp->srcu_gp_seq_needed_exp = s;
- needexp = true;
- }
- spin_unlock_irqrestore_rcu_node(sdp, flags);
- if (needgp)
- srcu_funnel_gp_start(ssp, sdp, s, do_norm);
- else if (needexp)
- srcu_funnel_exp_start(ssp, sdp->mynode, s);
- srcu_read_unlock(ssp, idx);
+ (void)srcu_gp_start_if_needed(ssp, rhp, do_norm);
}
/**
@@ -1004,6 +1015,62 @@ void synchronize_srcu(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+/**
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. It is the caller's responsibility
+ * to make sure that grace period happens, for example, by invoking
+ * call_srcu() after return from get_state_synchronize_srcu().
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ // Any prior manipulation of SRCU-protected data must happen
+ // before the load from ->srcu_gp_seq.
+ smp_mb();
+ return rcu_seq_snap(&ssp->srcu_gp_seq);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/**
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(),
+ * this function also ensures that any needed SRCU grace period will be
+ * started. This convenience does come at a cost in terms of CPU overhead.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ return srcu_gp_start_if_needed(ssp, NULL, true);
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/**
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ * @ssp: srcu_struct to provide cookie for.
+ * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu().
+ *
+ * This function takes the cookie that was returned from either
+ * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and
+ * returns @true if an SRCU grace period elapsed since the time that the
+ * cookie was created.
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie))
+ return false;
+ // Ensure that the end of the SRCU grace period happens before
+ // any subsequent code that the caller might execute.
+ smp_mb(); // ^^^
+ return true;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
/*
* Callback function for srcu_barrier() use.
*/
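
To show how the new polling interface is meant to be used, here is a minimal sketch (not part of the patch) of a writer that records a cookie and later blocks only if the grace period has not yet completed; the my_* names are hypothetical, and the same pattern appears in the kernel/tracepoint.c changes later in this diff:

#include <linux/srcu.h>

DEFINE_SRCU(my_srcu);

static unsigned long my_cookie;

static void my_snapshot(void)
{
	/* Start a grace period if needed and remember where it will end. */
	my_cookie = start_poll_synchronize_srcu(&my_srcu);
}

static void my_wait_if_needed(void)
{
	/* Cheap check first; fall back to a blocking wait only if required. */
	if (!poll_state_synchronize_srcu(&my_srcu, my_cookie))
		synchronize_srcu(&my_srcu);
}
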
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8c3ba0185082..8c81c05c4236 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2561,6 +2561,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
void rcu_sched_clock_irq(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
+ lockdep_assert_irqs_disabled();
raw_cpu_inc(rcu_data.ticks_this_gp);
/* The load-acquire pairs with the store-release setting to true. */
if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
@@ -2574,6 +2575,7 @@ void rcu_sched_clock_irq(int user)
rcu_flavor_sched_clock_irq(user);
if (rcu_pending(user))
invoke_rcu_core();
+ lockdep_assert_irqs_disabled();
trace_rcu_utilization(TPS("End scheduler-tick"));
}
@@ -3730,6 +3732,8 @@ static int rcu_pending(int user)
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ lockdep_assert_irqs_disabled();
+
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rdp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7d4f78bf4057..c5091aeaa37b 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -682,6 +682,7 @@ static void rcu_flavor_sched_clock_irq(int user)
{
struct task_struct *t = current;
+ lockdep_assert_irqs_disabled();
if (user || rcu_is_cpu_rrupt_from_idle()) {
rcu_note_voluntary_context_switch(current);
}
@@ -2590,17 +2591,17 @@ static void noinstr rcu_dynticks_task_exit(void)
/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
static void rcu_dynticks_task_trace_enter(void)
{
-#ifdef CONFIG_TASKS_RCU_TRACE
+#ifdef CONFIG_TASKS_TRACE_RCU
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
current->trc_reader_special.b.need_mb = true;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
static void rcu_dynticks_task_trace_exit(void)
{
-#ifdef CONFIG_TASKS_RCU_TRACE
+#ifdef CONFIG_TASKS_TRACE_RCU
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
current->trc_reader_special.b.need_mb = false;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index ca21d28a0f98..251a9af3709a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -7,6 +7,8 @@
* Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
+#include <linux/kvm_para.h>
+
//////////////////////////////////////////////////////////////////////////////
//
// Controlling CPU stall warnings, including delay calculation.
@@ -260,8 +262,11 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
struct task_struct *t;
struct task_struct *ts[8];
- if (!rcu_preempt_blocked_readers_cgp(rnp))
+ lockdep_assert_irqs_disabled();
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
+ }
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
rnp->level, rnp->grplo, rnp->grphi);
t = list_entry(rnp->gp_tasks->prev,
@@ -273,8 +278,8 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
break;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- for (i--; i; i--) {
- t = ts[i];
+ while (i) {
+ t = ts[--i];
if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr))
pr_cont(" P%d", t->pid);
else
@@ -284,6 +289,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
".q"[rscr.rs.b.need_qs],
".e"[rscr.rs.b.exp_hint],
".l"[rscr.on_blkd_list]);
+ lockdep_assert_irqs_disabled();
put_task_struct(t);
ndetected++;
}
@@ -472,6 +478,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
struct rcu_node *rnp;
long totqlen = 0;
+ lockdep_assert_irqs_disabled();
+
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
if (rcu_stall_is_suppressed())
@@ -493,6 +501,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
}
}
ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock.
+ lockdep_assert_irqs_disabled();
}
for_each_possible_cpu(cpu)
@@ -538,6 +547,8 @@ static void print_cpu_stall(unsigned long gps)
struct rcu_node *rnp = rcu_get_root();
long totqlen = 0;
+ lockdep_assert_irqs_disabled();
+
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
if (rcu_stall_is_suppressed())
@@ -592,6 +603,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
unsigned long js;
struct rcu_node *rnp;
+ lockdep_assert_irqs_disabled();
if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) ||
!rcu_gp_in_progress())
return;
@@ -633,6 +645,14 @@ static void check_cpu_stall(struct rcu_data *rdp)
(READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
/* We haven't checked in, so go dump stack. */
print_cpu_stall(gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
@@ -642,6 +662,14 @@ static void check_cpu_stall(struct rcu_data *rdp)
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(gs2, gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 84c105902027..6db20a66e8e6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1279,6 +1279,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
uclamp_rq_dec_id(rq, p, clamp_id);
}
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (!p->uclamp[clamp_id].active)
+ return;
+
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /*
+ * Make sure to clear the idle flag if we've transiently reached 0
+ * active tasks on rq.
+ */
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
static inline void
uclamp_update_active(struct task_struct *p)
{
@@ -1302,12 +1319,8 @@ uclamp_update_active(struct task_struct *p)
* affecting a valid clamp bucket, the next time it's enqueued,
* it will already see the updated clamp bucket value.
*/
- for_each_clamp_id(clamp_id) {
- if (p->uclamp[clamp_id].active) {
- uclamp_rq_dec_id(rq, p, clamp_id);
- uclamp_rq_inc_id(rq, p, clamp_id);
- }
- }
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_reinc_id(rq, p, clamp_id);
task_rq_unlock(rq, p, &rf);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6b98c1fe6e7f..a3ae00c348a8 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1735,6 +1735,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
*/
raw_spin_lock(&rq->lock);
if (p->dl.dl_non_contending) {
+ update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
@@ -2703,7 +2704,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
+ dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
@@ -2716,7 +2717,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
attr->sched_runtime = dl_se->dl_runtime;
attr->sched_deadline = dl_se->dl_deadline;
attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
+ attr->sched_flags &= ~SCHED_DL_FLAGS;
+ attr->sched_flags |= dl_se->flags;
}
/*
@@ -2813,7 +2815,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
if (dl_se->dl_runtime != attr->sched_runtime ||
dl_se->dl_deadline != attr->sched_deadline ||
dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
+ dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
return true;
return false;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 262b02d75007..c004e3b89c32 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1533,7 +1533,7 @@ static inline bool is_core_idle(int cpu)
if (cpu == sibling)
continue;
- if (!idle_cpu(cpu))
+ if (!idle_cpu(sibling))
return false;
}
#endif
@@ -7569,7 +7569,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0;
/* Disregard pcpu kthreads; they are where they need to be. */
- if ((p->flags & PF_KTHREAD) && kthread_is_per_cpu(p))
+ if (kthread_is_per_cpu(p))
return 0;
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 39112ac7ab34..08db8e095e48 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -226,6 +226,8 @@ static inline void update_avg(u64 *avg, u64 sample)
*/
#define SCHED_FLAG_SUGOV 0x10000000
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
diff --git a/kernel/static_call.c b/kernel/static_call.c
index b62a0c41c905..dc5665b62814 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -165,13 +165,13 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
stop = __stop_static_call_sites;
-#ifdef CONFIG_MODULES
if (mod) {
+#ifdef CONFIG_MODULES
stop = mod->static_call_sites +
mod->num_static_call_sites;
init = mod->state == MODULE_STATE_COMING;
- }
#endif
+ }
for (site = site_mod->sites;
site < stop && static_call_key(site) == key; site++) {
diff --git a/kernel/sys.c b/kernel/sys.c
index 0670e824e019..a730c03ee607 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -552,10 +552,6 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
if (retval < 0)
goto error;
- retval = set_cred_ucounts(new);
- if (retval < 0)
- goto error;
-
return commit_creds(new);
error:
@@ -614,10 +610,6 @@ long __sys_setuid(uid_t uid)
if (retval < 0)
goto error;
- retval = set_cred_ucounts(new);
- if (retval < 0)
- goto error;
-
return commit_creds(new);
error:
@@ -693,10 +685,6 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if (retval < 0)
goto error;
- retval = set_cred_ucounts(new);
- if (retval < 0)
- goto error;
-
return commit_creds(new);
error:
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9505b1f21cdf..4ef90718c114 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -758,22 +758,6 @@ static void hrtimer_switch_to_hres(void)
retrigger_next_event(NULL);
}
-static void clock_was_set_work(struct work_struct *work)
-{
- clock_was_set();
-}
-
-static DECLARE_WORK(hrtimer_work, clock_was_set_work);
-
-/*
- * Called from timekeeping and resume code to reprogram the hrtimer
- * interrupt device on all cpus.
- */
-void clock_was_set_delayed(void)
-{
- schedule_work(&hrtimer_work);
-}
-
#else
static inline int hrtimer_is_hres_enabled(void) { return 0; }
@@ -891,6 +875,22 @@ void clock_was_set(void)
timerfd_clock_was_set();
}
+static void clock_was_set_work(struct work_struct *work)
+{
+ clock_was_set();
+}
+
+static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+
+/*
+ * Called from timekeeping and resume code to reprogram the hrtimer
+ * interrupt device on all cpus and to notify timerfd.
+ */
+void clock_was_set_delayed(void)
+{
+ schedule_work(&hrtimer_work);
+}
+
/*
* During resume we might have to reprogram the high resolution timer
* interrupt on all online CPUs. However, all other CPUs will be
@@ -1030,12 +1030,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
* remove hrtimer, called with base lock held
*/
static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ bool restart, bool keep_local)
{
u8 state = timer->state;
if (state & HRTIMER_STATE_ENQUEUED) {
- int reprogram;
+ bool reprogram;
/*
* Remove the timer and force reprogramming when high
@@ -1048,8 +1049,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
debug_deactivate(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ /*
+ * If the timer is not restarted then reprogramming is
+ * required if the timer is local. If it is local and about
+ * to be restarted, avoid programming it twice (on removal
+ * and a moment later when it's requeued).
+ */
if (!restart)
state = HRTIMER_STATE_INACTIVE;
+ else
+ reprogram &= !keep_local;
__remove_hrtimer(timer, base, state, reprogram);
return 1;
@@ -1103,9 +1112,31 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
struct hrtimer_clock_base *base)
{
struct hrtimer_clock_base *new_base;
+ bool force_local, first;
+
+ /*
+ * If the timer is on the local cpu base and is the first expiring
+ * timer then this might end up reprogramming the hardware twice
+ * (on removal and on enqueue). To avoid that by prevent the
+ * reprogram on removal, keep the timer local to the current CPU
+ * and enforce reprogramming after it is queued no matter whether
+ * it is the new first expiring timer again or not.
+ */
+ force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local &= base->cpu_base->next_timer == timer;
- /* Remove an active timer from the queue: */
- remove_hrtimer(timer, base, true);
+ /*
+ * Remove an active timer from the queue. In case it is not queued
+ * on the current CPU, make sure that remove_hrtimer() updates the
+ * remote data correctly.
+ *
+ * If it's on the current CPU and the first expiring timer, then
+ * skip reprogramming, keep the timer local and enforce
+ * reprogramming later if it was the first expiring timer. This
+ * avoids programming the underlying clock event twice (once at
+ * removal and once after enqueue).
+ */
+ remove_hrtimer(timer, base, true, force_local);
if (mode & HRTIMER_MODE_REL)
tim = ktime_add_safe(tim, base->get_time());
@@ -1115,9 +1146,24 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
/* Switch the timer base, if necessary: */
- new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+ if (!force_local) {
+ new_base = switch_hrtimer_base(timer, base,
+ mode & HRTIMER_MODE_PINNED);
+ } else {
+ new_base = base;
+ }
+
+ first = enqueue_hrtimer(timer, new_base, mode);
+ if (!force_local)
+ return first;
- return enqueue_hrtimer(timer, new_base, mode);
+ /*
+ * Timer was forced to stay on the current CPU to avoid
+ * reprogramming on removal and enqueue. Force reprogram the
+ * hardware by evaluating the new first expiring timer.
+ */
+ hrtimer_force_reprogram(new_base->cpu_base, 1);
+ return 0;
}
/**
@@ -1183,7 +1229,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
base = lock_hrtimer_base(timer, &flags);
if (!hrtimer_callback_running(timer))
- ret = remove_hrtimer(timer, base, false);
+ ret = remove_hrtimer(timer, base, false, false);
unlock_hrtimer_base(timer, &flags);
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 7b2496136729..5294f5b1f955 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -165,3 +165,6 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
void timer_clear_idle(void);
+
+void clock_was_set(void);
+void clock_was_set_delayed(void);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 75529b311769..1b7f90e00eb0 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -3405,6 +3405,8 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,
event = data->match_data.event;
}
+ if (!event)
+ goto free;
/*
* At this point, we're looking at a field on another
* event. Because we can't modify a hist trigger on
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d7260f6614a6..2dff7f1a27ec 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -28,6 +28,44 @@ extern tracepoint_ptr_t __stop___tracepoints_ptrs[];
DEFINE_SRCU(tracepoint_srcu);
EXPORT_SYMBOL_GPL(tracepoint_srcu);
+enum tp_transition_sync {
+ TP_TRANSITION_SYNC_1_0_1,
+ TP_TRANSITION_SYNC_N_2_1,
+
+ _NR_TP_TRANSITION_SYNC,
+};
+
+struct tp_transition_snapshot {
+ unsigned long rcu;
+ unsigned long srcu;
+ bool ongoing;
+};
+
+/* Protected by tracepoints_mutex */
+static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC];
+
+static void tp_rcu_get_state(enum tp_transition_sync sync)
+{
+ struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
+
+ /* Keep the latest get_state snapshot. */
+ snapshot->rcu = get_state_synchronize_rcu();
+ snapshot->srcu = start_poll_synchronize_srcu(&tracepoint_srcu);
+ snapshot->ongoing = true;
+}
+
+static void tp_rcu_cond_sync(enum tp_transition_sync sync)
+{
+ struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
+
+ if (!snapshot->ongoing)
+ return;
+ cond_synchronize_rcu(snapshot->rcu);
+ if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu))
+ synchronize_srcu(&tracepoint_srcu);
+ snapshot->ongoing = false;
+}
+
/* Set to 1 to enable tracepoint debug output */
static const int tracepoint_debug;
@@ -332,6 +370,11 @@ static int tracepoint_add_func(struct tracepoint *tp,
*/
switch (nr_func_state(tp_funcs)) {
case TP_FUNC_1: /* 0->1 */
+ /*
+ * Make sure new static func never uses old data after a
+ * 1->0->1 transition sequence.
+ */
+ tp_rcu_cond_sync(TP_TRANSITION_SYNC_1_0_1);
/* Set static call to first function */
tracepoint_update_call(tp, tp_funcs);
/* Both iterator and static call handle NULL tp->funcs */
@@ -346,10 +389,15 @@ static int tracepoint_add_func(struct tracepoint *tp,
* Requires ordering between RCU assign/dereference and
* static call update/call.
*/
- rcu_assign_pointer(tp->funcs, tp_funcs);
- break;
+ fallthrough;
case TP_FUNC_N: /* N->N+1 (N>1) */
rcu_assign_pointer(tp->funcs, tp_funcs);
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>1) transition sequence.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
break;
default:
WARN_ON_ONCE(1);
@@ -393,24 +441,23 @@ static int tracepoint_remove_func(struct tracepoint *tp,
/* Both iterator and static call handle NULL tp->funcs */
rcu_assign_pointer(tp->funcs, NULL);
/*
- * Make sure new func never uses old data after a 1->0->1
- * transition sequence.
- * Considering that transition 0->1 is the common case
- * and don't have rcu-sync, issue rcu-sync after
- * transition 1->0 to break that sequence by waiting for
- * readers to be quiescent.
+ * Make sure new static func never uses old data after a
+ * 1->0->1 transition sequence.
*/
- tracepoint_synchronize_unregister();
+ tp_rcu_get_state(TP_TRANSITION_SYNC_1_0_1);
break;
case TP_FUNC_1: /* 2->1 */
rcu_assign_pointer(tp->funcs, tp_funcs);
/*
- * On 2->1 transition, RCU sync is needed before setting
- * static call to first callback, because the observer
- * may have loaded any prior tp->funcs after the last one
- * associated with an rcu-sync.
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>2) transition sequence. If the first
+ * element's data has changed, then force the synchronization
+ * to prevent current readers that have loaded the old data
+ * from calling the new function.
*/
- tracepoint_synchronize_unregister();
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ tp_rcu_cond_sync(TP_TRANSITION_SYNC_N_2_1);
/* Set static call to first function */
tracepoint_update_call(tp, tp_funcs);
break;
@@ -418,6 +465,12 @@ static int tracepoint_remove_func(struct tracepoint *tp,
fallthrough;
case TP_FUNC_N:
rcu_assign_pointer(tp->funcs, tp_funcs);
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>2) transition sequence.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
break;
default:
WARN_ON_ONCE(1);
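
The tp_rcu_get_state()/tp_rcu_cond_sync() pair above is an instance of the polled grace-period pattern: take a cheap snapshot where a full synchronize call used to sit, and only block later if that grace period has not yet elapsed. A minimal, self-contained sketch of the same pattern using only the classic-RCU half (struct and function names here are hypothetical):

struct gp_snapshot {
	unsigned long cookie;	/* from get_state_synchronize_rcu() */
	bool armed;
};

/* Cheap: record the current grace-period state instead of waiting. */
static void gp_snapshot_arm(struct gp_snapshot *snap)
{
	snap->cookie = get_state_synchronize_rcu();
	snap->armed = true;
}

/* Later, on the path that must not see stale data: wait only if needed. */
static void gp_snapshot_sync(struct gp_snapshot *snap)
{
	if (!snap->armed)
		return;
	cond_synchronize_rcu(snap->cookie);	/* no-op if the GP already ended */
	snap->armed = false;
}

The SRCU side of the patch follows the same shape with start_poll_synchronize_srcu() and poll_state_synchronize_srcu(), falling back to synchronize_srcu() only when the polled state shows the grace period is still in flight.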
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 9894795043c4..11b1596e2542 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -8,12 +8,6 @@
#include <linux/kmemleak.h>
#include <linux/user_namespace.h>
-struct ucounts init_ucounts = {
- .ns = &init_user_ns,
- .uid = GLOBAL_ROOT_UID,
- .count = 1,
-};
-
#define UCOUNTS_HASHTABLE_BITS 10
static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
static DEFINE_SPINLOCK(ucounts_lock);
@@ -131,15 +125,7 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc
return NULL;
}
-static void hlist_add_ucounts(struct ucounts *ucounts)
-{
- struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
- spin_lock_irq(&ucounts_lock);
- hlist_add_head(&ucounts->node, hashent);
- spin_unlock_irq(&ucounts_lock);
-}
-
-struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
+static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
{
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new;
@@ -174,26 +160,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
return ucounts;
}
-struct ucounts *get_ucounts(struct ucounts *ucounts)
-{
- unsigned long flags;
-
- if (!ucounts)
- return NULL;
-
- spin_lock_irqsave(&ucounts_lock, flags);
- if (ucounts->count == INT_MAX) {
- WARN_ONCE(1, "ucounts: counter has reached its maximum value");
- ucounts = NULL;
- } else {
- ucounts->count += 1;
- }
- spin_unlock_irqrestore(&ucounts_lock, flags);
-
- return ucounts;
-}
-
-void put_ucounts(struct ucounts *ucounts)
+static void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;
@@ -227,7 +194,7 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
{
struct ucounts *ucounts, *iter, *bad;
struct user_namespace *tns;
- ucounts = alloc_ucounts(ns, uid);
+ ucounts = get_ucounts(ns, uid);
for (iter = ucounts; iter; iter = tns->ucounts) {
int max;
tns = iter->ns;
@@ -270,7 +237,6 @@ static __init int user_namespace_sysctl_init(void)
BUG_ON(!user_header);
BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif
- hlist_add_ucounts(&init_ucounts);
return 0;
}
subsys_initcall(user_namespace_sysctl_init);
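
The restored get_ucounts() is the usual "allocate outside the lock, re-check under the lock" lookup for a spinlock-protected hash table. A trimmed-down sketch of that shape, assuming the find_ucounts() helper and ucounts_lock from this file (reference-count management omitted, function name hypothetical):

static struct ucounts *lookup_or_insert_sketch(struct user_namespace *ns,
					       kuid_t uid,
					       struct hlist_head *hashent)
{
	struct ucounts *ucounts, *new;

	spin_lock_irq(&ucounts_lock);
	ucounts = find_ucounts(ns, uid, hashent);
	spin_unlock_irq(&ucounts_lock);
	if (ucounts)
		return ucounts;

	/* Allocate without holding the lock ... */
	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;
	new->ns = ns;
	new->uid = uid;

	/* ... then re-check in case another task inserted meanwhile. */
	spin_lock_irq(&ucounts_lock);
	ucounts = find_ucounts(ns, uid, hashent);
	if (ucounts) {
		kfree(new);		/* lost the race, drop the spare */
	} else {
		hlist_add_head(&new->node, hashent);
		ucounts = new;
	}
	spin_unlock_irq(&ucounts_lock);
	return ucounts;
}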
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 8206a13c81eb..ce396ea4de60 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1340,9 +1340,6 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns)
put_user_ns(cred->user_ns);
set_cred_user_ns(cred, get_user_ns(user_ns));
- if (set_cred_ucounts(cred) < 0)
- return -EINVAL;
-
return 0;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 51d19fc71e61..4cb622b2661b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5893,6 +5893,13 @@ static void __init wq_numa_init(void)
return;
}
+ for_each_possible_cpu(cpu) {
+ if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) {
+ pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
+ return;
+ }
+ }
+
wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
BUG_ON(!wq_update_unbound_numa_attrs_buf);
@@ -5910,11 +5917,6 @@ static void __init wq_numa_init(void)
for_each_possible_cpu(cpu) {
node = cpu_to_node(cpu);
- if (WARN_ON(node == NUMA_NO_NODE)) {
- pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
- /* happens iff arch is bonkers, let's just proceed */
- return;
- }
cpumask_set_cpu(cpu, tbl[node]);
}