summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/arraymap.c20
-rw-r--r--kernel/bpf/core.c14
-rw-r--r--kernel/bpf/hashtab.c24
-rw-r--r--kernel/bpf/stackmap.c20
-rw-r--r--kernel/bpf/syscall.c34
-rw-r--r--kernel/bpf/verifier.c2
-rw-r--r--kernel/capability.c1
-rw-r--r--kernel/cgroup.c13
-rw-r--r--kernel/cpu.c24
-rw-r--r--kernel/events/core.c244
-rw-r--r--kernel/irq/irqdomain.c44
-rw-r--r--kernel/jump_label.c7
-rw-r--r--kernel/memremap.c4
-rw-r--r--kernel/module.c55
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/pid_namespace.c10
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/rcu/rcu.h1
-rw-r--r--kernel/rcu/tiny.c4
-rw-r--r--kernel/rcu/tiny_plugin.h9
-rw-r--r--kernel/rcu/tree.c33
-rw-r--r--kernel/rcu/tree_exp.h52
-rw-r--r--kernel/rcu/tree_plugin.h2
-rw-r--r--kernel/rcu/update.c38
-rw-r--r--kernel/signal.c4
-rw-r--r--kernel/sysctl.c1
-rw-r--r--kernel/time/tick-sched.c9
-rw-r--r--kernel/time/tick-sched.h2
-rw-r--r--kernel/trace/trace_hwlat.c8
-rw-r--r--kernel/trace/trace_kprobe.c2
-rw-r--r--kernel/ucount.c14
-rw-r--r--kernel/watchdog.c9
-rw-r--r--kernel/watchdog_hld.c3
33 files changed, 483 insertions, 232 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index a2ac051c342f..3d55d95dcf49 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,7 +11,6 @@
*/
#include <linux/bpf.h>
#include <linux/err.h>
-#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
@@ -56,7 +55,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
attr->value_size == 0 || attr->map_flags)
return ERR_PTR(-EINVAL);
- if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
+ if (attr->value_size > KMALLOC_MAX_SIZE)
/* if value_size is bigger, the user space won't be able to
* access the elements.
*/
@@ -74,14 +73,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
if (array_size >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-ENOMEM);
-
/* allocate all map elements and zero-initialize them */
- array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
- if (!array) {
- array = vzalloc(array_size);
- if (!array)
- return ERR_PTR(-ENOMEM);
- }
+ array = bpf_map_area_alloc(array_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
/* copy mandatory map attributes */
array->map.map_type = attr->map_type;
@@ -97,7 +92,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
if (array_size >= U32_MAX - PAGE_SIZE ||
elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
- kvfree(array);
+ bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
out:
@@ -262,7 +257,7 @@ static void array_map_free(struct bpf_map *map)
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
- kvfree(array);
+ bpf_map_area_free(array);
}
static const struct bpf_map_ops array_ops = {
@@ -319,7 +314,8 @@ static void fd_array_map_free(struct bpf_map *map)
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
BUG_ON(array->ptrs[i] != NULL);
- kvfree(array);
+
+ bpf_map_area_free(array);
}
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1eb4f1303756..503d4211988a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -146,10 +146,11 @@ void __bpf_prog_free(struct bpf_prog *fp)
vfree(fp);
}
-int bpf_prog_calc_digest(struct bpf_prog *fp)
+int bpf_prog_calc_tag(struct bpf_prog *fp)
{
const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64);
- u32 raw_size = bpf_prog_digest_scratch_size(fp);
+ u32 raw_size = bpf_prog_tag_scratch_size(fp);
+ u32 digest[SHA_DIGEST_WORDS];
u32 ws[SHA_WORKSPACE_WORDS];
u32 i, bsize, psize, blocks;
struct bpf_insn *dst;
@@ -162,7 +163,7 @@ int bpf_prog_calc_digest(struct bpf_prog *fp)
if (!raw)
return -ENOMEM;
- sha_init(fp->digest);
+ sha_init(digest);
memset(ws, 0, sizeof(ws));
/* We need to take out the map fd for the digest calculation
@@ -204,13 +205,14 @@ int bpf_prog_calc_digest(struct bpf_prog *fp)
*bits = cpu_to_be64((psize - 1) << 3);
while (blocks--) {
- sha_transform(fp->digest, todo, ws);
+ sha_transform(digest, todo, ws);
todo += SHA_MESSAGE_BYTES;
}
- result = (__force __be32 *)fp->digest;
+ result = (__force __be32 *)digest;
for (i = 0; i < SHA_DIGEST_WORDS; i++)
- result[i] = cpu_to_be32(fp->digest[i]);
+ result[i] = cpu_to_be32(digest[i]);
+ memcpy(fp->tag, result, sizeof(fp->tag));
vfree(raw);
return 0;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 34debc1a9641..a753bbe7df0a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,7 +13,6 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
-#include <linux/vmalloc.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
@@ -103,7 +102,7 @@ static void htab_free_elems(struct bpf_htab *htab)
free_percpu(pptr);
}
free_elems:
- vfree(htab->elems);
+ bpf_map_area_free(htab->elems);
}
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
@@ -125,7 +124,8 @@ static int prealloc_init(struct bpf_htab *htab)
{
int err = -ENOMEM, i;
- htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
+ htab->elems = bpf_map_area_alloc(htab->elem_size *
+ htab->map.max_entries);
if (!htab->elems)
return -ENOMEM;
@@ -274,7 +274,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
goto free_htab;
- if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
+ if (htab->map.value_size >= KMALLOC_MAX_SIZE -
MAX_BPF_STACK - sizeof(struct htab_elem))
/* if value_size is bigger, the user space won't be able to
* access the elements via bpf syscall. This check also makes
@@ -320,14 +320,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
goto free_htab;
err = -ENOMEM;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
- GFP_USER | __GFP_NOWARN);
-
- if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
- if (!htab->buckets)
- goto free_htab;
- }
+ htab->buckets = bpf_map_area_alloc(htab->n_buckets *
+ sizeof(struct bucket));
+ if (!htab->buckets)
+ goto free_htab;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_HEAD(&htab->buckets[i].head);
@@ -354,7 +350,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
free_extra_elems:
free_percpu(htab->extra_elems);
free_buckets:
- kvfree(htab->buckets);
+ bpf_map_area_free(htab->buckets);
free_htab:
kfree(htab);
return ERR_PTR(err);
@@ -1014,7 +1010,7 @@ static void htab_map_free(struct bpf_map *map)
prealloc_destroy(htab);
free_percpu(htab->extra_elems);
- kvfree(htab->buckets);
+ bpf_map_area_free(htab->buckets);
kfree(htab);
}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 732ae16d12b7..be8519148c25 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,7 +7,6 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
-#include <linux/vmalloc.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include "percpu_freelist.h"
@@ -32,7 +31,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
int err;
- smap->elems = vzalloc(elem_size * smap->map.max_entries);
+ smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
if (!smap->elems)
return -ENOMEM;
@@ -45,7 +44,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
return 0;
free_elems:
- vfree(smap->elems);
+ bpf_map_area_free(smap->elems);
return err;
}
@@ -76,12 +75,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-E2BIG);
- smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
- if (!smap) {
- smap = vzalloc(cost);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- }
+ smap = bpf_map_area_alloc(cost);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
err = -E2BIG;
cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
@@ -112,7 +108,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
put_buffers:
put_callchain_buffers();
free_smap:
- kvfree(smap);
+ bpf_map_area_free(smap);
return ERR_PTR(err);
}
@@ -262,9 +258,9 @@ static void stack_map_free(struct bpf_map *map)
/* wait for bpf programs to complete before freeing stack map */
synchronize_rcu();
- vfree(smap->elems);
+ bpf_map_area_free(smap->elems);
pcpu_freelist_destroy(&smap->freelist);
- kvfree(smap);
+ bpf_map_area_free(smap);
put_callchain_buffers();
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e89acea22ecf..19b6129eab23 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -12,6 +12,8 @@
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
@@ -49,6 +51,30 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types);
}
+void *bpf_map_area_alloc(size_t size)
+{
+ /* We definitely need __GFP_NORETRY, so OOM killer doesn't
+ * trigger under memory pressure as we really just want to
+ * fail instead.
+ */
+ const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+ void *area;
+
+ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ area = kmalloc(size, GFP_USER | flags);
+ if (area != NULL)
+ return area;
+ }
+
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags,
+ PAGE_KERNEL);
+}
+
+void bpf_map_area_free(void *area)
+{
+ kvfree(area);
+}
+
int bpf_map_precharge_memlock(u32 pages)
{
struct user_struct *user = get_current_user();
@@ -688,17 +714,17 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
- char prog_digest[sizeof(prog->digest) * 2 + 1] = { };
+ char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_digest, prog->digest, sizeof(prog->digest));
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"prog_type:\t%u\n"
"prog_jited:\t%u\n"
- "prog_digest:\t%s\n"
+ "prog_tag:\t%s\n"
"memlock:\t%llu\n",
prog->type,
prog->jited,
- prog_digest,
+ prog_tag,
prog->pages * 1ULL << PAGE_SHIFT);
}
#endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 83ed2f8f6f22..cdc43b899f28 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2936,7 +2936,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
int insn_cnt = env->prog->len;
int i, j, err;
- err = bpf_prog_calc_digest(env->prog);
+ err = bpf_prog_calc_tag(env->prog);
if (err)
return err;
diff --git a/kernel/capability.c b/kernel/capability.c
index a98e814f216f..f97fe77ceb88 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -318,6 +318,7 @@ bool has_capability(struct task_struct *t, int cap)
{
return has_ns_capability(t, &init_user_ns, cap);
}
+EXPORT_SYMBOL(has_capability);
/**
* has_ns_capability_noaudit - Does a task have a capability (unaudited)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2ee9ec3051b2..688dd02af985 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5221,6 +5221,11 @@ err_free_css:
return ERR_PTR(err);
}
+/*
+ * The returned cgroup is fully initialized including its control mask, but
+ * it isn't associated with its kernfs_node and doesn't have the control
+ * mask applied.
+ */
static struct cgroup *cgroup_create(struct cgroup *parent)
{
struct cgroup_root *root = parent->root;
@@ -5288,11 +5293,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgroup_propagate_control(cgrp);
- /* @cgrp doesn't have dir yet so the following will only create csses */
- ret = cgroup_apply_control_enable(cgrp);
- if (ret)
- goto out_destroy;
-
return cgrp;
out_cancel_ref:
@@ -5300,9 +5300,6 @@ out_cancel_ref:
out_free_cgrp:
kfree(cgrp);
return ERR_PTR(ret);
-out_destroy:
- cgroup_destroy_locked(cgrp);
- return ERR_PTR(ret);
}
static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f75c4d031eeb..0a5f630f5c54 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -764,7 +764,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int prev_state, ret = 0;
- bool hasdied = false;
if (num_online_cpus() == 1)
return -EBUSY;
@@ -809,7 +808,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_kick_ap_work(cpu);
}
- hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
out:
cpu_hotplug_done();
return ret;
@@ -1302,10 +1300,24 @@ static int cpuhp_cb_check(enum cpuhp_state state)
*/
static int cpuhp_reserve_state(enum cpuhp_state state)
{
- enum cpuhp_state i;
+ enum cpuhp_state i, end;
+ struct cpuhp_step *step;
- for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) {
- if (!cpuhp_ap_states[i].name)
+ switch (state) {
+ case CPUHP_AP_ONLINE_DYN:
+ step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN;
+ end = CPUHP_AP_ONLINE_DYN_END;
+ break;
+ case CPUHP_BP_PREPARE_DYN:
+ step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN;
+ end = CPUHP_BP_PREPARE_DYN_END;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ for (i = state; i <= end; i++, step++) {
+ if (!step->name)
return i;
}
WARN(1, "No more dynamic states available for CPU hotplug\n");
@@ -1323,7 +1335,7 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
mutex_lock(&cpuhp_state_mutex);
- if (state == CPUHP_AP_ONLINE_DYN) {
+ if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) {
ret = cpuhp_reserve_state(state);
if (ret < 0)
goto out;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ab15509fab8c..e5aaa806702d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1469,7 +1469,6 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
-
lockdep_assert_held(&ctx->lock);
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1624,6 +1623,8 @@ static void perf_group_attach(struct perf_event *event)
{
struct perf_event *group_leader = event->group_leader, *pos;
+ lockdep_assert_held(&event->ctx->lock);
+
/*
* We can have double attach due to group movement in perf_event_open.
*/
@@ -1697,6 +1698,8 @@ static void perf_group_detach(struct perf_event *event)
struct perf_event *sibling, *tmp;
struct list_head *list = NULL;
+ lockdep_assert_held(&event->ctx->lock);
+
/*
* We can have double detach due to exit/hot-unplug + close.
*/
@@ -1895,9 +1898,29 @@ __perf_remove_from_context(struct perf_event *event,
*/
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
- lockdep_assert_held(&event->ctx->mutex);
+ struct perf_event_context *ctx = event->ctx;
+
+ lockdep_assert_held(&ctx->mutex);
event_function_call(event, __perf_remove_from_context, (void *)flags);
+
+ /*
+ * The above event_function_call() can NO-OP when it hits
+ * TASK_TOMBSTONE. In that case we must already have been detached
+ * from the context (by perf_event_exit_event()) but the grouping
+ * might still be in-tact.
+ */
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
+ if ((flags & DETACH_GROUP) &&
+ (event->attach_state & PERF_ATTACH_GROUP)) {
+ /*
+ * Since in that case we cannot possibly be scheduled, simply
+ * detach now.
+ */
+ raw_spin_lock_irq(&ctx->lock);
+ perf_group_detach(event);
+ raw_spin_unlock_irq(&ctx->lock);
+ }
}
/*
@@ -2249,7 +2272,7 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- bool activate = true;
+ bool reprogram = true;
int ret = 0;
raw_spin_lock(&cpuctx->ctx.lock);
@@ -2257,27 +2280,26 @@ static int __perf_install_in_context(void *info)
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
- /* If we're on the wrong CPU, try again */
- if (task_cpu(ctx->task) != smp_processor_id()) {
- ret = -ESRCH;
- goto unlock;
- }
+ reprogram = (ctx->task == current);
/*
- * If we're on the right CPU, see if the task we target is
- * current, if not we don't have to activate the ctx, a future
- * context switch will do that for us.
+ * If the task is running, it must be running on this CPU,
+ * otherwise we cannot reprogram things.
+ *
+ * If its not running, we don't care, ctx->lock will
+ * serialize against it becoming runnable.
*/
- if (ctx->task != current)
- activate = false;
- else
- WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+ if (task_curr(ctx->task) && !reprogram) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+ WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
} else if (task_ctx) {
raw_spin_lock(&task_ctx->lock);
}
- if (activate) {
+ if (reprogram) {
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx);
@@ -2328,13 +2350,36 @@ perf_install_in_context(struct perf_event_context *ctx,
/*
* Installing events is tricky because we cannot rely on ctx->is_active
* to be set in case this is the nr_events 0 -> 1 transition.
+ *
+ * Instead we use task_curr(), which tells us if the task is running.
+ * However, since we use task_curr() outside of rq::lock, we can race
+ * against the actual state. This means the result can be wrong.
+ *
+ * If we get a false positive, we retry, this is harmless.
+ *
+ * If we get a false negative, things are complicated. If we are after
+ * perf_event_context_sched_in() ctx::lock will serialize us, and the
+ * value must be correct. If we're before, it doesn't matter since
+ * perf_event_context_sched_in() will program the counter.
+ *
+ * However, this hinges on the remote context switch having observed
+ * our task->perf_event_ctxp[] store, such that it will in fact take
+ * ctx::lock in perf_event_context_sched_in().
+ *
+ * We do this by task_function_call(), if the IPI fails to hit the task
+ * we know any future context switch of task must see the
+ * perf_event_ctpx[] store.
*/
-again:
+
/*
- * Cannot use task_function_call() because we need to run on the task's
- * CPU regardless of whether its current or not.
+ * This smp_mb() orders the task->perf_event_ctxp[] store with the
+ * task_cpu() load, such that if the IPI then does not find the task
+ * running, a future context switch of that task must observe the
+ * store.
*/
- if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+ smp_mb();
+again:
+ if (!task_function_call(task, __perf_install_in_context, event))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -2348,12 +2393,16 @@ again:
raw_spin_unlock_irq(&ctx->lock);
return;
}
- raw_spin_unlock_irq(&ctx->lock);
/*
- * Since !ctx->is_active doesn't mean anything, we must IPI
- * unconditionally.
+ * If the task is not running, ctx->lock will avoid it becoming so,
+ * thus we can safely install the event.
*/
- goto again;
+ if (task_curr(task)) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto again;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
}
/*
@@ -6583,6 +6632,27 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
char *buf = NULL;
char *name;
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
if (file) {
struct inode *inode;
dev_t dev;
@@ -6609,27 +6679,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
maj = MAJOR(dev);
min = MINOR(dev);
- if (vma->vm_flags & VM_READ)
- prot |= PROT_READ;
- if (vma->vm_flags & VM_WRITE)
- prot |= PROT_WRITE;
- if (vma->vm_flags & VM_EXEC)
- prot |= PROT_EXEC;
-
- if (vma->vm_flags & VM_MAYSHARE)
- flags = MAP_SHARED;
- else
- flags = MAP_PRIVATE;
-
- if (vma->vm_flags & VM_DENYWRITE)
- flags |= MAP_DENYWRITE;
- if (vma->vm_flags & VM_MAYEXEC)
- flags |= MAP_EXECUTABLE;
- if (vma->vm_flags & VM_LOCKED)
- flags |= MAP_LOCKED;
- if (vma->vm_flags & VM_HUGETLB)
- flags |= MAP_HUGETLB;
-
goto got_name;
} else {
if (vma->vm_ops && vma->vm_ops->name) {
@@ -7034,25 +7083,12 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
-/*
- * Generic event overflow handling, sampling.
- */
-
-static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+static int
+__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
- int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
- u64 seq;
int ret = 0;
-
- /*
- * Non-sampling counters might still use the PMI to fold short
- * hardware counters, ignore those.
- */
- if (unlikely(!is_sampling_event(event)))
- return 0;
+ u64 seq;
seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
@@ -7080,6 +7116,34 @@ static int __perf_event_overflow(struct perf_event *event,
perf_adjust_period(event, delta, hwc->last_period, true);
}
+ return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+ return __perf_event_account_interrupt(event, 1);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int events = atomic_read(&event->event_limit);
+ int ret = 0;
+
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ ret = __perf_event_account_interrupt(event, throttle);
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
@@ -9503,6 +9567,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
+/*
+ * Variation on perf_event_ctx_lock_nested(), except we take two context
+ * mutexes.
+ */
+static struct perf_event_context *
+__perf_event_ctx_lock_double(struct perf_event *group_leader,
+ struct perf_event_context *ctx)
+{
+ struct perf_event_context *gctx;
+
+again:
+ rcu_read_lock();
+ gctx = READ_ONCE(group_leader->ctx);
+ if (!atomic_inc_not_zero(&gctx->refcount)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+ if (group_leader->ctx != gctx) {
+ mutex_unlock(&ctx->mutex);
+ mutex_unlock(&gctx->mutex);
+ put_ctx(gctx);
+ goto again;
+ }
+
+ return gctx;
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -9746,12 +9841,31 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (move_group) {
- gctx = group_leader->ctx;
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ gctx = __perf_event_ctx_lock_double(group_leader, ctx);
+
if (gctx->task == TASK_TOMBSTONE) {
err = -ESRCH;
goto err_locked;
}
+
+ /*
+ * Check if we raced against another sys_perf_event_open() call
+ * moving the software group underneath us.
+ */
+ if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * If someone moved the group out from under us, check
+ * if this new event wound up on the same ctx, if so
+ * its the regular !move_group case, otherwise fail.
+ */
+ if (gctx != ctx) {
+ err = -EINVAL;
+ goto err_locked;
+ } else {
+ perf_event_ctx_unlock(group_leader, gctx);
+ move_group = 0;
+ }
+ }
} else {
mutex_lock(&ctx->mutex);
}
@@ -9853,7 +9967,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_unpin_context(ctx);
if (move_group)
- mutex_unlock(&gctx->mutex);
+ perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
@@ -9879,7 +9993,7 @@ SYSCALL_DEFINE5(perf_event_open,
err_locked:
if (move_group)
- mutex_unlock(&gctx->mutex);
+ perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
/* err_file: */
fput(event_file);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8c0a0ae43521..b59e6768c5e9 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1346,6 +1346,30 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
+static void __irq_domain_activate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (irq_data->parent_data)
+ __irq_domain_activate_irq(irq_data->parent_data);
+ if (domain->ops->activate)
+ domain->ops->activate(domain, irq_data);
+ }
+}
+
+static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (domain->ops->deactivate)
+ domain->ops->deactivate(domain, irq_data);
+ if (irq_data->parent_data)
+ __irq_domain_deactivate_irq(irq_data->parent_data);
+ }
+}
+
/**
* irq_domain_activate_irq - Call domain_ops->activate recursively to activate
* interrupt
@@ -1356,13 +1380,9 @@ EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
*/
void irq_domain_activate_irq(struct irq_data *irq_data)
{
- if (irq_data && irq_data->domain) {
- struct irq_domain *domain = irq_data->domain;
-
- if (irq_data->parent_data)
- irq_domain_activate_irq(irq_data->parent_data);
- if (domain->ops->activate)
- domain->ops->activate(domain, irq_data);
+ if (!irqd_is_activated(irq_data)) {
+ __irq_domain_activate_irq(irq_data);
+ irqd_set_activated(irq_data);
}
}
@@ -1376,13 +1396,9 @@ void irq_domain_activate_irq(struct irq_data *irq_data)
*/
void irq_domain_deactivate_irq(struct irq_data *irq_data)
{
- if (irq_data && irq_data->domain) {
- struct irq_domain *domain = irq_data->domain;
-
- if (domain->ops->deactivate)
- domain->ops->deactivate(domain, irq_data);
- if (irq_data->parent_data)
- irq_domain_deactivate_irq(irq_data->parent_data);
+ if (irqd_is_activated(irq_data)) {
+ __irq_domain_deactivate_irq(irq_data);
+ irqd_clr_activated(irq_data);
}
}
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 93ad6c1fb9b6..a9b8cf500591 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -182,6 +182,13 @@ void static_key_slow_dec_deferred(struct static_key_deferred *key)
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
+void static_key_deferred_flush(struct static_key_deferred *key)
+{
+ STATIC_KEY_CHECK_USE();
+ flush_delayed_work(&key->work);
+}
+EXPORT_SYMBOL_GPL(static_key_deferred_flush);
+
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
diff --git a/kernel/memremap.c b/kernel/memremap.c
index b501e390bb34..9ecedc28b928 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -246,7 +246,9 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ mem_hotplug_begin();
arch_remove_memory(align_start, align_size);
+ mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
pgmap_radix_release(res);
dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
@@ -358,7 +360,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (error)
goto err_pfn_remap;
+ mem_hotplug_begin();
error = arch_add_memory(nid, align_start, align_size, true);
+ mem_hotplug_done();
if (error)
goto err_add_memory;
diff --git a/kernel/module.c b/kernel/module.c
index 5088784c0cf9..3d8f126208e3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -389,16 +389,16 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const struct kernel_symbol __start___ksymtab_gpl_future[];
extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
-extern const unsigned long __start___kcrctab[];
-extern const unsigned long __start___kcrctab_gpl[];
-extern const unsigned long __start___kcrctab_gpl_future[];
+extern const s32 __start___kcrctab[];
+extern const s32 __start___kcrctab_gpl[];
+extern const s32 __start___kcrctab_gpl_future[];
#ifdef CONFIG_UNUSED_SYMBOLS
extern const struct kernel_symbol __start___ksymtab_unused[];
extern const struct kernel_symbol __stop___ksymtab_unused[];
extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
-extern const unsigned long __start___kcrctab_unused[];
-extern const unsigned long __start___kcrctab_unused_gpl[];
+extern const s32 __start___kcrctab_unused[];
+extern const s32 __start___kcrctab_unused_gpl[];
#endif
#ifndef CONFIG_MODVERSIONS
@@ -497,7 +497,7 @@ struct find_symbol_arg {
/* Output */
struct module *owner;
- const unsigned long *crc;
+ const s32 *crc;
const struct kernel_symbol *sym;
};
@@ -563,7 +563,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
* (optional) module which owns it. Needs preempt disabled or module_mutex. */
const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
- const unsigned long **crc,
+ const s32 **crc,
bool gplok,
bool warn)
{
@@ -1145,7 +1145,7 @@ static size_t module_flags_taint(struct module *mod, char *buf)
for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
if (taint_flags[i].module && test_bit(i, &mod->taints))
- buf[l++] = taint_flags[i].true;
+ buf[l++] = taint_flags[i].c_true;
}
return l;
@@ -1249,23 +1249,17 @@ static int try_to_force_load(struct module *mod, const char *reason)
}
#ifdef CONFIG_MODVERSIONS
-/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */
-static unsigned long maybe_relocated(unsigned long crc,
- const struct module *crc_owner)
+
+static u32 resolve_rel_crc(const s32 *crc)
{
-#ifdef ARCH_RELOCATES_KCRCTAB
- if (crc_owner == NULL)
- return crc - (unsigned long)reloc_start;
-#endif
- return crc;
+ return *(u32 *)((void *)crc + *crc);
}
static int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
struct module *mod,
- const unsigned long *crc,
- const struct module *crc_owner)
+ const s32 *crc)
{
unsigned int i, num_versions;
struct modversion_info *versions;
@@ -1283,13 +1277,19 @@ static int check_version(Elf_Shdr *sechdrs,
/ sizeof(struct modversion_info);
for (i = 0; i < num_versions; i++) {
+ u32 crcval;
+
if (strcmp(versions[i].name, symname) != 0)
continue;
- if (versions[i].crc == maybe_relocated(*crc, crc_owner))
+ if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
+ crcval = resolve_rel_crc(crc);
+ else
+ crcval = *crc;
+ if (versions[i].crc == crcval)
return 1;
- pr_debug("Found checksum %lX vs module %lX\n",
- maybe_relocated(*crc, crc_owner), versions[i].crc);
+ pr_debug("Found checksum %X vs module %lX\n",
+ crcval, versions[i].crc);
goto bad_version;
}
@@ -1307,7 +1307,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
unsigned int versindex,
struct module *mod)
{
- const unsigned long *crc;
+ const s32 *crc;
/*
* Since this should be found in kernel (which can't be removed), no
@@ -1321,8 +1321,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
}
preempt_enable();
return check_version(sechdrs, versindex,
- VMLINUX_SYMBOL_STR(module_layout), mod, crc,
- NULL);
+ VMLINUX_SYMBOL_STR(module_layout), mod, crc);
}
/* First part is kernel version, which we ignore if module has crcs. */
@@ -1340,8 +1339,7 @@ static inline int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
struct module *mod,
- const unsigned long *crc,
- const struct module *crc_owner)
+ const s32 *crc)
{
return 1;
}
@@ -1368,7 +1366,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
{
struct module *owner;
const struct kernel_symbol *sym;
- const unsigned long *crc;
+ const s32 *crc;
int err;
/*
@@ -1383,8 +1381,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
if (!sym)
goto unlock;
- if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
- owner)) {
+ if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) {
sym = ERR_PTR(-EINVAL);
goto getname;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index c51edaa04fce..08aa88dde7de 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -249,7 +249,7 @@ void panic(const char *fmt, ...)
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
- pr_emerg("Rebooting in %d seconds..", panic_timeout);
+ pr_emerg("Rebooting in %d seconds..\n", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
@@ -355,7 +355,7 @@ const char *print_tainted(void)
for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
const struct taint_flag *t = &taint_flags[i];
*s++ = test_bit(i, &tainted_mask) ?
- t->true : t->false;
+ t->c_true : t->c_false;
}
*s = 0;
} else
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index df9e8e9e0be7..eef2ce968636 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -151,8 +151,12 @@ out:
static void delayed_free_pidns(struct rcu_head *p)
{
- kmem_cache_free(pid_ns_cachep,
- container_of(p, struct pid_namespace, rcu));
+ struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);
+
+ dec_pid_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+
+ kmem_cache_free(pid_ns_cachep, ns);
}
static void destroy_pid_namespace(struct pid_namespace *ns)
@@ -162,8 +166,6 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
ns_free_inum(&ns->ns);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
- dec_pid_namespaces(ns->ucounts);
- put_user_ns(ns->user_ns);
call_rcu(&ns->rcu, delayed_free_pidns);
}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f67ceb7768b8..15e6baef5c73 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -46,7 +46,7 @@ static const char * const mem_sleep_labels[] = {
const char *mem_sleep_states[PM_SUSPEND_MAX];
suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE;
-suspend_state_t mem_sleep_default = PM_SUSPEND_MAX;
+static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM;
unsigned int pm_suspend_global_flags;
EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
@@ -168,7 +168,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
}
if (valid_state(PM_SUSPEND_MEM)) {
mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
- if (mem_sleep_default >= PM_SUSPEND_MEM)
+ if (mem_sleep_default == PM_SUSPEND_MEM)
mem_sleep_current = PM_SUSPEND_MEM;
}
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 80adef7d4c3d..0d6ff3e471be 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void);
#define TPS(x) tracepoint_string(x)
void rcu_early_boot_tests(void);
+void rcu_test_sync_prims(void);
/*
* This function really isn't for public consumption, but RCU is special in
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 1898559e6b60..b23a4d076f3d 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -185,9 +185,6 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
* benefits of doing might_sleep() to reduce latency.)
*
* Cool, huh? (Due to Josh Triplett.)
- *
- * But we want to make this a static inline later. The cond_resched()
- * currently makes this problematic.
*/
void synchronize_sched(void)
{
@@ -195,7 +192,6 @@ void synchronize_sched(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_sched() in RCU read-side critical section");
- cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 196f0302e2f4..c64b827ecbca 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
/*
* During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously.
+ * invoked, we start taking RCU lockdep issues seriously. Note that unlike
+ * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
+ * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
+ * The reason for this is that Tiny RCU does not need kthreads, so does
+ * not have to care about the fact that the scheduler is half-initialized
+ * at a certain phase of the boot process.
*/
void __init rcu_scheduler_starting(void)
{
WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
}
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 96c52e43f7ca..cb4e2056ccf3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -127,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
int sysctl_panic_on_rcu_stall __read_mostly;
/*
- * The rcu_scheduler_active variable transitions from zero to one just
- * before the first task is spawned. So when this variable is zero, RCU
- * can assume that there is but one task, allowing RCU to (for example)
+ * The rcu_scheduler_active variable is initialized to the value
+ * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the
+ * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
+ * RCU can assume that there is but one task, allowing RCU to (for example)
* optimize synchronize_rcu() to a simple barrier(). When this variable
- * is one, RCU must actually do all the hard work required to detect real
- * grace periods. This variable is also used to suppress boot-time false
- * positives from lockdep-RCU error checking.
+ * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
+ * to detect real grace periods. This variable is also used to suppress
+ * boot-time false positives from lockdep-RCU error checking. Finally, it
+ * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
+ * is fully initialized, including all of its kthreads having been spawned.
*/
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
@@ -3980,18 +3983,22 @@ static int __init rcu_spawn_gp_kthread(void)
early_initcall(rcu_spawn_gp_kthread);
/*
- * This function is invoked towards the end of the scheduler's initialization
- * process. Before this is called, the idle task might contain
- * RCU read-side critical sections (during which time, this idle
- * task is booting the system). After this function is called, the
- * idle tasks are prohibited from containing RCU read-side critical
- * sections. This function also enables RCU lockdep checking.
+ * This function is invoked towards the end of the scheduler's
+ * initialization process. Before this is called, the idle task might
+ * contain synchronous grace-period primitives (during which time, this idle
+ * task is booting the system, and such primitives are no-ops). After this
+ * function is called, any synchronous grace-period primitives are run as
+ * expedited, with the requesting task driving the grace period forward.
+ * A later core_initcall() rcu_exp_runtime_mode() will switch to full
+ * runtime RCU functionality.
*/
void rcu_scheduler_starting(void)
{
WARN_ON(num_online_cpus() != 1);
WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
+ rcu_test_sync_prims();
+ rcu_scheduler_active = RCU_SCHEDULER_INIT;
+ rcu_test_sync_prims();
}
/*
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index d3053e99fdb6..e59e1849b89a 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -532,18 +532,28 @@ struct rcu_exp_work {
};
/*
+ * Common code to drive an expedited grace period forward, used by
+ * workqueues and mid-boot-time tasks.
+ */
+static void rcu_exp_sel_wait_wake(struct rcu_state *rsp,
+ smp_call_func_t func, unsigned long s)
+{
+ /* Initialize the rcu_node tree in preparation for the wait. */
+ sync_rcu_exp_select_cpus(rsp, func);
+
+ /* Wait and clean up, including waking everyone. */
+ rcu_exp_wait_wake(rsp, s);
+}
+
+/*
* Work-queue handler to drive an expedited grace period forward.
*/
static void wait_rcu_exp_gp(struct work_struct *wp)
{
struct rcu_exp_work *rewp;
- /* Initialize the rcu_node tree in preparation for the wait. */
rewp = container_of(wp, struct rcu_exp_work, rew_work);
- sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func);
-
- /* Wait and clean up, including waking everyone. */
- rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s);
+ rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s);
}
/*
@@ -569,12 +579,18 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
if (exp_funnel_lock(rsp, s))
return; /* Someone else did our work for us. */
- /* Marshall arguments and schedule the expedited grace period. */
- rew.rew_func = func;
- rew.rew_rsp = rsp;
- rew.rew_s = s;
- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- schedule_work(&rew.rew_work);
+ /* Ensure that load happens before action based on it. */
+ if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+ /* Direct call during scheduler init and early_initcalls(). */
+ rcu_exp_sel_wait_wake(rsp, func, s);
+ } else {
+ /* Marshall arguments & schedule the expedited grace period. */
+ rew.rew_func = func;
+ rew.rew_rsp = rsp;
+ rew.rew_s = s;
+ INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+ schedule_work(&rew.rew_work);
+ }
/* Wait for expedited grace period to complete. */
rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
@@ -676,6 +692,8 @@ void synchronize_rcu_expedited(void)
{
struct rcu_state *rsp = rcu_state_p;
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
_synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
@@ -693,3 +711,15 @@ void synchronize_rcu_expedited(void)
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Switch to run-time mode once Tree RCU has fully initialized.
+ */
+static int __init rcu_exp_runtime_mode(void)
+{
+ rcu_test_sync_prims();
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+ rcu_test_sync_prims();
+ return 0;
+}
+core_initcall(rcu_exp_runtime_mode);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 85c5a883c6e3..56583e764ebf 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -670,7 +670,7 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
- if (!rcu_scheduler_active)
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return;
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f19271dce0a9..4f6db7e6a117 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -121,11 +121,14 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
* Should expedited grace-period primitives always fall back to their
* non-expedited counterparts? Intended for use within RCU. Note
* that if the user specifies both rcu_expedited and rcu_normal, then
- * rcu_normal wins.
+ * rcu_normal wins. (Except during the time period during boot from
+ * when the first task is spawned until the rcu_exp_runtime_mode()
+ * core_initcall() is invoked, at which point everything is expedited.)
*/
bool rcu_gp_is_normal(void)
{
- return READ_ONCE(rcu_normal);
+ return READ_ONCE(rcu_normal) &&
+ rcu_scheduler_active != RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
@@ -135,13 +138,14 @@ static atomic_t rcu_expedited_nesting =
/*
* Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited
- * sysfs/boot variable into account as well as the rcu_expedite_gp()
- * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
- * returns false is a -really- bad idea.
+ * sysfs/boot variable and rcu_scheduler_active into account as well
+ * as the rcu_expedite_gp() nesting. So looping on rcu_unexpedite_gp()
+ * until rcu_gp_is_expedited() returns false is a -really- bad idea.
*/
bool rcu_gp_is_expedited(void)
{
- return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
+ rcu_scheduler_active == RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
@@ -257,7 +261,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
int notrace debug_lockdep_rcu_enabled(void)
{
- return rcu_scheduler_active && debug_locks &&
+ return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -591,7 +595,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
void synchronize_rcu_tasks(void)
{
/* Complain if the scheduler has not started. */
- RCU_LOCKDEP_WARN(!rcu_scheduler_active,
+ RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
"synchronize_rcu_tasks called too soon");
/* Wait for the grace period. */
@@ -813,6 +817,23 @@ static void rcu_spawn_tasks_kthread(void)
#endif /* #ifdef CONFIG_TASKS_RCU */
+/*
+ * Test each non-SRCU synchronous grace-period wait API. This is
+ * useful just after a change in mode for these primitives, and
+ * during early boot.
+ */
+void rcu_test_sync_prims(void)
+{
+ if (!IS_ENABLED(CONFIG_PROVE_RCU))
+ return;
+ synchronize_rcu();
+ synchronize_rcu_bh();
+ synchronize_sched();
+ synchronize_rcu_expedited();
+ synchronize_rcu_bh_expedited();
+ synchronize_sched_expedited();
+}
+
#ifdef CONFIG_PROVE_RCU
/*
@@ -865,6 +886,7 @@ void rcu_early_boot_tests(void)
early_boot_test_call_rcu_bh();
if (rcu_self_test_sched)
early_boot_test_call_rcu_sched();
+ rcu_test_sync_prims();
}
static int rcu_verify_early_boot_tests(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index ff046b73ff2d..3603d93a1968 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -346,7 +346,7 @@ static bool task_participate_group_stop(struct task_struct *task)
* fresh group stop. Read comment in do_signal_stop() for details.
*/
if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
- sig->flags = SIGNAL_STOP_STOPPED;
+ signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED);
return true;
}
return false;
@@ -843,7 +843,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
* will take ->siglock, notice SIGNAL_CLD_MASK, and
* notify its parent. See get_signal_to_deliver().
*/
- signal->flags = why | SIGNAL_STOP_CONTINUED;
+ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
signal->group_stop_count = 0;
signal->group_exit_code = 0;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8dbaec0e4f7f..1aea594a54db 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2475,6 +2475,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
break;
if (neg)
continue;
+ val = convmul * val / convdiv;
if ((min && val < *min) || (max && val > *max))
continue;
*i = val;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2c115fdab397..74e0388cc88d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -767,7 +767,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
tick = expires;
/* Skip reprogram of event if its not changed */
- if (ts->tick_stopped && (expires == dev->next_event))
+ if (ts->tick_stopped && (expires == ts->next_tick))
goto out;
/*
@@ -787,6 +787,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
+ ts->next_tick = tick;
+
/*
* If the expiration time == KTIME_MAX, then we simply stop
* the tick timer.
@@ -802,7 +804,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
else
tick_program_event(tick, 1);
out:
- /* Update the estimated sleep length */
+ /*
+ * Update the estimated sleep length until the next timer
+ * (not only the tick).
+ */
ts->sleep_length = ktime_sub(dev->next_event, now);
return tick;
}
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index bf38226e5c17..075444e3d48e 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -27,6 +27,7 @@ enum tick_nohz_mode {
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
+ * @next_tick: Next tick to be fired when in dynticks mode.
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_calls: Total number of idle calls
@@ -44,6 +45,7 @@ struct tick_sched {
unsigned long check_clocks;
enum tick_nohz_mode nohz_mode;
ktime_t last_tick;
+ ktime_t next_tick;
int inidle;
int tick_stopped;
unsigned long idle_jiffies;
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 775569ec50d0..af344a1bf0d0 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -266,7 +266,7 @@ out:
static struct cpumask save_cpumask;
static bool disable_migrate;
-static void move_to_next_cpu(void)
+static void move_to_next_cpu(bool initmask)
{
static struct cpumask *current_mask;
int next_cpu;
@@ -275,7 +275,7 @@ static void move_to_next_cpu(void)
return;
/* Just pick the first CPU on first iteration */
- if (!current_mask) {
+ if (initmask) {
current_mask = &save_cpumask;
get_online_cpus();
cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
@@ -330,10 +330,12 @@ static void move_to_next_cpu(void)
static int kthread_fn(void *data)
{
u64 interval;
+ bool initmask = true;
while (!kthread_should_stop()) {
- move_to_next_cpu();
+ move_to_next_cpu(initmask);
+ initmask = false;
local_irq_disable();
get_sample();
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a133ecd741e4..7ad9e53ad174 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1372,7 +1372,7 @@ kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6)
return a1 + a2 + a3 + a4 + a5 + a6;
}
-static struct __init trace_event_file *
+static __init struct trace_event_file *
find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
{
struct trace_event_file *file;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 9d20d5dd298a..4bbd38ec3788 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -128,10 +128,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new;
- spin_lock(&ucounts_lock);
+ spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
if (!ucounts) {
- spin_unlock(&ucounts_lock);
+ spin_unlock_irq(&ucounts_lock);
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new)
@@ -141,7 +141,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
new->uid = uid;
atomic_set(&new->count, 0);
- spin_lock(&ucounts_lock);
+ spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
if (ucounts) {
kfree(new);
@@ -152,16 +152,18 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
}
if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
ucounts = NULL;
- spin_unlock(&ucounts_lock);
+ spin_unlock_irq(&ucounts_lock);
return ucounts;
}
static void put_ucounts(struct ucounts *ucounts)
{
+ unsigned long flags;
+
if (atomic_dec_and_test(&ucounts->count)) {
- spin_lock(&ucounts_lock);
+ spin_lock_irqsave(&ucounts_lock, flags);
hlist_del_init(&ucounts->node);
- spin_unlock(&ucounts_lock);
+ spin_unlock_irqrestore(&ucounts_lock, flags);
kfree(ucounts);
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d4b0fa01cae3..63177be0159e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -49,6 +49,8 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+
/*
* The 'watchdog_running' variable is set to 1 when the watchdog threads
* are registered/started and is set to 0 when the watchdog threads are
@@ -260,6 +262,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ if (atomic_read(&watchdog_park_in_progress) != 0)
+ return HRTIMER_NORESTART;
+
/* kick the hardlockup detector */
watchdog_interrupt_count();
@@ -467,12 +472,16 @@ static int watchdog_park_threads(void)
{
int cpu, ret = 0;
+ atomic_set(&watchdog_park_in_progress, 1);
+
for_each_watchdog_cpu(cpu) {
ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
if (ret)
break;
}
+ atomic_set(&watchdog_park_in_progress, 0);
+
return ret;
}
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 84016c8aee6b..12b8dd640786 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -84,6 +84,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
+ if (atomic_read(&watchdog_park_in_progress) != 0)
+ return;
+
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;