summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/auditsc.c22
-rw-r--r--kernel/bpf/bpf_lsm.c3
-rw-r--r--kernel/bpf/btf.c4
-rw-r--r--kernel/bpf/hashtab.c4
-rw-r--r--kernel/bpf/inode.c8
-rw-r--r--kernel/bpf/memalloc.c2
-rw-r--r--kernel/bpf/offload.c3
-rw-r--r--kernel/bpf/syscall.c24
-rw-r--r--kernel/bpf/task_iter.c39
-rw-r--r--kernel/bpf/trampoline.c4
-rw-r--r--kernel/bpf/verifier.c56
-rw-r--r--kernel/capability.c10
-rw-r--r--kernel/cgroup/cgroup.c2
-rw-r--r--kernel/cgroup/cpuset.c48
-rw-r--r--kernel/context_tracking.c12
-rw-r--r--kernel/cpu_pm.c9
-rw-r--r--kernel/events/core.c246
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/fork.c10
-rwxr-xr-xkernel/gen_kheaders.sh2
-rw-r--r--kernel/irq/Kconfig5
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/affinity.c405
-rw-r--r--kernel/irq/ipi-mux.c206
-rw-r--r--kernel/irq/irqdomain.c412
-rw-r--r--kernel/irq/manage.c5
-rw-r--r--kernel/irq/msi.c6
-rw-r--r--kernel/kallsyms_selftest.c21
-rw-r--r--kernel/kcsan/kcsan_test.c7
-rw-r--r--kernel/locking/lockdep.c3
-rw-r--r--kernel/locking/qspinlock.c4
-rw-r--r--kernel/locking/rtmutex.c5
-rw-r--r--kernel/locking/rwsem.c87
-rw-r--r--kernel/module/main.c26
-rw-r--r--kernel/panic.c5
-rw-r--r--kernel/printk/printk.c4
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rseq.c65
-rw-r--r--kernel/sched/clock.c27
-rw-r--r--kernel/sched/core.c216
-rw-r--r--kernel/sched/cpufreq_schedutil.c43
-rw-r--r--kernel/sched/cputime.c4
-rw-r--r--kernel/sched/deadline.c42
-rw-r--r--kernel/sched/fair.c415
-rw-r--r--kernel/sched/idle.c47
-rw-r--r--kernel/sched/membarrier.c39
-rw-r--r--kernel/sched/psi.c7
-rw-r--r--kernel/sched/rt.c5
-rw-r--r--kernel/sched/sched.h107
-rw-r--r--kernel/sched/topology.c4
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/time/Kconfig6
-rw-r--r--kernel/time/alarmtimer.c33
-rw-r--r--kernel/time/clocksource.c72
-rw-r--r--kernel/time/hrtimer.c18
-rw-r--r--kernel/time/posix-cpu-timers.c13
-rw-r--r--kernel/time/posix-stubs.c2
-rw-r--r--kernel/time/posix-timers.c2
-rw-r--r--kernel/time/test_udelay.c2
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c29
-rw-r--r--kernel/time/tick-broadcast.c6
-rw-r--r--kernel/time/tick-oneshot.c4
-rw-r--r--kernel/time/time.c8
-rw-r--r--kernel/time/timekeeping.c8
-rw-r--r--kernel/trace/Kconfig8
-rw-r--r--kernel/trace/blktrace.c4
-rw-r--r--kernel/trace/bpf_trace.c12
-rw-r--r--kernel/trace/ftrace.c23
-rw-r--r--kernel/trace/rv/rv.c2
-rw-r--r--kernel/trace/trace.c8
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events.c39
-rw-r--r--kernel/trace/trace_events_filter.c8
-rw-r--r--kernel/trace/trace_events_hist.c2
-rw-r--r--kernel/trace/trace_export.c3
-rw-r--r--kernel/trace/trace_osnoise.c5
-rw-r--r--kernel/trace/trace_output.c3
-rw-r--r--kernel/trace/trace_preemptirq.c61
-rw-r--r--kernel/umh.c20
-rw-r--r--kernel/workqueue.c280
81 files changed, 2058 insertions, 1369 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 547c88be8a28..93d0b87f3283 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -64,6 +64,7 @@
#include <uapi/linux/limits.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <uapi/linux/openat2.h> // struct open_how
+#include <uapi/linux/fanotify.h>
#include "audit.h"
@@ -2252,7 +2253,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
if (!dentry)
return 0;
- rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
+ rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps);
if (rc)
return rc;
@@ -2807,7 +2808,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(&init_user_ns,
+ get_vfs_caps_from_disk(&nop_mnt_idmap,
bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
@@ -2877,10 +2878,21 @@ void __audit_log_kern_module(char *name)
context->type = AUDIT_KERN_MODULE;
}
-void __audit_fanotify(unsigned int response)
+void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
- audit_log(audit_context(), GFP_KERNEL,
- AUDIT_FANOTIFY, "resp=%u", response);
+ /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */
+ switch (friar->hdr.type) {
+ case FAN_RESPONSE_INFO_NONE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2",
+ response, FAN_RESPONSE_INFO_NONE);
+ break;
+ case FAN_RESPONSE_INFO_AUDIT_RULE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u",
+ response, friar->hdr.type, friar->rule_number,
+ friar->subj_trust, friar->obj_trust);
+ }
}
void __audit_tk_injoffset(struct timespec64 offset)
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 9ea42a45da47..e14c822f8911 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -51,7 +51,6 @@ BTF_SET_END(bpf_lsm_current_hooks)
*/
BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
#ifdef CONFIG_SECURITY_NETWORK
-BTF_ID(func, bpf_lsm_socket_sock_rcv_skb)
BTF_ID(func, bpf_lsm_sock_graft)
BTF_ID(func, bpf_lsm_inet_csk_clone)
BTF_ID(func, bpf_lsm_inet_conn_established)
@@ -351,8 +350,10 @@ BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
BTF_ID(func, bpf_lsm_bpf_prog_free_security)
BTF_ID(func, bpf_lsm_file_alloc_security)
BTF_ID(func, bpf_lsm_file_free_security)
+#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_sk_alloc_security)
BTF_ID(func, bpf_lsm_sk_free_security)
+#endif /* CONFIG_SECURITY_NETWORK */
BTF_ID(func, bpf_lsm_task_free)
BTF_SET_END(untrusted_lsm_hooks)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f7dd8af06413..b7017cae6fd1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7782,9 +7782,9 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
- return 0;
end:
- btf_free_dtor_kfunc_tab(btf);
+ if (ret)
+ btf_free_dtor_kfunc_tab(btf);
btf_put(btf);
return ret;
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 5aa2b5525f79..66bded144377 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -152,7 +152,7 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
{
unsigned long flags;
- hash = hash & HASHTAB_MAP_LOCK_MASK;
+ hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
preempt_disable();
if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
@@ -171,7 +171,7 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
struct bucket *b, u32 hash,
unsigned long flags)
{
- hash = hash & HASHTAB_MAP_LOCK_MASK;
+ hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
raw_spin_unlock_irqrestore(&b->raw_lock, flags);
__this_cpu_dec(*(htab->map_locked[hash]));
preempt_enable();
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 4f841e16779e..9948b542a470 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
inode->i_mtime = inode->i_atime;
inode->i_ctime = inode->i_atime;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
return inode;
}
@@ -152,7 +152,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
dir->i_ctime = dir->i_mtime;
}
-static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct inode *inode;
@@ -382,7 +382,7 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
return simple_lookup(dir, dentry, flags);
}
-static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *target)
{
char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
@@ -559,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(&init_user_ns, inode, MAY_READ);
+ int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index ebcc3dd0fa19..1db156405b68 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -71,7 +71,7 @@ static int bpf_mem_cache_idx(size_t size)
if (size <= 192)
return size_index[(size - 1) / 8] - 1;
- return fls(size - 1) - 1;
+ return fls(size - 1) - 2;
}
#define NUM_CACHES 11
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 13e4efc971e6..190d9f9dc987 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -216,9 +216,6 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
if (offload->dev_state)
offload->offdev->ops->destroy(prog);
- /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
- bpf_prog_free_id(prog, true);
-
list_del_init(&offload->offloads);
kfree(offload);
prog->aux->offload = NULL;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 64131f88c553..ecca9366c7a6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1972,7 +1972,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
return;
if (audit_enabled == AUDIT_OFF)
return;
- if (op == BPF_AUDIT_LOAD)
+ if (!in_irq() && !irqs_disabled())
ctx = audit_context();
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
if (unlikely(!ab))
@@ -2001,7 +2001,7 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
return id > 0 ? 0 : id;
}
-void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+void bpf_prog_free_id(struct bpf_prog *prog)
{
unsigned long flags;
@@ -2013,18 +2013,10 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
if (!prog->aux->id)
return;
- if (do_idr_lock)
- spin_lock_irqsave(&prog_idr_lock, flags);
- else
- __acquire(&prog_idr_lock);
-
+ spin_lock_irqsave(&prog_idr_lock, flags);
idr_remove(&prog_idr, prog->aux->id);
prog->aux->id = 0;
-
- if (do_idr_lock)
- spin_unlock_irqrestore(&prog_idr_lock, flags);
- else
- __release(&prog_idr_lock);
+ spin_unlock_irqrestore(&prog_idr_lock, flags);
}
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
@@ -2067,17 +2059,15 @@ static void bpf_prog_put_deferred(struct work_struct *work)
prog = aux->prog;
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ bpf_prog_free_id(prog);
__bpf_prog_put_noref(prog, true);
}
-static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
+static void __bpf_prog_put(struct bpf_prog *prog)
{
struct bpf_prog_aux *aux = prog->aux;
if (atomic64_dec_and_test(&aux->refcnt)) {
- /* bpf_prog_free_id() must be called first */
- bpf_prog_free_id(prog, do_idr_lock);
-
if (in_irq() || irqs_disabled()) {
INIT_WORK(&aux->work, bpf_prog_put_deferred);
schedule_work(&aux->work);
@@ -2089,7 +2079,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
void bpf_prog_put(struct bpf_prog *prog)
{
- __bpf_prog_put(prog, true);
+ __bpf_prog_put(prog);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c2a2182ce570..c4ab9d6cdbe9 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -438,6 +438,7 @@ struct bpf_iter_seq_task_vma_info {
*/
struct bpf_iter_seq_task_common common;
struct task_struct *task;
+ struct mm_struct *mm;
struct vm_area_struct *vma;
u32 tid;
unsigned long prev_vm_start;
@@ -456,16 +457,19 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
enum bpf_task_vma_iter_find_op op;
struct vm_area_struct *curr_vma;
struct task_struct *curr_task;
+ struct mm_struct *curr_mm;
u32 saved_tid = info->tid;
/* If this function returns a non-NULL vma, it holds a reference to
- * the task_struct, and holds read lock on vma->mm->mmap_lock.
+ * the task_struct, holds a refcount on mm->mm_users, and holds
+ * read lock on vma->mm->mmap_lock.
* If this function returns NULL, it does not hold any reference or
* lock.
*/
if (info->task) {
curr_task = info->task;
curr_vma = info->vma;
+ curr_mm = info->mm;
/* In case of lock contention, drop mmap_lock to unblock
* the writer.
*
@@ -504,13 +508,15 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
* 4.2) VMA2 and VMA2' covers different ranges, process
* VMA2'.
*/
- if (mmap_lock_is_contended(curr_task->mm)) {
+ if (mmap_lock_is_contended(curr_mm)) {
info->prev_vm_start = curr_vma->vm_start;
info->prev_vm_end = curr_vma->vm_end;
op = task_vma_iter_find_vma;
- mmap_read_unlock(curr_task->mm);
- if (mmap_read_lock_killable(curr_task->mm))
+ mmap_read_unlock(curr_mm);
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
goto finish;
+ }
} else {
op = task_vma_iter_next_vma;
}
@@ -535,42 +541,47 @@ again:
op = task_vma_iter_find_vma;
}
- if (!curr_task->mm)
+ curr_mm = get_task_mm(curr_task);
+ if (!curr_mm)
goto next_task;
- if (mmap_read_lock_killable(curr_task->mm))
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
goto finish;
+ }
}
switch (op) {
case task_vma_iter_first_vma:
- curr_vma = find_vma(curr_task->mm, 0);
+ curr_vma = find_vma(curr_mm, 0);
break;
case task_vma_iter_next_vma:
- curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
break;
case task_vma_iter_find_vma:
/* We dropped mmap_lock so it is necessary to use find_vma
* to find the next vma. This is similar to the mechanism
* in show_smaps_rollup().
*/
- curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
+ curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
/* case 1) and 4.2) above just use curr_vma */
/* check for case 2) or case 4.1) above */
if (curr_vma &&
curr_vma->vm_start == info->prev_vm_start &&
curr_vma->vm_end == info->prev_vm_end)
- curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
break;
}
if (!curr_vma) {
/* case 3) above, or case 2) 4.1) with vma->next == NULL */
- mmap_read_unlock(curr_task->mm);
+ mmap_read_unlock(curr_mm);
+ mmput(curr_mm);
goto next_task;
}
info->task = curr_task;
info->vma = curr_vma;
+ info->mm = curr_mm;
return curr_vma;
next_task:
@@ -579,6 +590,7 @@ next_task:
put_task_struct(curr_task);
info->task = NULL;
+ info->mm = NULL;
info->tid++;
goto again;
@@ -587,6 +599,7 @@ finish:
put_task_struct(curr_task);
info->task = NULL;
info->vma = NULL;
+ info->mm = NULL;
return NULL;
}
@@ -658,7 +671,9 @@ static void task_vma_seq_stop(struct seq_file *seq, void *v)
*/
info->prev_vm_start = ~0UL;
info->prev_vm_end = info->vma->vm_end;
- mmap_read_unlock(info->task->mm);
+ mmap_read_unlock(info->mm);
+ mmput(info->mm);
+ info->mm = NULL;
put_task_struct(info->task);
info->task = NULL;
}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 11f5ec0b8016..d0ed7d6f5eec 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -488,6 +488,10 @@ again:
/* reset fops->func and fops->trampoline for re-register */
tr->fops->func = NULL;
tr->fops->trampoline = 0;
+
+ /* reset im->image memory attr for arch_prepare_bpf_trampoline */
+ set_memory_nx((long)im->image, 1);
+ set_memory_rw((long)im->image, 1);
goto again;
}
#endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a5255a0dcbb6..7ee218827259 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1054,6 +1054,8 @@ static void print_insn_state(struct bpf_verifier_env *env,
*/
static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags)
{
+ size_t alloc_bytes;
+ void *orig = dst;
size_t bytes;
if (ZERO_OR_NULL_PTR(src))
@@ -1062,11 +1064,11 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
- if (ksize(dst) < ksize(src)) {
- kfree(dst);
- dst = kmalloc_track_caller(kmalloc_size_roundup(bytes), flags);
- if (!dst)
- return NULL;
+ alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes));
+ dst = krealloc(orig, alloc_bytes, flags);
+ if (!dst) {
+ kfree(orig);
+ return NULL;
}
memcpy(dst, src, bytes);
@@ -2746,6 +2748,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
*/
if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))
return -ENOTSUPP;
+ /* kfunc with imm==0 is invalid and fixup_kfunc_call will
+ * catch this error later. Make backtracking conservative
+ * with ENOTSUPP.
+ */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
+ return -ENOTSUPP;
/* regular helper call sets R0 */
*reg_mask &= ~1;
if (*reg_mask & 0x3f) {
@@ -3235,13 +3243,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
+/* Copy src state preserving dst->parent and dst->live fields */
+static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
+{
+ struct bpf_reg_state *parent = dst->parent;
+ enum bpf_reg_liveness live = dst->live;
+
+ *dst = *src;
+ dst->parent = parent;
+ dst->live = live;
+}
+
static void save_register_state(struct bpf_func_state *state,
int spi, struct bpf_reg_state *reg,
int size)
{
int i;
- state->stack[spi].spilled_ptr = *reg;
+ copy_register_state(&state->stack[spi].spilled_ptr, reg);
if (size == BPF_REG_SIZE)
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
@@ -3287,7 +3306,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
bool sanitize = reg && is_spillable_regtype(reg->type);
for (i = 0; i < size; i++) {
- if (state->stack[spi].slot_type[i] == STACK_INVALID) {
+ u8 type = state->stack[spi].slot_type[i];
+
+ if (type != STACK_MISC && type != STACK_ZERO) {
sanitize = true;
break;
}
@@ -3567,7 +3588,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
*/
s32 subreg_def = state->regs[dst_regno].subreg_def;
- state->regs[dst_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
} else {
for (i = 0; i < size; i++) {
@@ -3588,7 +3609,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
if (dst_regno >= 0) {
/* restore register state from stack */
- state->regs[dst_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
@@ -9582,7 +9603,7 @@ do_sim:
*/
if (!ptr_is_dst_reg) {
tmp = *dst_reg;
- *dst_reg = *ptr_reg;
+ copy_register_state(dst_reg, ptr_reg);
}
ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
env->insn_idx);
@@ -10835,7 +10856,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* to propagate min/max range.
*/
src_reg->id = ++env->id_gen;
- *dst_reg = *src_reg;
+ copy_register_state(dst_reg, src_reg);
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
@@ -10846,7 +10867,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
- *dst_reg = *src_reg;
+ copy_register_state(dst_reg, src_reg);
/* Make sure ID is cleared otherwise
* dst_reg min/max could be incorrectly
* propagated into src_reg by find_equal_scalars()
@@ -11645,7 +11666,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
+ copy_register_state(reg, known_reg);
}));
}
@@ -11822,10 +11843,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* register B - not null
* for JNE A, B, ... - A is not null in the false branch;
* for JEQ A, B, ... - A is not null in the true branch.
+ *
+ * Since PTR_TO_BTF_ID points to a kernel struct that does
+ * not need to be null checked by the BPF program, i.e.,
+ * could be null even without PTR_MAYBE_NULL marking, so
+ * only propagate nullness when neither reg is that type.
*/
if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
__is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
- type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type)) {
+ type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) &&
+ base_type(src_reg->type) != PTR_TO_BTF_ID &&
+ base_type(dst_reg->type) != PTR_TO_BTF_ID) {
eq_branch_regs = NULL;
switch (opcode) {
case BPF_JEQ:
diff --git a/kernel/capability.c b/kernel/capability.c
index 860fd22117c1..339a44dfe2f4 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -486,11 +486,11 @@ EXPORT_SYMBOL(file_ns_capable);
* Return true if the inode uid and gid are within the namespace.
*/
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
- struct user_namespace *mnt_userns,
+ struct mnt_idmap *idmap,
const struct inode *inode)
{
- return vfsuid_has_mapping(ns, i_uid_into_vfsuid(mnt_userns, inode)) &&
- vfsgid_has_mapping(ns, i_gid_into_vfsgid(mnt_userns, inode));
+ return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) &&
+ vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
}
/**
@@ -502,13 +502,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
-bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
return ns_capable(ns, cap) &&
- privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
+ privileged_wrt_inode_uidgid(ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c099cf3fa02d..935e8121b21e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5065,7 +5065,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
if (!inode)
return -ENOMEM;
- ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
+ ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
iput(inode);
return ret;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a585ced99e1e..636f1c682ac0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1205,12 +1205,13 @@ void rebuild_sched_domains(void)
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @new_cpus: the temp variable for the new effective_cpus mask
*
* Iterate through each task of @cs updating its cpus_allowed to the
* effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
-static void update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
struct css_task_iter it;
struct task_struct *task;
@@ -1224,7 +1225,10 @@ static void update_tasks_cpumask(struct cpuset *cs)
if (top_cs && (task->flags & PF_KTHREAD) &&
kthread_is_per_cpu(task))
continue;
- set_cpus_allowed_ptr(task, cs->effective_cpus);
+
+ cpumask_and(new_cpus, cs->effective_cpus,
+ task_cpu_possible_mask(task));
+ set_cpus_allowed_ptr(task, new_cpus);
}
css_task_iter_end(&it);
}
@@ -1346,7 +1350,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
* A parent can be left with no CPU as long as there is no
* task directly associated with the parent partition.
*/
- if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) &&
+ if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) &&
partition_is_populated(parent, cs))
return PERR_NOCPUS;
@@ -1509,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
spin_unlock_irq(&callback_lock);
if (adding || deleting)
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent, tmp->new_cpus);
/*
* Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
@@ -1661,7 +1665,7 @@ update_parent_subparts:
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- update_tasks_cpumask(cp);
+ update_tasks_cpumask(cp, tmp->new_cpus);
/*
* On legacy hierarchy, if the effective cpumask of any non-
@@ -2309,7 +2313,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
}
}
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent, tmpmask.new_cpus);
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmpmask);
@@ -2324,6 +2328,7 @@ out:
new_prs = -new_prs;
spin_lock_irq(&callback_lock);
cs->partition_root_state = new_prs;
+ WRITE_ONCE(cs->prs_err, err);
spin_unlock_irq(&callback_lock);
/*
* Update child cpusets, if present.
@@ -3345,7 +3350,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
* as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
@@ -3382,7 +3387,7 @@ hotplug_update_tasks(struct cpuset *cs,
spin_unlock_irq(&callback_lock);
if (cpus_updated)
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated)
update_tasks_nodemask(cs);
}
@@ -3689,15 +3694,38 @@ void __init cpuset_init_smp(void)
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
* subset of cpu_online_mask, even if this means going outside the
- * tasks cpuset.
+ * tasks cpuset, except when the task is in the top cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
unsigned long flags;
+ struct cpuset *cs;
spin_lock_irqsave(&callback_lock, flags);
- guarantee_online_cpus(tsk, pmask);
+ rcu_read_lock();
+
+ cs = task_cs(tsk);
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(tsk, pmask);
+ /*
+ * Tasks in the top cpuset won't get update to their cpumasks
+ * when a hotplug online/offline event happens. So we include all
+ * offline cpus in the allowed cpu list.
+ */
+ if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+
+ /*
+ * We first exclude cpus allocated to partitions. If there is no
+ * allowable online cpu left, we fall back to all possible cpus.
+ */
+ cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus);
+ if (!cpumask_intersects(pmask, cpu_online_mask))
+ cpumask_copy(pmask, possible_mask);
+ }
+
+ rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 77978e372377..a09f1c19336a 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -510,7 +510,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
* In this we case we don't care about any concurrency/ordering.
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
- atomic_set(&ct->state, state);
+ arch_atomic_set(&ct->state, state);
} else {
/*
* Even if context tracking is disabled on this CPU, because it's outside
@@ -527,7 +527,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
/* Tracking for vtime only, no concurrent RCU EQS accounting */
- atomic_set(&ct->state, state);
+ arch_atomic_set(&ct->state, state);
} else {
/*
* Tracking for vtime and RCU EQS. Make sure we don't race
@@ -535,7 +535,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
* RCU only requires RCU_DYNTICKS_IDX increments to be fully
* ordered.
*/
- atomic_add(state, &ct->state);
+ arch_atomic_add(state, &ct->state);
}
}
}
@@ -630,12 +630,12 @@ void noinstr __ct_user_exit(enum ctx_state state)
* In this we case we don't care about any concurrency/ordering.
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
- atomic_set(&ct->state, CONTEXT_KERNEL);
+ arch_atomic_set(&ct->state, CONTEXT_KERNEL);
} else {
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
/* Tracking for vtime only, no concurrent RCU EQS accounting */
- atomic_set(&ct->state, CONTEXT_KERNEL);
+ arch_atomic_set(&ct->state, CONTEXT_KERNEL);
} else {
/*
* Tracking for vtime and RCU EQS. Make sure we don't race
@@ -643,7 +643,7 @@ void noinstr __ct_user_exit(enum ctx_state state)
* RCU only requires RCU_DYNTICKS_IDX increments to be fully
* ordered.
*/
- atomic_sub(state, &ct->state);
+ arch_atomic_sub(state, &ct->state);
}
}
}
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index ba4ba71facf9..b0f0d15085db 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -30,16 +30,9 @@ static int cpu_pm_notify(enum cpu_pm_event event)
{
int ret;
- /*
- * This introduces a RCU read critical section, which could be
- * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know
- * this.
- */
- ct_irq_enter_irqson();
rcu_read_lock();
ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
rcu_read_unlock();
- ct_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -49,11 +42,9 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
unsigned long flags;
int ret;
- ct_irq_enter_irqson();
raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
- ct_irq_exit_irqson();
return notifier_to_errno(ret);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d56328e5080e..7099c77bc53b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4813,19 +4813,17 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
epc = &cpc->epc;
-
+ raw_spin_lock_irq(&ctx->lock);
if (!epc->ctx) {
atomic_set(&epc->refcount, 1);
epc->embedded = 1;
- raw_spin_lock_irq(&ctx->lock);
list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
epc->ctx = ctx;
- raw_spin_unlock_irq(&ctx->lock);
} else {
WARN_ON_ONCE(epc->ctx != ctx);
atomic_inc(&epc->refcount);
}
-
+ raw_spin_unlock_irq(&ctx->lock);
return epc;
}
@@ -4896,33 +4894,30 @@ static void free_epc_rcu(struct rcu_head *head)
static void put_pmu_ctx(struct perf_event_pmu_context *epc)
{
+ struct perf_event_context *ctx = epc->ctx;
unsigned long flags;
- if (!atomic_dec_and_test(&epc->refcount))
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because of the call-site in _free_event()/put_event()
+ * which isn't always called under ctx->mutex.
+ */
+ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
return;
- if (epc->ctx) {
- struct perf_event_context *ctx = epc->ctx;
+ WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
- /*
- * XXX
- *
- * lockdep_assert_held(&ctx->mutex);
- *
- * can't because of the call-site in _free_event()/put_event()
- * which isn't always called under ctx->mutex.
- */
-
- WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
- raw_spin_lock_irqsave(&ctx->lock, flags);
- list_del_init(&epc->pmu_ctx_entry);
- epc->ctx = NULL;
- raw_spin_unlock_irqrestore(&ctx->lock, flags);
- }
+ list_del_init(&epc->pmu_ctx_entry);
+ epc->ctx = NULL;
WARN_ON_ONCE(!list_empty(&epc->pinned_active));
WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
if (epc->embedded)
return;
@@ -7046,13 +7041,20 @@ out_put:
ring_buffer_put(rb);
}
-static void __perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
+/*
+ * A set of common sample data types saved even for non-sample records
+ * when event->attr.sample_id_all is set.
+ */
+#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)
+
+static void __perf_event_header__init_id(struct perf_sample_data *data,
struct perf_event *event,
u64 sample_type)
{
data->type = event->attr.sample_type;
- header->size += event->id_header_size;
+ data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;
if (sample_type & PERF_SAMPLE_TID) {
/* namespace issues */
@@ -7079,8 +7081,10 @@ void perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
{
- if (event->attr.sample_id_all)
- __perf_event_header__init_id(header, data, event, event->attr.sample_type);
+ if (event->attr.sample_id_all) {
+ header->size += event->id_header_size;
+ __perf_event_header__init_id(data, event, event->attr.sample_type);
+ }
}
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
@@ -7310,7 +7314,7 @@ void perf_output_sample(struct perf_output_handle *handle,
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
+ if (data->br_stack) {
size_t size;
size = data->br_stack->nr
@@ -7554,83 +7558,68 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
return callchain ?: &__empty_callchain;
}
-void perf_prepare_sample(struct perf_event_header *header,
- struct perf_sample_data *data,
+static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
+{
+ return d * !!(flags & s);
+}
+
+void perf_prepare_sample(struct perf_sample_data *data,
struct perf_event *event,
struct pt_regs *regs)
{
u64 sample_type = event->attr.sample_type;
u64 filtered_sample_type;
- header->type = PERF_RECORD_SAMPLE;
- header->size = sizeof(*header) + event->header_size;
-
- header->misc = 0;
- header->misc |= perf_misc_flags(regs);
-
/*
- * Clear the sample flags that have already been done by the
- * PMU driver.
+ * Add the sample flags that are dependent to others. And clear the
+ * sample flags that have already been done by the PMU driver.
*/
- filtered_sample_type = sample_type & ~data->sample_flags;
- __perf_event_header__init_id(header, data, event, filtered_sample_type);
-
- if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
- data->ip = perf_instruction_pointer(regs);
-
- if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- int size = 1;
-
- if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
- data->callchain = perf_callchain(event, regs);
+ filtered_sample_type = sample_type;
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
+ PERF_SAMPLE_IP);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
+ PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
+ PERF_SAMPLE_REGS_USER);
+ filtered_sample_type &= ~data->sample_flags;
- size += data->callchain->nr;
-
- header->size += size * sizeof(u64);
+ if (filtered_sample_type == 0) {
+ /* Make sure it has the correct data->type for output */
+ data->type = event->attr.sample_type;
+ return;
}
- if (sample_type & PERF_SAMPLE_RAW) {
- struct perf_raw_record *raw = data->raw;
- int size;
+ __perf_event_header__init_id(data, event, filtered_sample_type);
- if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) {
- struct perf_raw_frag *frag = &raw->frag;
- u32 sum = 0;
+ if (filtered_sample_type & PERF_SAMPLE_IP) {
+ data->ip = perf_instruction_pointer(regs);
+ data->sample_flags |= PERF_SAMPLE_IP;
+ }
- do {
- sum += frag->size;
- if (perf_raw_frag_last(frag))
- break;
- frag = frag->next;
- } while (1);
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(data, event, regs);
- size = round_up(sum + sizeof(u32), sizeof(u64));
- raw->size = size - sizeof(u32);
- frag->pad = raw->size - sum;
- } else {
- size = sizeof(u64);
- data->raw = NULL;
- }
-
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_RAW) {
+ data->raw = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_RAW;
}
- if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- int size = sizeof(u64); /* nr */
- if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
- if (branch_sample_hw_index(event))
- size += sizeof(u64);
-
- size += data->br_stack->nr
- * sizeof(struct perf_branch_entry);
- }
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ data->br_stack = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
}
- if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+ if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
perf_sample_regs_user(&data->regs_user, regs);
- if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /*
+ * It cannot use the filtered_sample_type here as REGS_USER can be set
+ * by STACK_USER (using __cond_set() above) and we don't want to update
+ * the dyn_size if it's not requested by users.
+ */
+ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7639,10 +7628,11 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_USER;
}
- if (sample_type & PERF_SAMPLE_STACK_USER) {
+ if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
/*
* Either we need PERF_SAMPLE_STACK_USER bit to be always
* processed as the last one or have additional check added
@@ -7650,9 +7640,10 @@ void perf_prepare_sample(struct perf_event_header *header,
* up the rest of the sample size.
*/
u16 stack_size = event->attr.sample_stack_user;
+ u16 header_size = perf_sample_data_size(data, event);
u16 size = sizeof(u64);
- stack_size = perf_sample_ustack_size(stack_size, header->size,
+ stack_size = perf_sample_ustack_size(stack_size, header_size,
data->regs_user.regs);
/*
@@ -7664,24 +7655,31 @@ void perf_prepare_sample(struct perf_event_header *header,
size += sizeof(u64) + stack_size;
data->stack_user_size = stack_size;
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_STACK_USER;
}
- if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
data->weight.full = 0;
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
- if (filtered_sample_type & PERF_SAMPLE_DATA_SRC)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
data->data_src.val = PERF_MEM_NA;
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
- if (filtered_sample_type & PERF_SAMPLE_TRANSACTION)
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
data->txn = 0;
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
- if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) {
- if (filtered_sample_type & PERF_SAMPLE_ADDR)
- data->addr = 0;
+ if (filtered_sample_type & PERF_SAMPLE_ADDR) {
+ data->addr = 0;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
}
- if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7693,20 +7691,23 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_INTR;
}
- if (sample_type & PERF_SAMPLE_PHYS_ADDR &&
- filtered_sample_type & PERF_SAMPLE_PHYS_ADDR)
+ if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
data->phys_addr = perf_virt_to_phys(data->addr);
+ data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
+ }
#ifdef CONFIG_CGROUP_PERF
- if (sample_type & PERF_SAMPLE_CGROUP) {
+ if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
struct cgroup *cgrp;
/* protected by RCU */
cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
data->cgroup = cgroup_id(cgrp);
+ data->sample_flags |= PERF_SAMPLE_CGROUP;
}
#endif
@@ -7715,16 +7716,21 @@ void perf_prepare_sample(struct perf_event_header *header,
* require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
* but the value will not dump to the userspace.
*/
- if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
data->data_page_size = perf_get_page_size(data->addr);
+ data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
data->code_page_size = perf_get_page_size(data->ip);
+ data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_AUX) {
+ if (filtered_sample_type & PERF_SAMPLE_AUX) {
u64 size;
+ u16 header_size = perf_sample_data_size(data, event);
- header->size += sizeof(u64); /* size */
+ header_size += sizeof(u64); /* size */
/*
* Given the 16bit nature of header::size, an AUX sample can
@@ -7732,14 +7738,26 @@ void perf_prepare_sample(struct perf_event_header *header,
* Make sure this doesn't happen by using up to U16_MAX bytes
* per sample in total (rounded down to 8 byte boundary).
*/
- size = min_t(size_t, U16_MAX - header->size,
+ size = min_t(size_t, U16_MAX - header_size,
event->attr.aux_sample_size);
size = rounddown(size, 8);
size = perf_prepare_sample_aux(event, data, size);
- WARN_ON_ONCE(size + header->size > U16_MAX);
- header->size += size;
+ WARN_ON_ONCE(size + header_size > U16_MAX);
+ data->dyn_size += size + sizeof(u64); /* size above */
+ data->sample_flags |= PERF_SAMPLE_AUX;
}
+}
+
+void perf_prepare_header(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event,
+ struct pt_regs *regs)
+{
+ header->type = PERF_RECORD_SAMPLE;
+ header->size = perf_sample_data_size(data, event);
+ header->misc = perf_misc_flags(regs);
+
/*
* If you're adding more sample types here, you likely need to do
* something about the overflowing header::size, like repurpose the
@@ -7767,7 +7785,8 @@ __perf_event_output(struct perf_event *event,
/* protect the callchain buffers */
rcu_read_lock();
- perf_prepare_sample(&header, data, event, regs);
+ perf_prepare_sample(data, event, regs);
+ perf_prepare_header(&header, data, event, regs);
err = output_begin(&handle, data, event, header.size);
if (err)
@@ -10125,8 +10144,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
};
perf_sample_data_init(&data, 0, 0);
- data.raw = &raw;
- data.sample_flags |= PERF_SAMPLE_RAW;
+ perf_sample_save_raw_data(&data, &raw);
perf_trace_buf_update(record, event_type);
@@ -10333,13 +10351,7 @@ static void bpf_overflow_handler(struct perf_event *event,
rcu_read_lock();
prog = READ_ONCE(event->prog);
if (prog) {
- if (prog->call_get_stack &&
- (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
- !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) {
- data->callchain = perf_callchain(event, regs);
- data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
- }
-
+ perf_prepare_sample(data, event, regs);
ret = bpf_prog_run(prog, &ctx);
}
rcu_read_unlock();
diff --git a/kernel/exit.c b/kernel/exit.c
index 15dc2ec80c46..bccfa4218356 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -807,6 +807,8 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
+ WARN_ON(irqs_disabled());
+
synchronize_group_exit(tsk, code);
WARN_ON(tsk->plug);
@@ -938,6 +940,11 @@ void __noreturn make_task_dead(int signr)
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
+ if (unlikely(irqs_disabled())) {
+ pr_info("note: %s[%d] exited with irqs disabled\n",
+ current->comm, task_pid_nr(current));
+ local_irq_enable();
+ }
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f7fe3541897..038b898dad52 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1044,7 +1044,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
#ifdef CONFIG_BLK_CGROUP
- tsk->throttle_queue = NULL;
+ tsk->throttle_disk = NULL;
tsk->use_memdelay = 0;
#endif
@@ -1060,6 +1060,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->reported_split_lock = 0;
#endif
+#ifdef CONFIG_SCHED_MM_CID
+ tsk->mm_cid = -1;
+ tsk->mm_cid_active = 0;
+#endif
return tsk;
free_stack:
@@ -1169,6 +1173,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
+ mm_init_cid(mm);
return mm;
fail_pcpu:
@@ -1601,6 +1606,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
tsk->mm = mm;
tsk->active_mm = mm;
+ sched_mm_cid_fork(tsk);
return 0;
}
@@ -3034,7 +3040,7 @@ void __init mm_cache_init(void)
* dynamically sized based on the maximum CPU number this system
* can have, taking hotplug into account (nr_cpu_ids).
*/
- mm_size = sizeof(struct mm_struct) + cpumask_size();
+ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
mm_cachep = kmem_cache_create_usercopy("mm_struct",
mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 473036b43c83..81b97f0f6556 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -14,6 +14,8 @@ include/
arch/$SRCARCH/include/
"
+type cpio > /dev/null
+
# Support incremental builds by skipping archive generation
# if timestamps of files being archived are not changed.
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index b64c44ae4c25..2531f3496ab6 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI
depends on SMP
select IRQ_DOMAIN_HIERARCHY
+# Generic IRQ IPI Mux support
+config GENERIC_IRQ_IPI_MUX
+ bool
+ depends on SMP
+
# Generic MSI hierarchical interrupt domain support
config GENERIC_MSI_IRQ
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index b4f53717d143..f19d3080bf11 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
obj-$(CONFIG_SMP) += affinity.o
obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d9a5c1d65a79..44a4eba80315 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,398 +7,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
-#include <linux/sort.h>
-
-static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- unsigned int cpus_per_vec)
-{
- const struct cpumask *siblmsk;
- int cpu, sibl;
-
- for ( ; cpus_per_vec > 0; ) {
- cpu = cpumask_first(nmsk);
-
- /* Should not happen, but I'm too lazy to think about it */
- if (cpu >= nr_cpu_ids)
- return;
-
- cpumask_clear_cpu(cpu, nmsk);
- cpumask_set_cpu(cpu, irqmsk);
- cpus_per_vec--;
-
- /* If the cpu has siblings, use them first */
- siblmsk = topology_sibling_cpumask(cpu);
- for (sibl = -1; cpus_per_vec > 0; ) {
- sibl = cpumask_next(sibl, siblmsk);
- if (sibl >= nr_cpu_ids)
- break;
- if (!cpumask_test_and_clear_cpu(sibl, nmsk))
- continue;
- cpumask_set_cpu(sibl, irqmsk);
- cpus_per_vec--;
- }
- }
-}
-
-static cpumask_var_t *alloc_node_to_cpumask(void)
-{
- cpumask_var_t *masks;
- int node;
-
- masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
- if (!masks)
- return NULL;
-
- for (node = 0; node < nr_node_ids; node++) {
- if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
- goto out_unwind;
- }
-
- return masks;
-
-out_unwind:
- while (--node >= 0)
- free_cpumask_var(masks[node]);
- kfree(masks);
- return NULL;
-}
-
-static void free_node_to_cpumask(cpumask_var_t *masks)
-{
- int node;
-
- for (node = 0; node < nr_node_ids; node++)
- free_cpumask_var(masks[node]);
- kfree(masks);
-}
-
-static void build_node_to_cpumask(cpumask_var_t *masks)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
-}
-
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
- const struct cpumask *mask, nodemask_t *nodemsk)
-{
- int n, nodes = 0;
-
- /* Calculate the number of nodes in the supplied affinity mask */
- for_each_node(n) {
- if (cpumask_intersects(mask, node_to_cpumask[n])) {
- node_set(n, *nodemsk);
- nodes++;
- }
- }
- return nodes;
-}
-
-struct node_vectors {
- unsigned id;
-
- union {
- unsigned nvectors;
- unsigned ncpus;
- };
-};
-
-static int ncpus_cmp_func(const void *l, const void *r)
-{
- const struct node_vectors *ln = l;
- const struct node_vectors *rn = r;
-
- return ln->ncpus - rn->ncpus;
-}
-
-/*
- * Allocate vector number for each node, so that for each node:
- *
- * 1) the allocated number is >= 1
- *
- * 2) the allocated numbver is <= active CPU number of this node
- *
- * The actual allocated total vectors may be less than @numvecs when
- * active total CPU number is less than @numvecs.
- *
- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
- * for each node.
- */
-static void alloc_nodes_vectors(unsigned int numvecs,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- const nodemask_t nodemsk,
- struct cpumask *nmsk,
- struct node_vectors *node_vectors)
-{
- unsigned n, remaining_ncpus = 0;
-
- for (n = 0; n < nr_node_ids; n++) {
- node_vectors[n].id = n;
- node_vectors[n].ncpus = UINT_MAX;
- }
-
- for_each_node_mask(n, nodemsk) {
- unsigned ncpus;
-
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- ncpus = cpumask_weight(nmsk);
-
- if (!ncpus)
- continue;
- remaining_ncpus += ncpus;
- node_vectors[n].ncpus = ncpus;
- }
-
- numvecs = min_t(unsigned, remaining_ncpus, numvecs);
-
- sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
- ncpus_cmp_func, NULL);
-
- /*
- * Allocate vectors for each node according to the ratio of this
- * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
- * bigger than number of active numa nodes. Always start the
- * allocation from the node with minimized nr_cpus.
- *
- * This way guarantees that each active node gets allocated at
- * least one vector, and the theory is simple: over-allocation
- * is only done when this node is assigned by one vector, so
- * other nodes will be allocated >= 1 vector, since 'numvecs' is
- * bigger than number of numa nodes.
- *
- * One perfect invariant is that number of allocated vectors for
- * each node is <= CPU count of this node:
- *
- * 1) suppose there are two nodes: A and B
- * ncpu(X) is CPU count of node X
- * vecs(X) is the vector count allocated to node X via this
- * algorithm
- *
- * ncpu(A) <= ncpu(B)
- * ncpu(A) + ncpu(B) = N
- * vecs(A) + vecs(B) = V
- *
- * vecs(A) = max(1, round_down(V * ncpu(A) / N))
- * vecs(B) = V - vecs(A)
- *
- * both N and V are integer, and 2 <= V <= N, suppose
- * V = N - delta, and 0 <= delta <= N - 2
- *
- * 2) obviously vecs(A) <= ncpu(A) because:
- *
- * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
- * ncpu(A) >= 1
- *
- * otherwise,
- * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
- *
- * 3) prove how vecs(B) <= ncpu(B):
- *
- * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
- * over-allocated, so vecs(B) <= ncpu(B),
- *
- * otherwise:
- *
- * vecs(A) =
- * round_down(V * ncpu(A) / N) =
- * round_down((N - delta) * ncpu(A) / N) =
- * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
- * round_down((N * ncpu(A) - delta * N) / N) =
- * cpu(A) - delta
- *
- * then:
- *
- * vecs(A) - V >= ncpu(A) - delta - V
- * =>
- * V - vecs(A) <= V + delta - ncpu(A)
- * =>
- * vecs(B) <= N - ncpu(A)
- * =>
- * vecs(B) <= cpu(B)
- *
- * For nodes >= 3, it can be thought as one node and another big
- * node given that is exactly what this algorithm is implemented,
- * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
- * finally for each node X: vecs(X) <= ncpu(X).
- *
- */
- for (n = 0; n < nr_node_ids; n++) {
- unsigned nvectors, ncpus;
-
- if (node_vectors[n].ncpus == UINT_MAX)
- continue;
-
- WARN_ON_ONCE(numvecs == 0);
-
- ncpus = node_vectors[n].ncpus;
- nvectors = max_t(unsigned, 1,
- numvecs * ncpus / remaining_ncpus);
- WARN_ON_ONCE(nvectors > ncpus);
-
- node_vectors[n].nvectors = nvectors;
-
- remaining_ncpus -= ncpus;
- numvecs -= nvectors;
- }
-}
-
-static int __irq_build_affinity_masks(unsigned int startvec,
- unsigned int numvecs,
- unsigned int firstvec,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- struct cpumask *nmsk,
- struct irq_affinity_desc *masks)
-{
- unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
- unsigned int last_affv = firstvec + numvecs;
- unsigned int curvec = startvec;
- nodemask_t nodemsk = NODE_MASK_NONE;
- struct node_vectors *node_vectors;
-
- if (cpumask_empty(cpu_mask))
- return 0;
-
- nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
-
- /*
- * If the number of nodes in the mask is greater than or equal the
- * number of vectors we just spread the vectors across the nodes.
- */
- if (numvecs <= nodes) {
- for_each_node_mask(n, nodemsk) {
- /* Ensure that only CPUs which are in both masks are set */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
- if (++curvec == last_affv)
- curvec = firstvec;
- }
- return numvecs;
- }
-
- node_vectors = kcalloc(nr_node_ids,
- sizeof(struct node_vectors),
- GFP_KERNEL);
- if (!node_vectors)
- return -ENOMEM;
-
- /* allocate vector number for each node */
- alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
- nodemsk, nmsk, node_vectors);
-
- for (i = 0; i < nr_node_ids; i++) {
- unsigned int ncpus, v;
- struct node_vectors *nv = &node_vectors[i];
-
- if (nv->nvectors == UINT_MAX)
- continue;
-
- /* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
- ncpus = cpumask_weight(nmsk);
- if (!ncpus)
- continue;
-
- WARN_ON_ONCE(nv->nvectors > ncpus);
-
- /* Account for rounding errors */
- extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
-
- /* Spread allocated vectors on CPUs of the current node */
- for (v = 0; v < nv->nvectors; v++, curvec++) {
- cpus_per_vec = ncpus / nv->nvectors;
-
- /* Account for extra vectors to compensate rounding errors */
- if (extra_vecs) {
- cpus_per_vec++;
- --extra_vecs;
- }
-
- /*
- * wrapping has to be considered given 'startvec'
- * may start anywhere
- */
- if (curvec >= last_affv)
- curvec = firstvec;
- irq_spread_init_one(&masks[curvec].mask, nmsk,
- cpus_per_vec);
- }
- done += nv->nvectors;
- }
- kfree(node_vectors);
- return done;
-}
-
-/*
- * build affinity in two stages:
- * 1) spread present CPU on these vectors
- * 2) spread other possible CPUs on these vectors
- */
-static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
- unsigned int firstvec,
- struct irq_affinity_desc *masks)
-{
- unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
- cpumask_var_t *node_to_cpumask;
- cpumask_var_t nmsk, npresmsk;
- int ret = -ENOMEM;
-
- if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
- return ret;
-
- if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
- goto fail_nmsk;
-
- node_to_cpumask = alloc_node_to_cpumask();
- if (!node_to_cpumask)
- goto fail_npresmsk;
-
- /* Stabilize the cpumasks */
- cpus_read_lock();
- build_node_to_cpumask(node_to_cpumask);
-
- /* Spread on present CPUs starting from affd->pre_vectors */
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, cpu_present_mask,
- nmsk, masks);
- if (ret < 0)
- goto fail_build_affinity;
- nr_present = ret;
-
- /*
- * Spread on non present CPUs starting from the next vector to be
- * handled. If the spreading of present CPUs already exhausted the
- * vector space, assign the non present CPUs to the already spread
- * out vectors.
- */
- if (nr_present >= numvecs)
- curvec = firstvec;
- else
- curvec = firstvec + nr_present;
- cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, npresmsk, nmsk,
- masks);
- if (ret >= 0)
- nr_others = ret;
-
- fail_build_affinity:
- cpus_read_unlock();
-
- if (ret >= 0)
- WARN_ON(nr_present + nr_others < numvecs);
-
- free_node_to_cpumask(node_to_cpumask);
-
- fail_npresmsk:
- free_cpumask_var(npresmsk);
-
- fail_nmsk:
- free_cpumask_var(nmsk);
- return ret < 0 ? ret : 0;
-}
+#include <linux/group_cpus.h>
static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
{
@@ -461,14 +70,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
*/
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
unsigned int this_vecs = affd->set_size[i];
- int ret;
+ int j;
+ struct cpumask *result = group_cpus_evenly(this_vecs);
- ret = irq_build_affinity_masks(curvec, this_vecs,
- curvec, masks);
- if (ret) {
+ if (!result) {
kfree(masks);
return NULL;
}
+
+ for (j = 0; j < this_vecs; j++)
+ cpumask_copy(&masks[curvec + j].mask, &result[j]);
+ kfree(result);
+
curvec += this_vecs;
usedvecs += this_vecs;
}
diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
new file mode 100644
index 000000000000..fa4fc18c6131
--- /dev/null
+++ b/kernel/irq/ipi-mux.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Multiplex several virtual IPIs over a single HW IPI.
+ *
+ * Copyright The Asahi Linux Contributors
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "ipi-mux: " fmt
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct ipi_mux_cpu {
+ atomic_t enable;
+ atomic_t bits;
+};
+
+static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
+static struct irq_domain *ipi_mux_domain;
+static void (*ipi_mux_send)(unsigned int cpu);
+
+static void ipi_mux_mask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+
+ atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
+}
+
+static void ipi_mux_unmask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+
+ atomic_or(ibit, &icpu->enable);
+
+ /*
+ * The atomic_or() above must complete before the atomic_read()
+ * below to avoid racing ipi_mux_send_mask().
+ */
+ smp_mb__after_atomic();
+
+ /* If a pending IPI was unmasked, raise a parent IPI immediately. */
+ if (atomic_read(&icpu->bits) & ibit)
+ ipi_mux_send(smp_processor_id());
+}
+
+static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+ unsigned long pending;
+ int cpu;
+
+ for_each_cpu(cpu, mask) {
+ icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
+
+ /*
+ * This sequence is the mirror of the one in ipi_mux_unmask();
+ * see the comment there. Additionally, release semantics
+ * ensure that the vIPI flag set is ordered after any shared
+ * memory accesses that precede it. This therefore also pairs
+ * with the atomic_fetch_andnot in ipi_mux_process().
+ */
+ pending = atomic_fetch_or_release(ibit, &icpu->bits);
+
+ /*
+ * The atomic_fetch_or_release() above must complete
+ * before the atomic_read() below to avoid racing with
+ * ipi_mux_unmask().
+ */
+ smp_mb__after_atomic();
+
+ /*
+ * The flag writes must complete before the physical IPI is
+ * issued to another CPU. This is implied by the control
+ * dependency on the result of atomic_read() below, which is
+ * itself already ordered after the vIPI flag write.
+ */
+ if (!(pending & ibit) && (atomic_read(&icpu->enable) & ibit))
+ ipi_mux_send(cpu);
+ }
+}
+
+static const struct irq_chip ipi_mux_chip = {
+ .name = "IPI Mux",
+ .irq_mask = ipi_mux_mask,
+ .irq_unmask = ipi_mux_unmask,
+ .ipi_send_mask = ipi_mux_send_mask,
+};
+
+static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_set_percpu_devid(virq + i);
+ irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL,
+ handle_percpu_devid_irq, NULL, NULL);
+ }
+
+ return 0;
+}
+
+static const struct irq_domain_ops ipi_mux_domain_ops = {
+ .alloc = ipi_mux_domain_alloc,
+ .free = irq_domain_free_irqs_top,
+};
+
+/**
+ * ipi_mux_process - Process multiplexed virtual IPIs
+ */
+void ipi_mux_process(void)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ irq_hw_number_t hwirq;
+ unsigned long ipis;
+ unsigned int en;
+
+ /*
+ * Reading enable mask does not need to be ordered as long as
+ * this function is called from interrupt handler because only
+ * the CPU itself can change it's own enable mask.
+ */
+ en = atomic_read(&icpu->enable);
+
+ /*
+ * Clear the IPIs we are about to handle. This pairs with the
+ * atomic_fetch_or_release() in ipi_mux_send_mask().
+ */
+ ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
+
+ for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int))
+ generic_handle_domain_irq(ipi_mux_domain, hwirq);
+}
+
+/**
+ * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
+ * parent IPI.
+ * @nr_ipi: number of virtual IPIs to create. This should
+ * be <= BITS_PER_TYPE(int)
+ * @mux_send: callback to trigger parent IPI for a particular CPU
+ *
+ * Returns first virq of the newly created virtual IPIs upon success
+ * or <=0 upon failure
+ */
+int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu))
+{
+ struct fwnode_handle *fwnode;
+ struct irq_domain *domain;
+ int rc;
+
+ if (ipi_mux_domain)
+ return -EEXIST;
+
+ if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)
+ return -EINVAL;
+
+ ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu));
+ if (!ipi_mux_pcpu)
+ return -ENOMEM;
+
+ fwnode = irq_domain_alloc_named_fwnode("IPI-Mux");
+ if (!fwnode) {
+ pr_err("unable to create IPI Mux fwnode\n");
+ rc = -ENOMEM;
+ goto fail_free_cpu;
+ }
+
+ domain = irq_domain_create_linear(fwnode, nr_ipi,
+ &ipi_mux_domain_ops, NULL);
+ if (!domain) {
+ pr_err("unable to add IPI Mux domain\n");
+ rc = -ENOMEM;
+ goto fail_free_fwnode;
+ }
+
+ domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
+ irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI);
+
+ rc = irq_domain_alloc_irqs(domain, nr_ipi, NUMA_NO_NODE, NULL);
+ if (rc <= 0) {
+ pr_err("unable to alloc IRQs from IPI Mux domain\n");
+ goto fail_free_domain;
+ }
+
+ ipi_mux_domain = domain;
+ ipi_mux_send = mux_send;
+
+ return rc;
+
+fail_free_domain:
+ irq_domain_remove(domain);
+fail_free_fwnode:
+ irq_domain_free_fwnode(fwnode);
+fail_free_cpu:
+ free_percpu(ipi_mux_pcpu);
+ return rc;
+}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8fe1da9614ee..aa5b7eeeceb8 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -25,6 +25,9 @@ static DEFINE_MUTEX(irq_domain_mutex);
static struct irq_domain *irq_default_domain;
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity);
static void irq_domain_check_hierarchy(struct irq_domain *domain);
struct irqchip_fwid {
@@ -114,7 +117,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
{
struct irqchip_fwid *fwid;
- if (WARN_ON(!is_fwnode_irqchip(fwnode)))
+ if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode)))
return;
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
@@ -123,23 +126,12 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
}
EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
-/**
- * __irq_domain_add() - Allocate a new irq_domain data structure
- * @fwnode: firmware node for the interrupt controller
- * @size: Size of linear map; 0 for radix mapping only
- * @hwirq_max: Maximum number of interrupts supported by controller
- * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
- * direct mapping
- * @ops: domain callbacks
- * @host_data: Controller private data pointer
- *
- * Allocates and initializes an irq_domain structure.
- * Returns pointer to IRQ domain, or NULL on failure.
- */
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
- irq_hw_number_t hwirq_max, int direct_max,
- const struct irq_domain_ops *ops,
- void *host_data)
+static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,
+ unsigned int size,
+ irq_hw_number_t hwirq_max,
+ int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
{
struct irqchip_fwid *fwid;
struct irq_domain *domain;
@@ -214,25 +206,66 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int s
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
- mutex_init(&domain->revmap_mutex);
domain->ops = ops;
domain->host_data = host_data;
domain->hwirq_max = hwirq_max;
- if (direct_max) {
+ if (direct_max)
domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP;
- }
domain->revmap_size = size;
+ /*
+ * Hierarchical domains use the domain lock of the root domain
+ * (innermost domain).
+ *
+ * For non-hierarchical domains (as for root domains), the root
+ * pointer is set to the domain itself so that &domain->root->mutex
+ * always points to the right lock.
+ */
+ mutex_init(&domain->mutex);
+ domain->root = domain;
+
irq_domain_check_hierarchy(domain);
+ return domain;
+}
+
+static void __irq_domain_publish(struct irq_domain *domain)
+{
mutex_lock(&irq_domain_mutex);
debugfs_add_domain_dir(domain);
list_add(&domain->link, &irq_domain_list);
mutex_unlock(&irq_domain_mutex);
pr_debug("Added domain %s\n", domain->name);
+}
+
+/**
+ * __irq_domain_add() - Allocate a new irq_domain data structure
+ * @fwnode: firmware node for the interrupt controller
+ * @size: Size of linear map; 0 for radix mapping only
+ * @hwirq_max: Maximum number of interrupts supported by controller
+ * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
+ * direct mapping
+ * @ops: domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * Allocates and initializes an irq_domain structure.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
+ irq_hw_number_t hwirq_max, int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ domain = __irq_domain_create(fwnode, size, hwirq_max, direct_max,
+ ops, host_data);
+ if (domain)
+ __irq_domain_publish(domain);
+
return domain;
}
EXPORT_SYMBOL_GPL(__irq_domain_add);
@@ -502,30 +535,34 @@ static bool irq_domain_is_nomap(struct irq_domain *domain)
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
+ lockdep_assert_held(&domain->root->mutex);
+
if (irq_domain_is_nomap(domain))
return;
- mutex_lock(&domain->revmap_mutex);
if (hwirq < domain->revmap_size)
rcu_assign_pointer(domain->revmap[hwirq], NULL);
else
radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_set_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq,
struct irq_data *irq_data)
{
+ /*
+ * This also makes sure that all domains point to the same root when
+ * called from irq_domain_insert_irq() for each domain in a hierarchy.
+ */
+ lockdep_assert_held(&domain->root->mutex);
+
if (irq_domain_is_nomap(domain))
return;
- mutex_lock(&domain->revmap_mutex);
if (hwirq < domain->revmap_size)
rcu_assign_pointer(domain->revmap[hwirq], irq_data);
else
radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
@@ -538,6 +575,9 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
return;
hwirq = irq_data->hwirq;
+
+ mutex_lock(&domain->root->mutex);
+
irq_set_status_flags(irq, IRQ_NOREQUEST);
/* remove chip and handler */
@@ -557,10 +597,12 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
/* Clear reverse map for this hwirq */
irq_domain_clear_mapping(domain, hwirq);
+
+ mutex_unlock(&domain->root->mutex);
}
-int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq)
+static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
{
struct irq_data *irq_data = irq_get_irq_data(virq);
int ret;
@@ -573,7 +615,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
irq_data->hwirq = hwirq;
irq_data->domain = domain;
if (domain->ops->map) {
@@ -590,23 +631,29 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
}
irq_data->domain = NULL;
irq_data->hwirq = 0;
- mutex_unlock(&irq_domain_mutex);
return ret;
}
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && irq_data->chip)
- domain->name = irq_data->chip->name;
}
domain->mapcount++;
irq_domain_set_mapping(domain, hwirq, irq_data);
- mutex_unlock(&irq_domain_mutex);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
return 0;
}
+
+int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ int ret;
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_associate_locked(domain, virq, hwirq);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(irq_domain_associate);
void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
@@ -619,9 +666,8 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
of_node_full_name(of_node), irq_base, (int)hwirq_base, count);
- for (i = 0; i < count; i++) {
+ for (i = 0; i < count; i++)
irq_domain_associate(domain, irq_base + i, hwirq_base + i);
- }
}
EXPORT_SYMBOL_GPL(irq_domain_associate_many);
@@ -668,6 +714,34 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
#endif
+static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
+{
+ struct device_node *of_node = irq_domain_get_of_node(domain);
+ int virq;
+
+ pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
+
+ /* Allocate a virtual interrupt number */
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
+ affinity);
+ if (virq <= 0) {
+ pr_debug("-> virq allocation failed\n");
+ return 0;
+ }
+
+ if (irq_domain_associate_locked(domain, virq, hwirq)) {
+ irq_free_desc(virq);
+ return 0;
+ }
+
+ pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
+ hwirq, of_node_full_name(of_node), virq);
+
+ return virq;
+}
+
/**
* irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space
* @domain: domain owning this hardware interrupt or NULL for default domain
@@ -680,14 +754,11 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
* on the number returned from that call.
*/
unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
- irq_hw_number_t hwirq,
- const struct irq_affinity_desc *affinity)
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
{
- struct device_node *of_node;
int virq;
- pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
-
/* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
@@ -695,32 +766,19 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
return 0;
}
- pr_debug("-> using domain @%p\n", domain);
- of_node = irq_domain_get_of_node(domain);
+ mutex_lock(&domain->root->mutex);
/* Check if mapping already exists */
virq = irq_find_mapping(domain, hwirq);
if (virq) {
- pr_debug("-> existing mapping on virq %d\n", virq);
- return virq;
- }
-
- /* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
- affinity);
- if (virq <= 0) {
- pr_debug("-> virq allocation failed\n");
- return 0;
- }
-
- if (irq_domain_associate(domain, virq, hwirq)) {
- irq_free_desc(virq);
- return 0;
+ pr_debug("existing mapping on virq %d\n", virq);
+ goto out;
}
- pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
- hwirq, of_node_full_name(of_node), virq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity);
+out:
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -789,6 +847,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
type &= IRQ_TYPE_SENSE_MASK;
+ mutex_lock(&domain->root->mutex);
+
/*
* If we've already configured this interrupt,
* don't do it again, or hell will break loose.
@@ -801,7 +861,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
* interrupt number.
*/
if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
- return virq;
+ goto out;
/*
* If the trigger type has not been set yet, then set
@@ -809,40 +869,45 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
*/
if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
irq_data = irq_get_irq_data(virq);
- if (!irq_data)
- return 0;
+ if (!irq_data) {
+ virq = 0;
+ goto out;
+ }
irqd_set_trigger_type(irq_data, type);
- return virq;
+ goto out;
}
pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
- return 0;
+ virq = 0;
+ goto out;
}
if (irq_domain_is_hierarchy(domain)) {
- virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
- if (virq <= 0)
- return 0;
+ virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE,
+ fwspec, false, NULL);
+ if (virq <= 0) {
+ virq = 0;
+ goto out;
+ }
} else {
/* Create mapping */
- virq = irq_create_mapping(domain, hwirq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL);
if (!virq)
- return virq;
+ goto out;
}
irq_data = irq_get_irq_data(virq);
- if (!irq_data) {
- if (irq_domain_is_hierarchy(domain))
- irq_domain_free_irqs(virq, 1);
- else
- irq_dispose_mapping(virq);
- return 0;
+ if (WARN_ON(!irq_data)) {
+ virq = 0;
+ goto out;
}
/* Store trigger type */
irqd_set_trigger_type(irq_data, type);
+out:
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -1102,12 +1167,16 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
struct irq_domain *domain;
if (size)
- domain = irq_domain_create_linear(fwnode, size, ops, host_data);
+ domain = __irq_domain_create(fwnode, size, size, 0, ops, host_data);
else
- domain = irq_domain_create_tree(fwnode, ops, host_data);
+ domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data);
+
if (domain) {
+ domain->root = parent->root;
domain->parent = parent;
domain->flags |= flags;
+
+ __irq_domain_publish(domain);
}
return domain;
@@ -1123,10 +1192,6 @@ static void irq_domain_insert_irq(int virq)
domain->mapcount++;
irq_domain_set_mapping(domain, data->hwirq, data);
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && data->chip)
- domain->name = data->chip->name;
}
irq_clear_status_flags(virq, IRQ_NOREQUEST);
@@ -1426,40 +1491,12 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
}
-/**
- * __irq_domain_alloc_irqs - Allocate IRQs from domain
- * @domain: domain to allocate from
- * @irq_base: allocate specified IRQ number if irq_base >= 0
- * @nr_irqs: number of IRQs to allocate
- * @node: NUMA node id for memory allocation
- * @arg: domain specific argument
- * @realloc: IRQ descriptors have already been allocated if true
- * @affinity: Optional irq affinity mask for multiqueue devices
- *
- * Allocate IRQ numbers and initialized all data structures to support
- * hierarchy IRQ domains.
- * Parameter @realloc is mainly to support legacy IRQs.
- * Returns error code or allocated IRQ number
- *
- * The whole process to setup an IRQ has been split into two steps.
- * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
- * descriptor and required hardware resources. The second step,
- * irq_domain_activate_irq(), is to program the hardware with preallocated
- * resources. In this way, it's easier to rollback when failing to
- * allocate resources.
- */
-int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
- unsigned int nr_irqs, int node, void *arg,
- bool realloc, const struct irq_affinity_desc *affinity)
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
{
int i, ret, virq;
- if (domain == NULL) {
- domain = irq_default_domain;
- if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
- return -EINVAL;
- }
-
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
@@ -1478,24 +1515,18 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
goto out_free_desc;
}
- mutex_lock(&irq_domain_mutex);
ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg);
- if (ret < 0) {
- mutex_unlock(&irq_domain_mutex);
+ if (ret < 0)
goto out_free_irq_data;
- }
for (i = 0; i < nr_irqs; i++) {
ret = irq_domain_trim_hierarchy(virq + i);
- if (ret) {
- mutex_unlock(&irq_domain_mutex);
+ if (ret)
goto out_free_irq_data;
- }
}
-
+
for (i = 0; i < nr_irqs; i++)
irq_domain_insert_irq(virq + i);
- mutex_unlock(&irq_domain_mutex);
return virq;
@@ -1505,6 +1536,48 @@ out_free_desc:
irq_free_descs(virq, nr_irqs);
return ret;
}
+
+/**
+ * __irq_domain_alloc_irqs - Allocate IRQs from domain
+ * @domain: domain to allocate from
+ * @irq_base: allocate specified IRQ number if irq_base >= 0
+ * @nr_irqs: number of IRQs to allocate
+ * @node: NUMA node id for memory allocation
+ * @arg: domain specific argument
+ * @realloc: IRQ descriptors have already been allocated if true
+ * @affinity: Optional irq affinity mask for multiqueue devices
+ *
+ * Allocate IRQ numbers and initialized all data structures to support
+ * hierarchy IRQ domains.
+ * Parameter @realloc is mainly to support legacy IRQs.
+ * Returns error code or allocated IRQ number
+ *
+ * The whole process to setup an IRQ has been split into two steps.
+ * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
+ * descriptor and required hardware resources. The second step,
+ * irq_domain_activate_irq(), is to program the hardware with preallocated
+ * resources. In this way, it's easier to rollback when failing to
+ * allocate resources.
+ */
+int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ int ret;
+
+ if (domain == NULL) {
+ domain = irq_default_domain;
+ if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
+ return -EINVAL;
+ }
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg,
+ realloc, affinity);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);
/* The irq_data was moved, fix the revmap to refer to the new location */
@@ -1512,11 +1585,12 @@ static void irq_domain_fix_revmap(struct irq_data *d)
{
void __rcu **slot;
+ lockdep_assert_held(&d->domain->root->mutex);
+
if (irq_domain_is_nomap(d->domain))
return;
/* Fix up the revmap. */
- mutex_lock(&d->domain->revmap_mutex);
if (d->hwirq < d->domain->revmap_size) {
/* Not using radix tree */
rcu_assign_pointer(d->domain->revmap[d->hwirq], d);
@@ -1525,7 +1599,6 @@ static void irq_domain_fix_revmap(struct irq_data *d)
if (slot)
radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
}
- mutex_unlock(&d->domain->revmap_mutex);
}
/**
@@ -1541,8 +1614,8 @@ static void irq_domain_fix_revmap(struct irq_data *d)
*/
int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
{
- struct irq_data *child_irq_data;
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_desc *desc;
int rv = 0;
@@ -1567,47 +1640,46 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (WARN_ON(!irq_domain_is_hierarchy(domain)))
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
- if (domain->parent != root_irq_data->domain)
+ if (domain->parent != irq_data->domain)
return -EINVAL;
- child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL,
- irq_data_get_node(root_irq_data));
- if (!child_irq_data)
+ parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL,
+ irq_data_get_node(irq_data));
+ if (!parent_irq_data)
return -ENOMEM;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
/* Copy the original irq_data. */
- *child_irq_data = *root_irq_data;
+ *parent_irq_data = *irq_data;
/*
- * Overwrite the root_irq_data, which is embedded in struct
- * irq_desc, with values for this domain.
+ * Overwrite the irq_data, which is embedded in struct irq_desc, with
+ * values for this domain.
*/
- root_irq_data->parent_data = child_irq_data;
- root_irq_data->domain = domain;
- root_irq_data->mask = 0;
- root_irq_data->hwirq = 0;
- root_irq_data->chip = NULL;
- root_irq_data->chip_data = NULL;
+ irq_data->parent_data = parent_irq_data;
+ irq_data->domain = domain;
+ irq_data->mask = 0;
+ irq_data->hwirq = 0;
+ irq_data->chip = NULL;
+ irq_data->chip_data = NULL;
/* May (probably does) set hwirq, chip, etc. */
rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (rv) {
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
- kfree(child_irq_data);
+ *irq_data = *parent_irq_data;
+ kfree(parent_irq_data);
goto error;
}
- irq_domain_fix_revmap(child_irq_data);
- irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data);
-
+ irq_domain_fix_revmap(parent_irq_data);
+ irq_domain_set_mapping(domain, irq_data->hwirq, irq_data);
error:
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
return rv;
}
@@ -1623,8 +1695,8 @@ EXPORT_SYMBOL_GPL(irq_domain_push_irq);
*/
int irq_domain_pop_irq(struct irq_domain *domain, int virq)
{
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
- struct irq_data *child_irq_data;
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_data *tmp_irq_data;
struct irq_desc *desc;
@@ -1646,37 +1718,37 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq)
if (domain == NULL)
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
tmp_irq_data = irq_domain_get_irq_data(domain, virq);
/* We can only "pop" if this domain is at the top of the list */
- if (WARN_ON(root_irq_data != tmp_irq_data))
+ if (WARN_ON(irq_data != tmp_irq_data))
return -EINVAL;
- if (WARN_ON(root_irq_data->domain != domain))
+ if (WARN_ON(irq_data->domain != domain))
return -EINVAL;
- child_irq_data = root_irq_data->parent_data;
- if (WARN_ON(!child_irq_data))
+ parent_irq_data = irq_data->parent_data;
+ if (WARN_ON(!parent_irq_data))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
- root_irq_data->parent_data = NULL;
+ irq_data->parent_data = NULL;
- irq_domain_clear_mapping(domain, root_irq_data->hwirq);
+ irq_domain_clear_mapping(domain, irq_data->hwirq);
irq_domain_free_irqs_hierarchy(domain, virq, 1);
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
+ *irq_data = *parent_irq_data;
- irq_domain_fix_revmap(root_irq_data);
+ irq_domain_fix_revmap(irq_data);
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
- kfree(child_irq_data);
+ kfree(parent_irq_data);
return 0;
}
@@ -1690,17 +1762,20 @@ EXPORT_SYMBOL_GPL(irq_domain_pop_irq);
void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_domain *domain;
int i;
if (WARN(!data || !data->domain || !data->domain->ops->free,
"NULL pointer, cannot free irq\n"))
return;
- mutex_lock(&irq_domain_mutex);
+ domain = data->domain;
+
+ mutex_lock(&domain->root->mutex);
for (i = 0; i < nr_irqs; i++)
irq_domain_remove_irq(virq + i);
- irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs);
- mutex_unlock(&irq_domain_mutex);
+ irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs);
+ mutex_unlock(&domain->root->mutex);
irq_domain_free_irq_data(virq, nr_irqs);
irq_free_descs(virq, nr_irqs);
@@ -1865,6 +1940,13 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
irq_set_handler_data(virq, handler_data);
}
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ return -EINVAL;
+}
+
static void irq_domain_check_hierarchy(struct irq_domain *domain)
{
}
@@ -1915,7 +1997,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- debugfs_remove(debugfs_lookup(d->name, domain_dir));
+ debugfs_lookup_and_remove(d->name, domain_dir);
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5b7cf28df290..8ce75495e04f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -723,10 +723,13 @@ EXPORT_SYMBOL(disable_irq_nosync);
* to complete before returning. If you use this function while
* holding a resource the IRQ handler may need you will deadlock.
*
- * This function may be called - with care - from IRQ context.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
+ *
*/
void disable_irq(unsigned int irq)
{
+ might_sleep();
if (!__disable_irq_nosync(irq))
synchronize_irq(irq);
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 955267bbc2be..783a3e6a0b10 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1000,7 +1000,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
fail:
msi_unlock_descs(dev);
free_fwnode:
- kfree(fwnode);
+ irq_domain_free_fwnode(fwnode);
free_bundle:
kfree(bundle);
return false;
@@ -1013,6 +1013,7 @@ free_bundle:
*/
void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
{
+ struct fwnode_handle *fwnode = NULL;
struct msi_domain_info *info;
struct irq_domain *domain;
@@ -1025,7 +1026,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
dev->msi.data->__domains[domid].domain = NULL;
info = domain->host_data;
+ if (irq_domain_is_msi_device(domain))
+ fwnode = domain->fwnode;
irq_domain_remove(domain);
+ irq_domain_free_fwnode(fwnode);
kfree(container_of(info, struct msi_domain_template, info));
unlock:
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index f35d9cc1aab1..bfbc12da3326 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -157,14 +157,11 @@ static void test_kallsyms_compression_ratio(void)
static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr)
{
u64 t0, t1, t;
- unsigned long flags;
struct test_stat *stat = (struct test_stat *)data;
- local_irq_save(flags);
- t0 = sched_clock();
+ t0 = ktime_get_ns();
(void)kallsyms_lookup_name(name);
- t1 = sched_clock();
- local_irq_restore(flags);
+ t1 = ktime_get_ns();
t = t1 - t0;
if (t < stat->min)
@@ -234,18 +231,15 @@ static int find_symbol(void *data, const char *name, struct module *mod, unsigne
static void test_perf_kallsyms_on_each_symbol(void)
{
u64 t0, t1;
- unsigned long flags;
struct test_stat stat;
memset(&stat, 0, sizeof(stat));
stat.max = INT_MAX;
stat.name = stub_name;
stat.perf = 1;
- local_irq_save(flags);
- t0 = sched_clock();
+ t0 = ktime_get_ns();
kallsyms_on_each_symbol(find_symbol, &stat);
- t1 = sched_clock();
- local_irq_restore(flags);
+ t1 = ktime_get_ns();
pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0);
}
@@ -270,17 +264,14 @@ static int match_symbol(void *data, unsigned long addr)
static void test_perf_kallsyms_on_each_match_symbol(void)
{
u64 t0, t1;
- unsigned long flags;
struct test_stat stat;
memset(&stat, 0, sizeof(stat));
stat.max = INT_MAX;
stat.name = stub_name;
- local_irq_save(flags);
- t0 = sched_clock();
+ t0 = ktime_get_ns();
kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat);
- t1 = sched_clock();
- local_irq_restore(flags);
+ t1 = ktime_get_ns();
pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0);
}
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index dcec1b743c69..a60c561724be 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -159,7 +159,7 @@ static bool __report_matches(const struct expect_report *r)
const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
bool ret = false;
unsigned long flags;
- typeof(observed.lines) expect;
+ typeof(*observed.lines) *expect;
const char *end;
char *cur;
int i;
@@ -168,6 +168,10 @@ static bool __report_matches(const struct expect_report *r)
if (!report_available())
return false;
+ expect = kmalloc(sizeof(observed.lines), GFP_KERNEL);
+ if (WARN_ON(!expect))
+ return false;
+
/* Generate expected report contents. */
/* Title */
@@ -253,6 +257,7 @@ static bool __report_matches(const struct expect_report *r)
strstr(observed.lines[2], expect[1])));
out:
spin_unlock_irqrestore(&observed.lock, flags);
+ kfree(expect);
return ret;
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e3375bc40dad..50d4863974e7 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -55,6 +55,7 @@
#include <linux/rcupdate.h>
#include <linux/kprobes.h>
#include <linux/lockdep.h>
+#include <linux/context_tracking.h>
#include <asm/sections.h>
@@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
int dl = READ_ONCE(debug_locks);
+ bool rcu = warn_rcu_enter();
/* Note: the following can be executed concurrently, so be careful. */
pr_warn("\n");
@@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
lockdep_print_held_locks(curr);
pr_warn("\nstack backtrace:\n");
dump_stack();
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 2b23378775fe..ebe6b8ec7cb3 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -371,7 +371,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
/*
* We're pending, wait for the owner to go away.
*
- * 0,1,1 -> 0,1,0
+ * 0,1,1 -> *,1,0
*
* this wait loop must be a load-acquire such that we match the
* store-release that clears the locked bit and create lock
@@ -380,7 +380,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* barriers.
*/
if (val & _Q_LOCKED_MASK)
- atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->locked, !VAL);
/*
* take ownership and clear the pending bit.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 010cf4e6d0b8..728f434de2bb 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -901,8 +901,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* then we need to wake the new top waiter up to try
* to get the lock.
*/
- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_state(waiter->task, waiter->wake_state);
+ top_waiter = rt_mutex_top_waiter(lock);
+ if (prerequeue_top_waiter != top_waiter)
+ wake_up_state(top_waiter->task, top_waiter->wake_state);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 44873594de03..acb5a50309a1 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -256,16 +256,13 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
{
long tmp = RWSEM_UNLOCKED_VALUE;
- bool ret = false;
- preempt_disable();
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
rwsem_set_owner(sem);
- ret = true;
+ return true;
}
- preempt_enable();
- return ret;
+ return false;
}
/*
@@ -624,18 +621,16 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
*/
if (first->handoff_set && (waiter != first))
return false;
-
- /*
- * First waiter can inherit a previously set handoff
- * bit and spin on rwsem if lock acquisition fails.
- */
- if (waiter == first)
- waiter->handoff_set = true;
}
new = count;
if (count & RWSEM_LOCK_MASK) {
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
if (has_handoff || (!rt_task(waiter->task) &&
!time_after(jiffies, waiter->timeout)))
return false;
@@ -651,11 +646,12 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
/*
- * We have either acquired the lock with handoff bit cleared or
- * set the handoff bit.
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
*/
if (new & RWSEM_FLAG_HANDOFF) {
- waiter->handoff_set = true;
+ first->handoff_set = true;
lockevent_inc(rwsem_wlock_handoff);
return false;
}
@@ -717,7 +713,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
return false;
}
- preempt_disable();
/*
* Disable preemption is equal to the RCU read-side crital section,
* thus the task_strcut structure won't go away.
@@ -729,7 +724,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
if ((flags & RWSEM_NONSPINNABLE) ||
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !ret);
return ret;
@@ -829,8 +823,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
int loop = 0;
u64 rspin_threshold = 0;
- preempt_disable();
-
/* sem->wait_lock should not be held when doing optimistic spinning */
if (!osq_lock(&sem->osq))
goto done;
@@ -938,7 +930,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
}
osq_unlock(&sem->osq);
done:
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !taken);
return taken;
}
@@ -1092,7 +1083,7 @@ queue:
/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
break;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_reader);
}
@@ -1179,15 +1170,12 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
if (waiter.handoff_set) {
enum owner_state owner_state;
- preempt_disable();
owner_state = rwsem_spin_on_owner(sem);
- preempt_enable();
-
if (owner_state == OWNER_NULL)
goto trylock_again;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_writer);
set_current_state(state);
trylock_again:
@@ -1254,14 +1242,20 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
*/
static inline int __down_read_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
long count;
+ preempt_disable();
if (!rwsem_read_trylock(sem, &count)) {
- if (IS_ERR(rwsem_down_read_slowpath(sem, count, state)))
- return -EINTR;
+ if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
+ ret = -EINTR;
+ goto out;
+ }
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
}
- return 0;
+out:
+ preempt_enable();
+ return ret;
}
static inline void __down_read(struct rw_semaphore *sem)
@@ -1281,19 +1275,23 @@ static inline int __down_read_killable(struct rw_semaphore *sem)
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
+ int ret = 0;
long tmp;
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ preempt_disable();
tmp = atomic_long_read(&sem->count);
while (!(tmp & RWSEM_READ_FAILED_MASK)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
tmp + RWSEM_READER_BIAS)) {
rwsem_set_reader_owned(sem);
- return 1;
+ ret = 1;
+ break;
}
}
- return 0;
+ preempt_enable();
+ return ret;
}
/*
@@ -1301,12 +1299,15 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
*/
static inline int __down_write_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
+
+ preempt_disable();
if (unlikely(!rwsem_write_trylock(sem))) {
if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
- return -EINTR;
+ ret = -EINTR;
}
-
- return 0;
+ preempt_enable();
+ return ret;
}
static inline void __down_write(struct rw_semaphore *sem)
@@ -1321,8 +1322,14 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
+ int ret;
+
+ preempt_disable();
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
- return rwsem_write_trylock(sem);
+ ret = rwsem_write_trylock(sem);
+ preempt_enable();
+
+ return ret;
}
/*
@@ -1335,6 +1342,7 @@ static inline void __up_read(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ preempt_disable();
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
@@ -1343,6 +1351,7 @@ static inline void __up_read(struct rw_semaphore *sem)
clear_nonspinnable(sem);
rwsem_wake(sem);
}
+ preempt_enable();
}
/*
@@ -1363,9 +1372,9 @@ static inline void __up_write(struct rw_semaphore *sem)
preempt_disable();
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
- preempt_enable();
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
rwsem_wake(sem);
+ preempt_enable();
}
/*
@@ -1383,11 +1392,13 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
* write side. As such, rely on RELEASE semantics.
*/
DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ preempt_disable();
tmp = atomic_long_fetch_add_release(
-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
rwsem_set_reader_owned(sem);
if (tmp & RWSEM_FLAG_WAITERS)
rwsem_downgrade_wake(sem);
+ preempt_enable();
}
#else /* !CONFIG_PREEMPT_RT */
@@ -1662,6 +1673,12 @@ void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
__down_read(sem);
+ /*
+ * The owner value for a reader-owned lock is mostly for debugging
+ * purpose only and is not critical to the correct functioning of
+ * rwsem. So it is perfectly fine to set it in a preempt-enabled
+ * context here.
+ */
__rwsem_set_reader_owned(sem, NULL);
}
EXPORT_SYMBOL(down_read_non_owner);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 48568a0f5651..4ac3fe43e6c8 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2393,7 +2393,8 @@ static bool finished_loading(const char *name)
sched_annotate_sleep();
mutex_lock(&module_mutex);
mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE;
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
mutex_unlock(&module_mutex);
return ret;
@@ -2569,20 +2570,35 @@ static int add_unformed_module(struct module *mod)
mod->state = MODULE_STATE_UNFORMED;
-again:
mutex_lock(&module_mutex);
old = find_module_all(mod->name, strlen(mod->name), true);
if (old != NULL) {
- if (old->state != MODULE_STATE_LIVE) {
+ if (old->state == MODULE_STATE_COMING
+ || old->state == MODULE_STATE_UNFORMED) {
/* Wait in case it fails to load. */
mutex_unlock(&module_mutex);
err = wait_event_interruptible(module_wq,
finished_loading(mod->name));
if (err)
goto out_unlocked;
- goto again;
+
+ /* The module might have gone in the meantime. */
+ mutex_lock(&module_mutex);
+ old = find_module_all(mod->name, strlen(mod->name),
+ true);
}
- err = -EEXIST;
+
+ /*
+ * We are here only when the same module was being loaded. Do
+ * not try to load it again right now. It prevents long delays
+ * caused by serialized module load failures. It might happen
+ * when more devices of the same type trigger load of
+ * a particular module.
+ */
+ if (old && old->state == MODULE_STATE_LIVE)
+ err = -EEXIST;
+ else
+ err = -EBUSY;
goto out;
}
mod_update_bounds(mod);
diff --git a/kernel/panic.c b/kernel/panic.c
index 463c9295bc28..487f5b03bf83 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@
#include <linux/ratelimit.h>
#include <linux/debugfs.h>
#include <linux/sysfs.h>
+#include <linux/context_tracking.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>
@@ -679,6 +680,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
void warn_slowpath_fmt(const char *file, int line, unsigned taint,
const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
struct warn_args args;
pr_warn(CUT_HERE);
@@ -693,11 +695,13 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
va_start(args.args, fmt);
__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
#else
void __warn_printk(const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
va_list args;
pr_warn(CUT_HERE);
@@ -705,6 +709,7 @@ void __warn_printk(const char *fmt, ...)
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(__warn_printk);
#endif
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7decf1e9c486..94f136b25f6a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -123,6 +123,7 @@ bool console_srcu_read_lock_is_held(void)
{
return srcu_read_lock_held(&console_srcu);
}
+EXPORT_SYMBOL(console_srcu_read_lock_is_held);
#endif
enum devkmsg_log_bits {
@@ -1891,6 +1892,7 @@ static void console_lock_spinning_enable(void)
/**
* console_lock_spinning_disable_and_check - mark end of code where another
* thread was able to busy wait and check if there is a waiter
+ * @cookie: cookie returned from console_srcu_read_lock()
*
* This is called at the end of the section where spinning is allowed.
* It has two functions. First, it is a signal that it is no longer
@@ -2194,7 +2196,7 @@ static u16 printk_sprint(char *text, u16 size, int facility,
}
}
- trace_console_rcuidle(text, text_len);
+ trace_console(text, text_len);
return text_len;
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54482193e1ed..0786450074c1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -813,7 +813,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
{
struct ptrace_rseq_configuration conf = {
.rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
- .rseq_abi_size = sizeof(*task->rseq),
+ .rseq_abi_size = task->rseq_len,
.signature = task->rseq_sig,
.flags = 0,
};
diff --git a/kernel/rseq.c b/kernel/rseq.c
index d38ab944105d..9de6e35fe679 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -18,6 +18,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
@@ -82,15 +85,25 @@
* F1. <failure>
*/
-static int rseq_update_cpu_id(struct task_struct *t)
+static int rseq_update_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id = raw_smp_processor_id();
struct rseq __user *rseq = t->rseq;
+ u32 cpu_id = raw_smp_processor_id();
+ u32 node_id = cpu_to_node(cpu_id);
+ u32 mm_cid = task_mm_cid(t);
- if (!user_write_access_begin(rseq, sizeof(*rseq)))
+ WARN_ON_ONCE((int) mm_cid < 0);
+ if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ unsafe_put_user(node_id, &rseq->node_id, efault_end);
+ unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end);
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally updated only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
user_write_access_end();
trace_rseq_update(t);
return 0;
@@ -101,9 +114,10 @@ efault:
return -EFAULT;
}
-static int rseq_reset_rseq_cpu_id(struct task_struct *t)
+static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+ u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
+ mm_cid = 0;
/*
* Reset cpu_id_start to its initial state (0).
@@ -117,6 +131,21 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
*/
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
+ /*
+ * Reset node_id to its initial state (0).
+ */
+ if (put_user(node_id, &t->rseq->node_id))
+ return -EFAULT;
+ /*
+ * Reset mm_cid to its initial state (0).
+ */
+ if (put_user(mm_cid, &t->rseq->mm_cid))
+ return -EFAULT;
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally reset only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
return 0;
}
@@ -301,7 +330,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(ret < 0))
goto error;
}
- if (unlikely(rseq_update_cpu_id(t)))
+ if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
return;
@@ -344,15 +373,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
- if (rseq_len != sizeof(*rseq))
+ if (rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_id(current);
+ ret = rseq_reset_rseq_cpu_node_id(current);
if (ret)
return ret;
current->rseq = NULL;
current->rseq_sig = 0;
+ current->rseq_len = 0;
return 0;
}
@@ -365,7 +395,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != sizeof(*rseq))
+ if (current->rseq != rseq || rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -374,15 +404,24 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
}
/*
- * If there was no rseq previously registered,
- * ensure the provided rseq is properly aligned and valid.
+ * If there was no rseq previously registered, ensure the provided rseq
+ * is properly aligned, as communcated to user-space through the ELF
+ * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
+ * size, the required alignment is the original struct rseq alignment.
+ *
+ * In order to be valid, rseq_len is either the original rseq size, or
+ * large enough to contain all supported fields, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
*/
- if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
- rseq_len != sizeof(*rseq))
+ if (rseq_len < ORIG_RSEQ_SIZE ||
+ (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
+ (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+ rseq_len < offsetof(struct rseq, end))))
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
+ current->rseq_len = rseq_len;
current->rseq_sig = sig;
/*
* If rseq was previously inactive, and has just been
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e374c0c923da..5732fa75ebab 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -93,7 +93,7 @@ struct sched_clock_data {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-notrace static inline struct sched_clock_data *this_scd(void)
+static __always_inline struct sched_clock_data *this_scd(void)
{
return this_cpu_ptr(&sched_clock_data);
}
@@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late);
* min, max except they take wrapping into account
*/
-notrace static inline u64 wrap_min(u64 x, u64 y)
+static __always_inline u64 wrap_min(u64 x, u64 y)
{
return (s64)(x - y) < 0 ? x : y;
}
-notrace static inline u64 wrap_max(u64 x, u64 y)
+static __always_inline u64 wrap_max(u64 x, u64 y)
{
return (s64)(x - y) > 0 ? x : y;
}
@@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
-notrace static u64 sched_clock_local(struct sched_clock_data *scd)
+static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
{
u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
@@ -287,13 +287,28 @@ again:
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (!try_cmpxchg64(&scd->clock, &old_clock, clock))
+ if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
}
-notrace static u64 sched_clock_remote(struct sched_clock_data *scd)
+noinstr u64 local_clock(void)
+{
+ u64 clock;
+
+ if (static_branch_likely(&__sched_clock_stable))
+ return sched_clock() + __sched_clock_offset;
+
+ preempt_disable_notrace();
+ clock = sched_clock_local(this_scd());
+ preempt_enable_notrace();
+
+ return clock;
+}
+EXPORT_SYMBOL_GPL(local_clock);
+
+static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
{
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25b582b6ee5f..fb49dbf61273 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -152,7 +152,7 @@ __read_mostly int scheduler_running;
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
/* kernel prio, less is more */
-static inline int __task_prio(struct task_struct *p)
+static inline int __task_prio(const struct task_struct *p)
{
if (p->sched_class == &stop_sched_class) /* trumps deadline */
return -2;
@@ -174,7 +174,8 @@ static inline int __task_prio(struct task_struct *p)
*/
/* real prio, less is less */
-static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+static inline bool prio_less(const struct task_struct *a,
+ const struct task_struct *b, bool in_fi)
{
int pa = __task_prio(a), pb = __task_prio(b);
@@ -194,7 +195,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool
return false;
}
-static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
+static inline bool __sched_core_less(const struct task_struct *a,
+ const struct task_struct *b)
{
if (a->core_cookie < b->core_cookie)
return true;
@@ -2604,27 +2606,71 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
.user_mask = NULL,
.flags = SCA_USER, /* clear the user requested mask */
};
+ union cpumask_rcuhead {
+ cpumask_t cpumask;
+ struct rcu_head rcu;
+ };
__do_set_cpus_allowed(p, &ac);
- kfree(ac.user_mask);
+
+ /*
+ * Because this is called with p->pi_lock held, it is not possible
+ * to use kfree() here (when PREEMPT_RT=y), therefore punt to using
+ * kfree_rcu().
+ */
+ kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
+}
+
+static cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ /*
+ * See do_set_cpus_allowed() above for the rcu_head usage.
+ */
+ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
+
+ return kmalloc_node(size, GFP_KERNEL, node);
}
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
+ cpumask_t *user_mask;
unsigned long flags;
- if (!src->user_cpus_ptr)
+ /*
+ * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
+ * may differ by now due to racing.
+ */
+ dst->user_cpus_ptr = NULL;
+
+ /*
+ * This check is racy and losing the race is a valid situation.
+ * It is not worth the extra overhead of taking the pi_lock on
+ * every fork/clone.
+ */
+ if (data_race(!src->user_cpus_ptr))
return 0;
- dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
- if (!dst->user_cpus_ptr)
+ user_mask = alloc_user_cpus_ptr(node);
+ if (!user_mask)
return -ENOMEM;
- /* Use pi_lock to protect content of user_cpus_ptr */
+ /*
+ * Use pi_lock to protect content of user_cpus_ptr
+ *
+ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
+ * do_set_cpus_allowed().
+ */
raw_spin_lock_irqsave(&src->pi_lock, flags);
- cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ if (src->user_cpus_ptr) {
+ swap(dst->user_cpus_ptr, user_mask);
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ }
raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+
+ if (unlikely(user_mask))
+ kfree(user_mask);
+
return 0;
}
@@ -2907,8 +2953,11 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
}
if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
- if (cpumask_equal(&p->cpus_mask, ctx->new_mask))
+ if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
goto out;
+ }
if (WARN_ON_ONCE(p == current &&
is_migration_disabled(p) &&
@@ -3581,6 +3630,11 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
return false;
}
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ return NULL;
+}
+
#endif /* !CONFIG_SMP */
static void
@@ -3623,14 +3677,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
/*
- * Mark the task runnable and perform wakeup-preemption.
+ * Mark the task runnable.
*/
-static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
+static inline void ttwu_do_wakeup(struct task_struct *p)
{
- check_preempt_curr(rq, p, wake_flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct rq_flags *rf)
+{
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
+
+ lockdep_assert_rq_held(rq);
+
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+
+#ifdef CONFIG_SMP
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
+ else
+#endif
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ activate_task(rq, p, en_flags);
+ check_preempt_curr(rq, p, wake_flags);
+
+ ttwu_do_wakeup(p);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
@@ -3660,31 +3739,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
}
-static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
-{
- int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
-
- lockdep_assert_rq_held(rq);
-
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
-
-#ifdef CONFIG_SMP
- if (wake_flags & WF_MIGRATED)
- en_flags |= ENQUEUE_MIGRATED;
- else
-#endif
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
- activate_task(rq, p, en_flags);
- ttwu_do_wakeup(rq, p, wake_flags, rf);
-}
-
/*
* Consider @p being inside a wait loop:
*
@@ -3718,9 +3772,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
- /* check_preempt_curr() may use rq clock */
- update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags, &rf);
+ if (!task_on_cpu(rq, p)) {
+ /*
+ * When on_rq && !on_cpu the task is preempted, see if
+ * it should preempt the task that is current now.
+ */
+ update_rq_clock(rq);
+ check_preempt_curr(rq, p, wake_flags);
+ }
+ ttwu_do_wakeup(p);
ret = 1;
}
__task_rq_unlock(rq, &rf);
@@ -4086,8 +4146,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
goto out;
trace_sched_waking(p);
- WRITE_ONCE(p->__state, TASK_RUNNING);
- trace_sched_wakeup(p);
+ ttwu_do_wakeup(p);
goto out;
}
@@ -5052,6 +5111,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
+ switch_mm_cid(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -5504,7 +5564,9 @@ void scheduler_tick(void)
unsigned long thermal_pressure;
u64 resched_latency;
- arch_scale_freq_tick();
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ arch_scale_freq_tick();
+
sched_clock_tick();
rq_lock(rq, &rf);
@@ -6206,7 +6268,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd)
{
int i;
- for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
if (i == cpu)
continue;
@@ -8239,12 +8301,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
if (retval)
goto out_put_task;
- user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
- if (!user_mask) {
+ /*
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
+ * alloc_user_cpus_ptr() returns NULL.
+ */
+ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ if (user_mask) {
+ cpumask_copy(user_mask, in_mask);
+ } else if (IS_ENABLED(CONFIG_SMP)) {
retval = -ENOMEM;
goto out_put_task;
}
- cpumask_copy(user_mask, in_mask);
+
ac = (struct affinity_context){
.new_mask = in_mask,
.user_mask = user_mask,
@@ -11305,3 +11373,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
trace_sched_update_nr_running_tp(rq, count);
}
+
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ mm_cid_put(mm, t->mm_cid);
+ t->mm_cid = -1;
+ t->mm_cid_active = 0;
+ local_irq_restore(flags);
+}
+
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ mm_cid_put(mm, t->mm_cid);
+ t->mm_cid = -1;
+ t->mm_cid_active = 0;
+ local_irq_restore(flags);
+}
+
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ t->mm_cid = mm_cid_get(mm);
+ t->mm_cid_active = 1;
+ local_irq_restore(flags);
+ rseq_set_notify_resume(t);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+ WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
+ t->mm_cid_active = 1;
+}
+#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1207c78f85c1..5c840151f3bb 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -48,7 +48,6 @@ struct sugov_cpu {
unsigned long util;
unsigned long bw_dl;
- unsigned long max;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
@@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
- sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
FREQUENCY_UTIL, NULL);
@@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* sugov_iowait_apply() - Apply the IO boost to a CPU.
* @sg_cpu: the sugov data for the cpu to boost
* @time: the update time from the caller
+ * @max_cap: the max CPU capacity
*
* A CPU running a task which woken up after an IO operation can have its
* utilization boosted to speed up the completion of those IO operations.
@@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* This mechanism is designed to boost high frequently IO waiting tasks, while
* being more conservative on tasks which does sporadic IO operations.
*/
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long max_cap)
{
unsigned long boost;
@@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
* sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
- boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
+ boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
if (sg_cpu->util < boost)
sg_cpu->util = boost;
@@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
- u64 time, unsigned int flags)
+ u64 time, unsigned long max_cap,
+ unsigned int flags)
{
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
return false;
sugov_get_util(sg_cpu);
- sugov_iowait_apply(sg_cpu, time);
+ sugov_iowait_apply(sg_cpu, time, max_cap);
return true;
}
@@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned int cached_freq = sg_policy->cached_raw_freq;
+ unsigned long max_cap;
unsigned int next_f;
- if (!sugov_update_single_common(sg_cpu, time, flags))
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
- next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
+ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
@@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
unsigned long prev_util = sg_cpu->util;
+ unsigned long max_cap;
/*
* Fall back to the "frequency" path if frequency invariance is not
@@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
return;
}
- if (!sugov_update_single_common(sg_cpu, time, flags))
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
/*
@@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
sg_cpu->util = prev_util;
cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
- map_util_perf(sg_cpu->util), sg_cpu->max);
+ map_util_perf(sg_cpu->util), max_cap);
sg_cpu->sg_policy->last_freq_update_time = time;
}
@@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned long util = 0, max = 1;
+ unsigned long util = 0, max_cap;
unsigned int j;
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
- unsigned long j_util, j_max;
sugov_get_util(j_sg_cpu);
- sugov_iowait_apply(j_sg_cpu, time);
- j_util = j_sg_cpu->util;
- j_max = j_sg_cpu->max;
+ sugov_iowait_apply(j_sg_cpu, time, max_cap);
- if (j_util * max > j_max * util) {
- util = j_util;
- max = j_max;
- }
+ util = max(j_sg_cpu->util, util);
}
- return get_next_freq(sg_policy, util, max);
+ return get_next_freq(sg_policy, util, max_cap);
}
static void
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 95fc77853743..af7952f12e6c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,10 @@
* Simple CPU accounting cgroup controller
*/
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ #include <asm/cputime.h>
+#endif
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0d97d54276cc..71b24371a6f7 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2663,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (task_on_rq_queued(p) || task_current(rq, p)) {
+ if (!task_on_rq_queued(p))
+ return;
+
#ifdef CONFIG_SMP
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
- deadline_queue_pull_task(rq);
+ /*
+ * This might be too much, but unfortunately
+ * we don't have the old deadline value, and
+ * we can't argue if the task is increasing
+ * or lowering its prio, so...
+ */
+ if (!rq->dl.overloaded)
+ deadline_queue_pull_task(rq);
+ if (task_current(rq, p)) {
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
@@ -2681,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
resched_curr(rq);
-#else
+ } else {
/*
- * Again, we don't know if p has a earlier
- * or later deadline, so let's blindly set a
- * (maybe not needed) rescheduling point.
+ * Current may not be deadline in case p was throttled but we
+ * have just replenished it (e.g. rt_mutex_setprio()).
+ *
+ * Otherwise, if p was given an earlier deadline, reschedule.
*/
- resched_curr(rq);
-#endif /* CONFIG_SMP */
+ if (!dl_task(rq->curr) ||
+ dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
+ resched_curr(rq);
}
+#else
+ /*
+ * We don't know if p has a earlier or later deadline, so let's blindly
+ * set a (maybe not needed) rescheduling point.
+ */
+ resched_curr(rq);
+#endif
}
DEFINE_SCHED_CLASS(dl) = {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c36aa54ae071..ff4dbbae3b10 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -468,7 +468,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse)
return NULL;
}
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
+static inline struct sched_entity *parent_entity(const struct sched_entity *se)
{
return se->parent;
}
@@ -595,8 +595,8 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
-static inline bool entity_before(struct sched_entity *a,
- struct sched_entity *b)
+static inline bool entity_before(const struct sched_entity *a,
+ const struct sched_entity *b)
{
return (s64)(a->vruntime - b->vruntime) < 0;
}
@@ -1804,7 +1804,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
- if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+ if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
if (READ_ONCE(rq->numa_migrate_on) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
@@ -1836,7 +1836,7 @@ static void task_numa_assign(struct task_numa_env *env,
int start = env->dst_cpu;
/* Find alternative idle CPU. */
- for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
if (cpu == env->best_cpu || !idle_cpu(cpu) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
continue;
@@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util,
*
* For uclamp_max, we can tolerate a drop in performance level as the
* goal is to cap the task. So it's okay if it's getting less.
- *
- * In case of capacity inversion we should honour the inverted capacity
- * for both uclamp_min and uclamp_max all the time.
*/
- capacity_orig = cpu_in_capacity_inversion(cpu);
- if (capacity_orig) {
- capacity_orig_thermal = capacity_orig;
- } else {
- capacity_orig = capacity_orig_of(cpu);
- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
- }
+ capacity_orig = capacity_orig_of(cpu);
+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
* We want to force a task to fit a cpu as implied by uclamp_max.
@@ -4561,8 +4553,8 @@ static inline int util_fits_cpu(unsigned long util,
* handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max);
- if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
- fits = fits && (uclamp_min <= capacity_orig_thermal);
+ if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ return -1;
return fits;
}
@@ -4572,7 +4564,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
unsigned long util = task_util_est(p);
- return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
+ /*
+ * Return true only if the cpu fully fits the task requirements, which
+ * include the utilization but also the performance hints.
+ */
+ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4656,6 +4652,7 @@ static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;
+ u64 sleep_time;
/*
* The 'current' period is already promised to the current tasks,
@@ -4685,8 +4682,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime -= thresh;
}
- /* ensure we never gain time by being placed backwards. */
- se->vruntime = max_vruntime(se->vruntime, vruntime);
+ /*
+ * Pull vruntime of the entity being placed to the base level of
+ * cfs_rq, to prevent boosting it if placed backwards. If the entity
+ * slept for a long time, don't even try to compare its vruntime with
+ * the base as it may be too far off and the comparison may get
+ * inversed due to s64 overflow.
+ */
+ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
+ if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+ se->vruntime = vruntime;
+ else
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -4896,7 +4903,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
struct sched_entity *se;
s64 delta;
- ideal_runtime = sched_slice(cfs_rq, curr);
+ /*
+ * When many tasks blow up the sched_period; it is possible that
+ * sched_slice() reports unusually large results (when many tasks are
+ * very light for example). Therefore impose a maximum.
+ */
+ ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
+
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
resched_curr(rq_of(cfs_rq));
@@ -5461,22 +5474,105 @@ unthrottle_throttle:
resched_curr(rq);
}
-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cursor, *tmp;
+ struct rq *rq = arg;
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+
+ /*
+ * Since we hold rq lock we're safe from concurrent manipulation of
+ * the CSD list. However, this RCU critical section annotates the
+ * fact that we pair with sched_free_group_rcu(), so that we cannot
+ * race with group being freed in the window between removing it
+ * from the list and advancing to the next entry in the list.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+ throttled_csd_list) {
+ list_del_init(&cursor->throttled_csd_list);
+
+ if (cfs_rq_throttled(cursor))
+ unthrottle_cfs_rq(cursor);
+ }
+
+ rcu_read_unlock();
+
+ rq_unlock(rq, &rf);
+}
+
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ bool first;
+
+ if (rq == this_rq()) {
+ unthrottle_cfs_rq(cfs_rq);
+ return;
+ }
+
+ /* Already enqueued */
+ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ return;
+
+ first = list_empty(&rq->cfsb_csd_list);
+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+ if (first)
+ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ lockdep_assert_rq_held(rq_of(cfs_rq));
+
+ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ cfs_rq->runtime_remaining <= 0))
+ return;
+
+ __unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+{
+ struct cfs_rq *local_unthrottle = NULL;
+ int this_cpu = smp_processor_id();
u64 runtime, remaining = 1;
+ bool throttled = false;
+ struct cfs_rq *cfs_rq;
+ struct rq_flags rf;
+ struct rq *rq;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
- struct rq *rq = rq_of(cfs_rq);
- struct rq_flags rf;
+ rq = rq_of(cfs_rq);
+
+ if (!remaining) {
+ throttled = true;
+ break;
+ }
rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
- /* By the above check, this should never be true */
+#ifdef CONFIG_SMP
+ /* Already queued for async unthrottle */
+ if (!list_empty(&cfs_rq->throttled_csd_list))
+ goto next;
+#endif
+
+ /* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
@@ -5490,16 +5586,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
cfs_rq->runtime_remaining += runtime;
/* we check whether we're throttled above */
- if (cfs_rq->runtime_remaining > 0)
- unthrottle_cfs_rq(cfs_rq);
+ if (cfs_rq->runtime_remaining > 0) {
+ if (cpu_of(rq) != this_cpu ||
+ SCHED_WARN_ON(local_unthrottle))
+ unthrottle_cfs_rq_async(cfs_rq);
+ else
+ local_unthrottle = cfs_rq;
+ } else {
+ throttled = true;
+ }
next:
rq_unlock_irqrestore(rq, &rf);
-
- if (!remaining)
- break;
}
rcu_read_unlock();
+
+ if (local_unthrottle) {
+ rq = cpu_rq(this_cpu);
+ rq_lock_irqsave(rq, &rf);
+ if (cfs_rq_throttled(local_unthrottle))
+ unthrottle_cfs_rq(local_unthrottle);
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ return throttled;
}
/*
@@ -5544,10 +5654,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- distribute_cfs_runtime(cfs_b);
+ throttled = distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
- throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
/*
@@ -5824,6 +5932,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_SMP
+ INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+#endif
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5840,12 +5951,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ int __maybe_unused i;
+
/* init_cfs_bandwidth() was not called */
if (!cfs_b->throttled_cfs_rq.next)
return;
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
+
+ /*
+ * It is possible that we still have some cfs_rq's pending on a CSD
+ * list, though this race is very rare. In order for this to occur, we
+ * must have raced with the last task leaving the group while there
+ * exist throttled cfs_rq(s), and the period_timer must have queued the
+ * CSD item but the remote cpu has not yet processed it. To handle this,
+ * we can simply flush all pending CSD work inline here. We're
+ * guaranteed at this point that no additional cfs_rq of this group can
+ * join a CSD list.
+ */
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ if (list_empty(&rq->cfsb_csd_list))
+ continue;
+
+ local_irq_save(flags);
+ __cfsb_csd_unthrottle(rq);
+ local_irq_restore(flags);
+ }
+#endif
}
/*
@@ -6008,6 +6145,7 @@ static inline bool cpu_overutilized(int cpu)
unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+ /* Return true only if the utilization doesn't fit CPU's capacity */
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
@@ -6801,6 +6939,7 @@ static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
unsigned long task_util, util_min, util_max, best_cap = 0;
+ int fits, best_fits = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
@@ -6811,17 +6950,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);
- for_each_cpu_wrap(cpu, cpus, target) {
+ for_each_cpu_wrap(cpu, cpus, target + 1) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (util_fits_cpu(task_util, util_min, util_max, cpu))
+
+ fits = util_fits_cpu(task_util, util_min, util_max, cpu);
+
+ /* This CPU fits with all requirements */
+ if (fits > 0)
return cpu;
+ /*
+ * Only the min performance hint (i.e. uclamp_min) doesn't fit.
+ * Look for the CPU with best capacity.
+ */
+ else if (fits < 0)
+ cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
- if (cpu_cap > best_cap) {
+ /*
+ * First, select CPU which fits better (-1 being better than 0).
+ * Then, select the one with best capacity at same level.
+ */
+ if ((fits < best_fits) ||
+ ((fits == best_fits) && (cpu_cap > best_cap))) {
best_cap = cpu_cap;
best_cpu = cpu;
+ best_fits = fits;
}
}
@@ -6834,7 +6989,11 @@ static inline bool asym_fits_cpu(unsigned long util,
int cpu)
{
if (sched_asym_cpucap_active())
- return util_fits_cpu(util, util_min, util_max, cpu);
+ /*
+ * Return true only if the cpu fully fits the task requirements
+ * which include the utilization and the performance hints.
+ */
+ return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
return true;
}
@@ -7201,6 +7360,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
+ int prev_fits = -1, best_fits = -1;
+ unsigned long best_thermal_cap = 0;
+ unsigned long prev_thermal_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
@@ -7229,13 +7391,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
eenv_task_busy_time(&eenv, p, prev_cpu);
for (; pd; pd = pd->next) {
+ unsigned long util_min = p_util_min, util_max = p_util_max;
unsigned long cpu_cap, cpu_thermal_cap, util;
unsigned long cur_delta, max_spare_cap = 0;
unsigned long rq_util_min, rq_util_max;
- unsigned long util_min, util_max;
unsigned long prev_spare_cap = 0;
int max_spare_cap_cpu = -1;
unsigned long base_energy;
+ int fits, max_fits = -1;
cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
@@ -7251,6 +7414,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
eenv.pd_cap = 0;
for_each_cpu(cpu, cpus) {
+ struct rq *rq = cpu_rq(cpu);
+
eenv.pd_cap += cpu_thermal_cap;
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
@@ -7269,26 +7434,23 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* much capacity we can get out of the CPU; this is
* aligned with sched_cpu_util().
*/
- if (uclamp_is_used()) {
- if (uclamp_rq_is_idle(cpu_rq(cpu))) {
- util_min = p_util_min;
- util_max = p_util_max;
- } else {
- /*
- * Open code uclamp_rq_util_with() except for
- * the clamp() part. Ie: apply max aggregation
- * only. util_fits_cpu() logic requires to
- * operate on non clamped util but must use the
- * max-aggregated uclamp_{min, max}.
- */
- rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
- rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
-
- util_min = max(rq_util_min, p_util_min);
- util_max = max(rq_util_max, p_util_max);
- }
+ if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
+ /*
+ * Open code uclamp_rq_util_with() except for
+ * the clamp() part. Ie: apply max aggregation
+ * only. util_fits_cpu() logic requires to
+ * operate on non clamped util but must use the
+ * max-aggregated uclamp_{min, max}.
+ */
+ rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
+
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
}
- if (!util_fits_cpu(util, util_min, util_max, cpu))
+
+ fits = util_fits_cpu(util, util_min, util_max, cpu);
+ if (!fits)
continue;
lsub_positive(&cpu_cap, util);
@@ -7296,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (cpu == prev_cpu) {
/* Always use prev_cpu as a candidate. */
prev_spare_cap = cpu_cap;
- } else if (cpu_cap > max_spare_cap) {
+ prev_fits = fits;
+ } else if ((fits > max_fits) ||
+ ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
/*
* Find the CPU with the maximum spare capacity
* among the remaining CPUs in the performance
@@ -7304,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
*/
max_spare_cap = cpu_cap;
max_spare_cap_cpu = cpu;
+ max_fits = fits;
}
}
@@ -7322,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
+ prev_thermal_cap = cpu_thermal_cap;
best_delta = min(best_delta, prev_delta);
}
/* Evaluate the energy impact of using max_spare_cap_cpu. */
if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
+ /* Current best energy cpu fits better */
+ if (max_fits < best_fits)
+ continue;
+
+ /*
+ * Both don't fit performance hint (i.e. uclamp_min)
+ * but best energy cpu has better capacity.
+ */
+ if ((max_fits < 0) &&
+ (cpu_thermal_cap <= best_thermal_cap))
+ continue;
+
cur_delta = compute_energy(&eenv, pd, cpus, p,
max_spare_cap_cpu);
/* CPU utilization has changed */
if (cur_delta < base_energy)
goto unlock;
cur_delta -= base_energy;
- if (cur_delta < best_delta) {
- best_delta = cur_delta;
- best_energy_cpu = max_spare_cap_cpu;
- }
+
+ /*
+ * Both fit for the task but best energy cpu has lower
+ * energy impact.
+ */
+ if ((max_fits > 0) && (best_fits > 0) &&
+ (cur_delta >= best_delta))
+ continue;
+
+ best_delta = cur_delta;
+ best_energy_cpu = max_spare_cap_cpu;
+ best_fits = max_fits;
+ best_thermal_cap = cpu_thermal_cap;
}
}
rcu_read_unlock();
- if (best_delta < prev_delta)
+ if ((best_fits > prev_fits) ||
+ ((best_fits > 0) && (best_delta < prev_delta)) ||
+ ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
target = best_energy_cpu;
return target;
@@ -8841,73 +9030,16 @@ static unsigned long scale_rt_capacity(int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
- struct rq *rq = cpu_rq(cpu);
- rq->cpu_capacity_orig = capacity_orig;
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
if (!capacity)
capacity = 1;
- rq->cpu_capacity = capacity;
-
- /*
- * Detect if the performance domain is in capacity inversion state.
- *
- * Capacity inversion happens when another perf domain with equal or
- * lower capacity_orig_of() ends up having higher capacity than this
- * domain after subtracting thermal pressure.
- *
- * We only take into account thermal pressure in this detection as it's
- * the only metric that actually results in *real* reduction of
- * capacity due to performance points (OPPs) being dropped/become
- * unreachable due to thermal throttling.
- *
- * We assume:
- * * That all cpus in a perf domain have the same capacity_orig
- * (same uArch).
- * * Thermal pressure will impact all cpus in this perf domain
- * equally.
- */
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
- unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
- struct perf_domain *pd = rcu_dereference(rq->rd->pd);
-
- rq->cpu_capacity_inverted = 0;
-
- for (; pd; pd = pd->next) {
- struct cpumask *pd_span = perf_domain_span(pd);
- unsigned long pd_cap_orig, pd_cap;
-
- cpu = cpumask_any(pd_span);
- pd_cap_orig = arch_scale_cpu_capacity(cpu);
-
- if (capacity_orig < pd_cap_orig)
- continue;
-
- /*
- * handle the case of multiple perf domains have the
- * same capacity_orig but one of them is under higher
- * thermal pressure. We record it as capacity
- * inversion.
- */
- if (capacity_orig == pd_cap_orig) {
- pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
-
- if (pd_cap > inv_cap) {
- rq->cpu_capacity_inverted = inv_cap;
- break;
- }
- } else if (pd_cap_orig > inv_cap) {
- rq->cpu_capacity_inverted = inv_cap;
- break;
- }
- }
- }
-
- trace_sched_cpu_capacity_tp(rq);
+ cpu_rq(cpu)->cpu_capacity = capacity;
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
@@ -10135,24 +10267,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);
- if (sched_energy_enabled()) {
- struct root_domain *rd = env->dst_rq->rd;
-
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- goto out_balanced;
- }
-
- local = &sds.local_stat;
- busiest = &sds.busiest_stat;
-
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest)
goto out_balanced;
+ busiest = &sds.busiest_stat;
+
/* Misfit tasks should be dealt with regardless of the avg load */
if (busiest->group_type == group_misfit_task)
goto force_balance;
+ if (sched_energy_enabled()) {
+ struct root_domain *rd = env->dst_rq->rd;
+
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+ goto out_balanced;
+ }
+
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
goto force_balance;
@@ -10165,6 +10296,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
+ local = &sds.local_stat;
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -11728,7 +11860,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
/*
* se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
*/
-static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
+static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
+ bool forceidle)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -11753,11 +11886,12 @@ void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
}
-bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi)
{
struct rq *rq = task_rq(a);
- struct sched_entity *sea = &a->se;
- struct sched_entity *seb = &b->se;
+ const struct sched_entity *sea = &a->se;
+ const struct sched_entity *seb = &b->se;
struct cfs_rq *cfs_rqa;
struct cfs_rq *cfs_rqb;
s64 delta;
@@ -12474,6 +12608,11 @@ __init void init_sched_fair_class(void)
for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
}
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f26ab2675f7d..e9ef66be2870 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -51,18 +51,22 @@ __setup("hlt", cpu_idle_nopoll_setup);
static noinline int __cpuidle cpu_idle_poll(void)
{
+ instrumentation_begin();
trace_cpu_idle(0, smp_processor_id());
stop_critical_timings();
- ct_idle_enter();
- local_irq_enable();
+ ct_cpuidle_enter();
+ raw_local_irq_enable();
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
+ raw_local_irq_disable();
- ct_idle_exit();
+ ct_cpuidle_exit();
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ local_irq_enable();
+ instrumentation_end();
return 1;
}
@@ -75,7 +79,6 @@ void __weak arch_cpu_idle_dead(void) { }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
- raw_local_irq_enable();
}
/**
@@ -85,44 +88,20 @@ void __weak arch_cpu_idle(void)
*/
void __cpuidle default_idle_call(void)
{
- if (current_clr_polling_and_test()) {
- local_irq_enable();
- } else {
-
+ instrumentation_begin();
+ if (!current_clr_polling_and_test()) {
trace_cpu_idle(1, smp_processor_id());
stop_critical_timings();
- /*
- * arch_cpu_idle() is supposed to enable IRQs, however
- * we can't do that because of RCU and tracing.
- *
- * Trace IRQs enable here, then switch off RCU, and have
- * arch_cpu_idle() use raw_local_irq_enable(). Note that
- * ct_idle_enter() relies on lockdep IRQ state, so switch that
- * last -- this is very similar to the entry code.
- */
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare();
- ct_idle_enter();
- lockdep_hardirqs_on(_THIS_IP_);
-
+ ct_cpuidle_enter();
arch_cpu_idle();
-
- /*
- * OK, so IRQs are enabled here, but RCU needs them disabled to
- * turn itself back on.. funny thing is that disabling IRQs
- * will cause tracing, which needs RCU. Jump through hoops to
- * make it 'work'.
- */
- raw_local_irq_disable();
- lockdep_hardirqs_off(_THIS_IP_);
- ct_idle_exit();
- lockdep_hardirqs_on(_THIS_IP_);
- raw_local_irq_enable();
+ ct_cpuidle_exit();
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
+ local_irq_enable();
+ instrumentation_end();
}
static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 0c5be7ebb1dc..2ad881d07752 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -159,7 +159,8 @@
| MEMBARRIER_CMD_PRIVATE_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
- | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
+ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+ | MEMBARRIER_CMD_GET_REGISTRATIONS)
static void ipi_mb(void *info)
{
@@ -540,6 +541,40 @@ static int membarrier_register_private_expedited(int flags)
return 0;
}
+static int membarrier_get_registrations(void)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int registrations_mask = 0, membarrier_state, i;
+ static const int states[] = {
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED |
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY
+ };
+ static const int registration_cmds[] = {
+ MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
+ };
+ BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds));
+
+ membarrier_state = atomic_read(&mm->membarrier_state);
+ for (i = 0; i < ARRAY_SIZE(states); ++i) {
+ if (membarrier_state & states[i]) {
+ registrations_mask |= registration_cmds[i];
+ membarrier_state &= ~states[i];
+ }
+ }
+ WARN_ON_ONCE(membarrier_state != 0);
+ return registrations_mask;
+}
+
/**
* sys_membarrier - issue memory barriers on a set of threads
* @cmd: Takes command values defined in enum membarrier_cmd.
@@ -623,6 +658,8 @@ SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
+ case MEMBARRIER_CMD_GET_REGISTRATIONS:
+ return membarrier_get_registrations();
default:
return -EINVAL;
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8ac8b81bfee6..02e011cabe91 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1343,10 +1343,11 @@ void psi_trigger_destroy(struct psi_trigger *t)
group = t->group;
/*
- * Wakeup waiters to stop polling. Can happen if cgroup is deleted
- * from under a polling process.
+ * Wakeup waiters to stop polling and clear the queue to prevent it from
+ * being accessed later. Can happen if cgroup is deleted from under a
+ * polling process.
*/
- wake_up_interruptible(&t->event_wait);
+ wake_up_pollfree(&t->event_wait);
mutex_lock(&group->trigger_lock);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ed2a47e4ddae..0a11f44adee5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1777,6 +1777,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
BUG_ON(idx >= MAX_RT_PRIO);
queue = array->queue + idx;
+ if (SCHED_WARN_ON(list_empty(queue)))
+ return NULL;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
return next;
@@ -1789,7 +1791,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
do {
rt_se = pick_next_rt_entity(rt_rq);
- BUG_ON(!rt_se);
+ if (unlikely(!rt_se))
+ return NULL;
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8ddb7053..3e8df6d31c1e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -248,7 +248,7 @@ static inline void update_avg(u64 *avg, u64 sample)
#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
-static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
+static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
@@ -260,8 +260,8 @@ static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
/*
* Tells if entity @a should preempt entity @b.
*/
-static inline bool
-dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+static inline bool dl_entity_preempt(const struct sched_dl_entity *a,
+ const struct sched_dl_entity *b)
{
return dl_entity_is_special(a) ||
dl_time_before(a->deadline, b->deadline);
@@ -645,6 +645,9 @@ struct cfs_rq {
int throttled;
int throttle_count;
struct list_head throttled_list;
+#ifdef CONFIG_SMP
+ struct list_head throttled_csd_list;
+#endif
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1041,7 +1044,6 @@ struct rq {
unsigned long cpu_capacity;
unsigned long cpu_capacity_orig;
- unsigned long cpu_capacity_inverted;
struct balance_callback *balance_callback;
@@ -1154,6 +1156,11 @@ struct rq {
/* Scratch cpumask to be temporarily used under rq_lock */
cpumask_var_t scratch_mask;
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+ call_single_data_t cfsb_csd;
+ struct list_head cfsb_csd_list;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1236,7 +1243,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
return &rq->__lock;
}
-bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
+bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool fi);
/*
* Helpers to check if the CPU's core cookie matches with the task's cookie
@@ -1415,7 +1423,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
}
/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
{
return se->cfs_rq;
}
@@ -1428,19 +1436,16 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
#else
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- return container_of(se, struct task_struct, se);
-}
+#define task_of(_se) container_of(_se, struct task_struct, se)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p)
{
return &task_rq(p)->cfs;
}
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
{
- struct task_struct *p = task_of(se);
+ const struct task_struct *p = task_of(se);
struct rq *rq = task_rq(p);
return &rq->cfs;
@@ -2893,24 +2898,6 @@ static inline unsigned long capacity_orig_of(int cpu)
return cpu_rq(cpu)->cpu_capacity_orig;
}
-/*
- * Returns inverted capacity if the CPU is in capacity inversion state.
- * 0 otherwise.
- *
- * Capacity inversion detection only considers thermal impact where actual
- * performance points (OPPs) gets dropped.
- *
- * Capacity inversion state happens when another performance domain that has
- * equal or lower capacity_orig_of() becomes effectively larger than the perf
- * domain this CPU belongs to due to thermal pressure throttling it hard.
- *
- * See comment in update_cpu_capacity().
- */
-static inline unsigned long cpu_in_capacity_inversion(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_inverted;
-}
-
/**
* enum cpu_util_type - CPU utilization type
* @FREQUENCY_UTIL: Utilization used to select frequency
@@ -3261,4 +3248,62 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
cgroup_account_cputime(curr, delta_exec);
}
+#ifdef CONFIG_SCHED_MM_CID
+static inline int __mm_cid_get(struct mm_struct *mm)
+{
+ struct cpumask *cpumask;
+ int cid;
+
+ cpumask = mm_cidmask(mm);
+ cid = cpumask_first_zero(cpumask);
+ if (cid >= nr_cpu_ids)
+ return -1;
+ __cpumask_set_cpu(cid, cpumask);
+ return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm, int cid)
+{
+ lockdep_assert_irqs_disabled();
+ if (cid < 0)
+ return;
+ raw_spin_lock(&mm->cid_lock);
+ __cpumask_clear_cpu(cid, mm_cidmask(mm));
+ raw_spin_unlock(&mm->cid_lock);
+}
+
+static inline int mm_cid_get(struct mm_struct *mm)
+{
+ int ret;
+
+ lockdep_assert_irqs_disabled();
+ raw_spin_lock(&mm->cid_lock);
+ ret = __mm_cid_get(mm);
+ raw_spin_unlock(&mm->cid_lock);
+ return ret;
+}
+
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
+{
+ if (prev->mm_cid_active) {
+ if (next->mm_cid_active && next->mm == prev->mm) {
+ /*
+ * Context switch between threads in same mm, hand over
+ * the mm_cid from prev to next.
+ */
+ next->mm_cid = prev->mm_cid;
+ prev->mm_cid = -1;
+ return;
+ }
+ mm_cid_put(prev->mm, prev->mm_cid);
+ prev->mm_cid = -1;
+ }
+ if (next->mm_cid_active)
+ next->mm_cid = mm_cid_get(next->mm);
+}
+
+#else
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+#endif
+
#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54e..d93c3379e901 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -578,7 +578,7 @@ out:
*/
struct root_domain def_root_domain;
-void init_defrootdomain(void)
+void __init init_defrootdomain(void)
{
init_rootdomain(&def_root_domain);
@@ -2451,7 +2451,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* Set up scheduler domains and groups. For now this just excludes isolated
* CPUs, but could be used to exclude other special cases in the future.
*/
-int sched_init_domains(const struct cpumask *cpu_map)
+int __init sched_init_domains(const struct cpumask *cpu_map)
{
int err;
diff --git a/kernel/signal.c b/kernel/signal.c
index ae26da61c4d9..8cb28f1df294 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2951,6 +2951,7 @@ void exit_signals(struct task_struct *tsk)
cgroup_threadgroup_change_begin(tsk);
if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+ sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
return;
@@ -2961,6 +2962,7 @@ void exit_signals(struct task_struct *tsk)
* From now this task is not visible for group-wide signals,
* see wants_signal(), do_signal_stop().
*/
+ sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
diff --git a/kernel/sys.c b/kernel/sys.c
index 5fd54bf0e886..88b31f096fb2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1442,6 +1442,8 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource,
if (resource >= RLIM_NLIMITS)
return -EINVAL;
+ resource = array_index_nospec(resource, RLIM_NLIMITS);
+
if (new_rlim) {
if (new_rlim->rlim_cur > new_rlim->rlim_max)
return -EINVAL;
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index a41753be1a2b..bae8f11070be 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -200,10 +200,14 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
int "Clocksource watchdog maximum allowable skew (in μs)"
depends on CLOCKSOURCE_WATCHDOG
range 50 1000
- default 100
+ default 125
help
Specify the maximum amount of allowable watchdog skew in
microseconds before reporting the clocksource to be unstable.
+ The default is based on a half-second clocksource watchdog
+ interval and NTP's maximum frequency drift of 500 parts
+ per million. If the clocksource is good enough for NTP,
+ it is good enough for the clocksource watchdog!
endmenu
endif
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5897828b9d7e..7e5dff602585 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -470,11 +470,35 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
}
EXPORT_SYMBOL_GPL(alarm_forward);
-u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle)
{
struct alarm_base *base = &alarm_bases[alarm->type];
+ ktime_t now = base->get_ktime();
+
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) {
+ /*
+ * Same issue as with posix_timer_fn(). Timers which are
+ * periodic but the signal is ignored can starve the system
+ * with a very small interval. The real fix which was
+ * promised in the context of posix_timer_fn() never
+ * materialized, but someone should really work on it.
+ *
+ * To prevent DOS fake @now to be 1 jiffie out which keeps
+ * the overrun accounting correct but creates an
+ * inconsistency vs. timer_gettime(2).
+ */
+ ktime_t kj = NSEC_PER_SEC / HZ;
+
+ if (interval < kj)
+ now = ktime_add(now, kj);
+ }
+
+ return alarm_forward(alarm, now, interval);
+}
- return alarm_forward(alarm, base->get_ktime(), interval);
+u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+{
+ return __alarm_forward_now(alarm, interval, false);
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
@@ -551,9 +575,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
/*
* Handle ignored signals and rearm the timer. This will go
- * away once we handle ignored signals proper.
+ * away once we handle ignored signals proper. Ensure that
+ * small intervals cannot starve the system.
*/
- ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval);
+ ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true);
++ptr->it_requeue_pending;
ptr->it_active = 1;
result = ALARMTIMER_RESTART;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9cf32ccda715..91836b727cef 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -96,6 +96,11 @@ static int finished_booting;
static u64 suspend_start;
/*
+ * Interval: 0.5sec.
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+
+/*
* Threshold: 0.0312s, when doubled: 0.0625s.
* Also a default for cs->uncertainty_margin when registering clocks.
*/
@@ -106,11 +111,14 @@ static u64 suspend_start;
* clocksource surrounding a read of the clocksource being validated.
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
* a lower bound for cs->uncertainty_margin values when registering clocks.
+ *
+ * The default of 500 parts per million is based on NTP's limits.
+ * If a clocksource is good enough for NTP, it is good enough for us!
*/
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#else
-#define MAX_SKEW_USEC 100
+#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
#endif
#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
@@ -140,11 +148,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);
-/*
- * Interval: 0.5sec.
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
-
static void clocksource_watchdog_work(struct work_struct *work)
{
/*
@@ -257,8 +260,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
goto skip_test;
}
- pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
- smp_processor_id(), watchdog->name, wd_delay, nretries);
+ pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
+ smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
return WD_READ_UNSTABLE;
skip_test:
@@ -384,6 +387,15 @@ void clocksource_verify_percpu(struct clocksource *cs)
}
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+
static void clocksource_watchdog(struct timer_list *unused)
{
u64 csnow, wdnow, cslast, wdlast, delta;
@@ -391,6 +403,7 @@ static void clocksource_watchdog(struct timer_list *unused)
int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
enum wd_read_status read_ret;
+ unsigned long extra_wait = 0;
u32 md;
spin_lock(&watchdog_lock);
@@ -410,13 +423,30 @@ static void clocksource_watchdog(struct timer_list *unused)
read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
- if (read_ret != WD_READ_SUCCESS) {
- if (read_ret == WD_READ_UNSTABLE)
- /* Clock readout unreliable, so give it up. */
- __clocksource_unstable(cs);
+ if (read_ret == WD_READ_UNSTABLE) {
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
continue;
}
+ /*
+ * When WD_READ_SKIP is returned, it means the system is likely
+ * under very heavy load, where the latency of reading
+ * watchdog/clocksource is very big, and affect the accuracy of
+ * watchdog check. So give system some space and suspend the
+ * watchdog check for 5 minutes.
+ */
+ if (read_ret == WD_READ_SKIP) {
+ /*
+ * As the watchdog timer will be suspended, and
+ * cs->last could keep unchanged for 5 minutes, reset
+ * the counters.
+ */
+ clocksource_reset_watchdog();
+ extra_wait = HZ * 300;
+ break;
+ }
+
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
atomic_read(&watchdog_reset_pending)) {
@@ -443,12 +473,20 @@ static void clocksource_watchdog(struct timer_list *unused)
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
+ u64 cs_wd_msec;
+ u64 wd_msec;
+ u32 wd_rem;
+
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
smp_processor_id(), cs->name);
pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, cs_nsec, csnow, cslast, cs->mask);
+ cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem);
+ wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem);
+ pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
+ cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
if (curr_clocksource == cs)
pr_warn(" '%s' is current clocksource.\n", cs->name);
else if (curr_clocksource)
@@ -512,7 +550,7 @@ static void clocksource_watchdog(struct timer_list *unused)
* pair clocksource_stop_watchdog() clocksource_start_watchdog().
*/
if (!timer_pending(&watchdog_timer)) {
- watchdog_timer.expires += WATCHDOG_INTERVAL;
+ watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
add_timer_on(&watchdog_timer, next_cpu);
}
out:
@@ -537,14 +575,6 @@ static inline void clocksource_stop_watchdog(void)
watchdog_running = 0;
}
-static inline void clocksource_reset_watchdog(void)
-{
- struct clocksource *cs;
-
- list_for_each_entry(cs, &watchdog_list, wd_list)
- cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
-
static void clocksource_resume_watchdog(void)
{
atomic_inc(&watchdog_reset_pending);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3ae661ab6260..e8c08292defc 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2089,7 +2089,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
u64 slack;
slack = current->timer_slack_ns;
- if (dl_task(current) || rt_task(current))
+ if (rt_task(current))
slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode);
@@ -2126,6 +2126,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
@@ -2147,6 +2148,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
@@ -2270,7 +2272,7 @@ void __init hrtimers_init(void)
/**
* schedule_hrtimeout_range_clock - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
+ * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
* @mode: timer mode
* @clock_id: timer clock to be used
*/
@@ -2297,6 +2299,13 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return -EINTR;
}
+ /*
+ * Override any slack passed by the user if under
+ * rt contraints.
+ */
+ if (rt_task(current))
+ delta = 0;
+
hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
hrtimer_sleeper_start_expires(&t, mode);
@@ -2316,7 +2325,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);
/**
* schedule_hrtimeout_range - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
+ * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
* @mode: timer mode
*
* Make the current task sleep until the given expiry time has
@@ -2324,7 +2333,8 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);
* the current task state has been set (see set_current_state()).
*
* The @delta argument gives the kernel the freedom to schedule the
- * actual wakeup to a time that is both power and performance friendly.
+ * actual wakeup to a time that is both power and performance friendly
+ * for regular (non RT/DL) tasks.
* The kernel give the normal best effort behavior for "@expires+@delta",
* but may decide to fire the timer earlier, but no earlier than @expires.
*
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index cb925e8ef9a8..2f5e9b34022c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -243,13 +243,12 @@ static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
*/
static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- u64 curr_cputime;
-retry:
- curr_cputime = atomic64_read(cputime);
- if (sum_cputime > curr_cputime) {
- if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
- goto retry;
- }
+ u64 curr_cputime = atomic64_read(cputime);
+
+ do {
+ if (sum_cputime <= curr_cputime)
+ return;
+ } while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime));
}
static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 90ea5f373e50..828aeecbd1e8 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -147,6 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
texp = timespec64_to_ktime(t);
@@ -240,6 +241,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
texp = timespec64_to_ktime(t);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 5dead89308b7..0c8a87a11b39 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1270,6 +1270,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
@@ -1297,6 +1298,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index 13b11eb62685..20d5df631570 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -149,7 +149,7 @@ module_init(udelay_test_init);
static void __exit udelay_test_exit(void)
{
mutex_lock(&udelay_test_lock);
- debugfs_remove(debugfs_lookup(DEBUGFS_FILENAME, NULL));
+ debugfs_lookup_and_remove(DEBUGFS_FILENAME, NULL);
mutex_unlock(&udelay_test_lock);
}
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 797eb93103ad..e28f9210f8a1 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -56,25 +56,20 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* hrtimer callback function is currently running, then
* hrtimer_start() cannot move it and the timer stays on the CPU on
* which it is assigned at the moment.
+ */
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /*
+ * The core tick broadcast mode expects bc->bound_on to be set
+ * correctly to prevent a CPU which has the broadcast hrtimer
+ * armed from going deep idle.
*
- * As this can be called from idle code, the hrtimer_start()
- * invocation has to be wrapped with RCU_NONIDLE() as
- * hrtimer_start() can call into tracing.
+ * As tick_broadcast_lock is held, nothing can change the cpu
+ * base which was just established in hrtimer_start() above. So
+ * the below access is safe even without holding the hrtimer
+ * base lock.
*/
- RCU_NONIDLE( {
- hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
- /*
- * The core tick broadcast mode expects bc->bound_on to be set
- * correctly to prevent a CPU which has the broadcast hrtimer
- * armed from going deep idle.
- *
- * As tick_broadcast_lock is held, nothing can change the cpu
- * base which was just established in hrtimer_start() above. So
- * the below access is safe even without holding the hrtimer
- * base lock.
- */
- bc->bound_on = bctimer.base->cpu_base->cpu;
- } );
+ bc->bound_on = bctimer.base->cpu_base->cpu;
+
return 0;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f7fe6fe36173..93bf2b4e47e5 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -622,9 +622,13 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void)
* to avoid a deep idle transition as we are about to get the
* broadcast IPI right away.
*/
-int tick_check_broadcast_expired(void)
+noinstr int tick_check_broadcast_expired(void)
{
+#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
+ return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask));
+#else
return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
+#endif
}
/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 475ecceda768..5e2c2c26b3cc 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,7 @@
#include "tick-internal.h"
/**
- * tick_program_event
+ * tick_program_event - program the CPU local timer device for the next event
*/
int tick_program_event(ktime_t expires, int force)
{
@@ -99,7 +99,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
}
/**
- * tick_check_oneshot_mode - check whether the system is in oneshot mode
+ * tick_oneshot_mode_active - check whether the system is in oneshot mode
*
* returns 1 when either nohz or highres are enabled. otherwise 0.
*/
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 526257b3727c..f4198af60fee 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -462,7 +462,7 @@ struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec)
EXPORT_SYMBOL(ns_to_kernel_old_timeval);
/**
- * set_normalized_timespec - set timespec sec and nsec parts and normalize
+ * set_normalized_timespec64 - set timespec sec and nsec parts and normalize
*
* @ts: pointer to timespec variable to be set
* @sec: seconds to set
@@ -526,7 +526,7 @@ struct timespec64 ns_to_timespec64(s64 nsec)
EXPORT_SYMBOL(ns_to_timespec64);
/**
- * msecs_to_jiffies: - convert milliseconds to jiffies
+ * __msecs_to_jiffies: - convert milliseconds to jiffies
* @m: time in milliseconds
*
* conversion is done as follows:
@@ -541,12 +541,12 @@ EXPORT_SYMBOL(ns_to_timespec64);
* handling any 32-bit overflows.
* for the details see __msecs_to_jiffies()
*
- * msecs_to_jiffies() checks for the passed in value being a constant
+ * __msecs_to_jiffies() checks for the passed in value being a constant
* via __builtin_constant_p() allowing gcc to eliminate most of the
* code, __msecs_to_jiffies() is called if the value passed does not
* allow constant folding and the actual conversion must be done at
* runtime.
- * the _msecs_to_jiffies helpers are the HZ dependent conversion
+ * The _msecs_to_jiffies helpers are the HZ dependent conversion
* routines found in include/linux/jiffies.h
*/
unsigned long __msecs_to_jiffies(const unsigned int m)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f72b9f1de178..5579ead449f2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1590,10 +1590,10 @@ void __weak read_persistent_clock64(struct timespec64 *ts)
/**
* read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
* from the boot.
+ * @wall_time: current time as returned by persistent clock
+ * @boot_offset: offset that is defined as wall_time - boot_time
*
* Weak dummy function for arches that do not yet support it.
- * @wall_time: - current time as returned by persistent clock
- * @boot_offset: - offset that is defined as wall_time - boot_time
*
* The default function calculates offset based on the current value of
* local_clock(). This way architectures that support sched_clock() but don't
@@ -1701,7 +1701,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
}
#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
-/**
+/*
* We have three kinds of time sources to use for sleep time
* injection, the preference order is:
* 1) non-stop clocksource
@@ -1722,7 +1722,7 @@ bool timekeeping_rtc_skipresume(void)
return !suspend_timing_needed;
}
-/**
+/*
* 1) can be determined whether to use or not only when doing
* timekeeping_resume() which is invoked after rtc_suspend(),
* so we can't skip rtc_suspend() surely if system has 1).
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 197545241ab8..d7043043f59c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -933,8 +933,8 @@ config RING_BUFFER_RECORD_RECURSION
default y
help
The ring buffer has its own internal recursion. Although when
- recursion happens it wont cause harm because of the protection,
- but it does cause an unwanted overhead. Enabling this option will
+ recursion happens it won't cause harm because of the protection,
+ but it does cause unwanted overhead. Enabling this option will
place where recursion was detected into the ftrace "recursed_functions"
file.
@@ -1017,8 +1017,8 @@ config RING_BUFFER_STARTUP_TEST
The test runs for 10 seconds. This will slow your boot time
by at least 10 more seconds.
- At the end of the test, statics and more checks are done.
- It will output the stats of each per cpu buffer. What
+ At the end of the test, statistics and more checks are done.
+ It will output the stats of each per cpu buffer: What
was written, the sizes, what was read, what was lost, and
other similar details.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 918a7d12df8f..5743be559415 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -320,8 +320,8 @@ static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
* under 'q->debugfs_dir', thus lookup and remove them.
*/
if (!bt->dir) {
- debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir));
- debugfs_remove(debugfs_lookup("msg", q->debugfs_dir));
+ debugfs_lookup_and_remove("dropped", q->debugfs_dir);
+ debugfs_lookup_and_remove("msg", q->debugfs_dir);
} else {
debugfs_remove(bt->dir);
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3bbd3f0c810c..b8ac8b09c86f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -687,8 +687,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
}
perf_sample_data_init(sd, 0, 0);
- sd->raw = &raw;
- sd->sample_flags |= PERF_SAMPLE_RAW;
+ perf_sample_save_raw_data(sd, &raw);
err = __bpf_perf_event_output(regs, map, flags, sd);
@@ -746,8 +745,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
- sd->raw = &raw;
- sd->sample_flags |= PERF_SAMPLE_RAW;
+ perf_sample_save_raw_data(sd, &raw);
ret = __bpf_perf_event_output(regs, map, flags, sd);
out:
@@ -833,6 +831,7 @@ static void do_bpf_send_signal(struct irq_work *entry)
work = container_of(entry, struct send_signal_irq_work, irq_work);
group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
+ put_task_struct(work->task);
}
static int bpf_send_signal_common(u32 sig, enum pid_type type)
@@ -848,6 +847,9 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
+ /* Task should not be pid=1 to avoid kernel panic. */
+ if (unlikely(is_global_init(current)))
+ return -EPERM;
if (irqs_disabled()) {
/* Do an early check on signal validity. Otherwise,
@@ -864,7 +866,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
* to the irq_work. The current task may change when queued
* irq works get executed.
*/
- work->task = current;
+ work->task = get_task_struct(current);
work->sig = sig;
work->type = type;
irq_work_queue(&work->irq_work);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 442438b93fe9..750aa3f08b25 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1248,12 +1248,17 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
call_rcu(&hash->rcu, __free_ftrace_hash_rcu);
}
+/**
+ * ftrace_free_filter - remove all filters for an ftrace_ops
+ * @ops - the ops to remove the filters from
+ */
void ftrace_free_filter(struct ftrace_ops *ops)
{
ftrace_ops_init(ops);
free_ftrace_hash(ops->func_hash->filter_hash);
free_ftrace_hash(ops->func_hash->notrace_hash);
}
+EXPORT_SYMBOL_GPL(ftrace_free_filter);
static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
{
@@ -5839,6 +5844,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi);
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ip is NULL, it fails to update filter.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
@@ -5858,7 +5867,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ips array or any ip specified within is NULL , it fails to update filter.
- */
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
+*/
int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,
unsigned int cnt, int remove, int reset)
{
@@ -5900,6 +5913,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
*
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
@@ -5919,6 +5936,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
* Notrace Filters denote which functions should not be enabled when tracing
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
* for tracing.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 6c97cc2d754a..7e9061828c24 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -516,7 +516,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
struct rv_monitor_def *mdef;
int retval = -EINVAL;
bool enable = true;
- char *ptr = buff;
+ char *ptr;
int len;
if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a555a861b978..54a163ae4815 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3128,6 +3128,9 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
return;
}
+ if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY)))
+ return;
+
/*
* When an NMI triggers, RCU is enabled via ct_nmi_enter(),
* but if the above rcu_is_watching() failed, then the NMI
@@ -9148,9 +9151,6 @@ buffer_percent_write(struct file *filp, const char __user *ubuf,
if (val > 100)
return -EINVAL;
- if (!val)
- val = 1;
-
tr->buffer_percent = val;
(*ppos)++;
@@ -10295,6 +10295,8 @@ void __init early_trace_init(void)
static_key_enable(&tracepoint_printk_key.key);
}
tracer_alloc_buffers();
+
+ init_events();
}
void __init trace_init(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e46a49269be2..085a31b978a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1282,6 +1282,7 @@ struct ftrace_event_field {
int offset;
int size;
int is_signed;
+ int len;
};
struct prog_entry;
@@ -1490,6 +1491,7 @@ extern void trace_event_enable_cmd_record(bool enable);
extern void trace_event_enable_tgid_record(bool enable);
extern int event_trace_init(void);
+extern int init_events(void);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
extern void __trace_early_add_events(struct trace_array *tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 33e0b4f8ebe6..6a942fa275c7 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -114,7 +114,7 @@ trace_find_event_field(struct trace_event_call *call, char *name)
static int __trace_define_field(struct list_head *head, const char *type,
const char *name, int offset, int size,
- int is_signed, int filter_type)
+ int is_signed, int filter_type, int len)
{
struct ftrace_event_field *field;
@@ -133,6 +133,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
field->offset = offset;
field->size = size;
field->is_signed = is_signed;
+ field->len = len;
list_add(&field->link, head);
@@ -150,14 +151,28 @@ int trace_define_field(struct trace_event_call *call, const char *type,
head = trace_get_fields(call);
return __trace_define_field(head, type, name, offset, size,
- is_signed, filter_type);
+ is_signed, filter_type, 0);
}
EXPORT_SYMBOL_GPL(trace_define_field);
+static int trace_define_field_ext(struct trace_event_call *call, const char *type,
+ const char *name, int offset, int size, int is_signed,
+ int filter_type, int len)
+{
+ struct list_head *head;
+
+ if (WARN_ON(!call->class))
+ return 0;
+
+ head = trace_get_fields(call);
+ return __trace_define_field(head, type, name, offset, size,
+ is_signed, filter_type, len);
+}
+
#define __generic_field(type, item, filter_type) \
ret = __trace_define_field(&ftrace_generic_fields, #type, \
#item, 0, 0, is_signed_type(type), \
- filter_type); \
+ filter_type, 0); \
if (ret) \
return ret;
@@ -166,7 +181,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
"common_" #item, \
offsetof(typeof(ent), item), \
sizeof(ent.item), \
- is_signed_type(type), FILTER_OTHER); \
+ is_signed_type(type), FILTER_OTHER, 0); \
if (ret) \
return ret;
@@ -1588,12 +1603,17 @@ static int f_show(struct seq_file *m, void *v)
seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
field->type, field->name, field->offset,
field->size, !!field->is_signed);
- else
- seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ else if (field->len)
+ seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
(int)(array_descriptor - field->type),
field->type, field->name,
- array_descriptor, field->offset,
+ field->len, field->offset,
field->size, !!field->is_signed);
+ else
+ seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ (int)(array_descriptor - field->type),
+ field->type, field->name,
+ field->offset, field->size, !!field->is_signed);
return 0;
}
@@ -2379,9 +2399,10 @@ event_define_fields(struct trace_event_call *call)
}
offset = ALIGN(offset, field->align);
- ret = trace_define_field(call, field->type, field->name,
+ ret = trace_define_field_ext(call, field->type, field->name,
offset, field->size,
- field->is_signed, field->filter_type);
+ field->is_signed, field->filter_type,
+ field->len);
if (WARN_ON_ONCE(ret)) {
pr_err("error code is %d\n", ret);
break;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 96acc2b71ac7..e095c3b3a50d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -128,7 +128,7 @@ static bool is_not(const char *str)
}
/**
- * prog_entry - a singe entry in the filter program
+ * struct prog_entry - a singe entry in the filter program
* @target: Index to jump to on a branch (actually one minus the index)
* @when_to_branch: The value of the result of the predicate to do a branch
* @pred: The predicate to execute.
@@ -140,16 +140,16 @@ struct prog_entry {
};
/**
- * update_preds- assign a program entry a label target
+ * update_preds - assign a program entry a label target
* @prog: The program array
* @N: The index of the current entry in @prog
- * @when_to_branch: What to assign a program entry for its branch condition
+ * @invert: What to assign a program entry for its branch condition
*
* The program entry at @N has a target that points to the index of a program
* entry that can have its target and when_to_branch fields updated.
* Update the current program entry denoted by index @N target field to be
* that of the updated entry. This will denote the entry to update if
- * we are processing an "||" after an "&&"
+ * we are processing an "||" after an "&&".
*/
static void update_preds(struct prog_entry *prog, int N, int invert)
{
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index fcaf226b7744..5edbf6b1da3f 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1988,6 +1988,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 :
HIST_FIELD_FN_BUCKET;
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
+ if (!hist_field->operands[0])
+ goto free;
hist_field->size = hist_field->operands[0]->size;
hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
if (!hist_field->type)
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d960f6b11b5e..58f3946081e2 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -111,7 +111,8 @@ static void __always_unused ____ftrace_check_##name(void) \
#define __array(_type, _item, _len) { \
.type = #_type"["__stringify(_len)"]", .name = #_item, \
.size = sizeof(_type[_len]), .align = __alignof__(_type), \
- is_signed_type(_type), .filter_type = FILTER_OTHER },
+ is_signed_type(_type), .filter_type = FILTER_OTHER, \
+ .len = _len },
#undef __array_desc
#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 94c1b5eb1dc0..210e1f168392 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -147,9 +147,8 @@ static void osnoise_unregister_instance(struct trace_array *tr)
* register/unregister serialization is provided by trace's
* trace_types_lock.
*/
- lockdep_assert_held(&trace_types_lock);
-
- list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ list_for_each_entry_rcu(inst, &osnoise_instances, list,
+ lockdep_is_held(&trace_types_lock)) {
if (inst->tr == tr) {
list_del_rcu(&inst->list);
found = 1;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57a13b61f186..bd475a00f96d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1535,7 +1535,7 @@ static struct trace_event *events[] __initdata = {
NULL
};
-__init static int init_events(void)
+__init int init_events(void)
{
struct trace_event *event;
int i, ret;
@@ -1548,4 +1548,3 @@ __init static int init_events(void)
return 0;
}
-early_initcall(init_events);
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 1e130da1b742..e37446f7916e 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -15,6 +15,20 @@
#define CREATE_TRACE_POINTS
#include <trace/events/preemptirq.h>
+/*
+ * Use regular trace points on architectures that implement noinstr
+ * tooling: these calls will only happen with RCU enabled, which can
+ * use a regular tracepoint.
+ *
+ * On older architectures, use the rcuidle tracing methods (which
+ * aren't NMI-safe - so exclude NMI contexts):
+ */
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+#define trace(point) trace_##point
+#else
+#define trace(point) if (!in_nmi()) trace_##point##_rcuidle
+#endif
+
#ifdef CONFIG_TRACE_IRQFLAGS
/* Per-cpu variable to prevent redundant calls when IRQs already off */
static DEFINE_PER_CPU(int, tracing_irq_cpu);
@@ -28,8 +42,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
void trace_hardirqs_on_prepare(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -40,8 +53,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
void trace_hardirqs_on(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -63,8 +75,7 @@ void trace_hardirqs_off_finish(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- if (!in_nmi())
- trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
}
}
@@ -78,56 +89,24 @@ void trace_hardirqs_off(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- if (!in_nmi())
- trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
}
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);
-
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
-{
- if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
- tracer_hardirqs_on(CALLER_ADDR0, caller_addr);
- this_cpu_write(tracing_irq_cpu, 0);
- }
-
- lockdep_hardirqs_on_prepare();
- lockdep_hardirqs_on(caller_addr);
-}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-NOKPROBE_SYMBOL(trace_hardirqs_on_caller);
-
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
-{
- lockdep_hardirqs_off(caller_addr);
-
- if (!this_cpu_read(tracing_irq_cpu)) {
- this_cpu_write(tracing_irq_cpu, 1);
- tracer_hardirqs_off(CALLER_ADDR0, caller_addr);
- if (!in_nmi())
- trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
- }
-}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
-NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_TRACE_IRQFLAGS */
#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
- if (!in_nmi())
- trace_preempt_enable_rcuidle(a0, a1);
+ trace(preempt_enable)(a0, a1);
tracer_preempt_on(a0, a1);
}
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
- if (!in_nmi())
- trace_preempt_disable_rcuidle(a0, a1);
+ trace(preempt_disable)(a0, a1);
tracer_preempt_off(a0, a1);
}
#endif
diff --git a/kernel/umh.c b/kernel/umh.c
index 850631518665..fbf872c624cb 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -438,21 +438,27 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
- if (wait & UMH_KILLABLE)
- state |= TASK_KILLABLE;
-
if (wait & UMH_FREEZABLE)
state |= TASK_FREEZABLE;
- retval = wait_for_completion_state(&done, state);
- if (!retval)
- goto wait_done;
-
if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_state(&done, state | TASK_KILLABLE);
+ if (!retval)
+ goto wait_done;
+
/* umh_complete() will see NULL and free sub_info */
if (xchg(&sub_info->complete, NULL))
goto unlock;
+
+ /*
+ * fallthrough; in case of -ERESTARTSYS now do uninterruptible
+ * wait_for_completion_state(). Since umh_complete() shall call
+ * complete() in a moment if xchg() above returned NULL, this
+ * uninterruptible wait_for_completion_state() will not block
+ * SIGKILL'ed processes for long.
+ */
}
+ wait_for_completion_state(&done, state);
wait_done:
retval = sub_info->retval;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 07895deca271..b8b541caed48 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -169,7 +169,9 @@ struct worker_pool {
struct list_head idle_list; /* L: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
- struct timer_list mayday_timer; /* L: SOS timer for workers */
+ struct work_struct idle_cull_work; /* L: worker idle cleanup */
+
+ struct timer_list mayday_timer; /* L: SOS timer for workers */
/* a workers is either on busy_hash or idle_list, or the manager */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
@@ -177,6 +179,7 @@ struct worker_pool {
struct worker *manager; /* L: purely informational */
struct list_head workers; /* A: attached workers */
+ struct list_head dying_workers; /* A: workers about to die */
struct completion *detach_completion; /* all workers detached */
struct ida worker_ida; /* worker IDs for task name */
@@ -326,7 +329,7 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
-/* PL: allowable cpus for unbound wqs and work items */
+/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
/* CPU where unbound work was last round robin scheduled from this CPU */
@@ -1433,9 +1436,13 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
lockdep_assert_irqs_disabled();
- /* if draining, only works from the same workqueue are allowed */
- if (unlikely(wq->flags & __WQ_DRAINING) &&
- WARN_ON_ONCE(!is_chained_work(wq)))
+ /*
+ * For a draining wq, only works from the same workqueue are
+ * allowed. The __WQ_DESTROYING helps to spot the issue that
+ * queues a new work item to a wq after destroy_workqueue(wq).
+ */
+ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
+ WARN_ON_ONCE(!is_chained_work(wq))))
return;
rcu_read_lock();
retry:
@@ -1900,7 +1907,7 @@ static void worker_detach_from_pool(struct worker *worker)
list_del(&worker->node);
worker->pool = NULL;
- if (list_empty(&pool->workers))
+ if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
detach_completion = pool->detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
@@ -1972,21 +1979,55 @@ fail:
return NULL;
}
+static void unbind_worker(struct worker *worker)
+{
+ lockdep_assert_held(&wq_pool_attach_mutex);
+
+ kthread_set_per_cpu(worker->task, -1);
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ else
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+}
+
+static void wake_dying_workers(struct list_head *cull_list)
+{
+ struct worker *worker, *tmp;
+
+ list_for_each_entry_safe(worker, tmp, cull_list, entry) {
+ list_del_init(&worker->entry);
+ unbind_worker(worker);
+ /*
+ * If the worker was somehow already running, then it had to be
+ * in pool->idle_list when set_worker_dying() happened or we
+ * wouldn't have gotten here.
+ *
+ * Thus, the worker must either have observed the WORKER_DIE
+ * flag, or have set its state to TASK_IDLE. Either way, the
+ * below will be observed by the worker and is safe to do
+ * outside of pool->lock.
+ */
+ wake_up_process(worker->task);
+ }
+}
+
/**
- * destroy_worker - destroy a workqueue worker
+ * set_worker_dying - Tag a worker for destruction
* @worker: worker to be destroyed
+ * @list: transfer worker away from its pool->idle_list and into list
*
- * Destroy @worker and adjust @pool stats accordingly. The worker should
- * be idle.
+ * Tag @worker for destruction and adjust @pool stats accordingly. The worker
+ * should be idle.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
-static void destroy_worker(struct worker *worker)
+static void set_worker_dying(struct worker *worker, struct list_head *list)
{
struct worker_pool *pool = worker->pool;
lockdep_assert_held(&pool->lock);
+ lockdep_assert_held(&wq_pool_attach_mutex);
/* sanity check frenzy */
if (WARN_ON(worker->current_work) ||
@@ -1997,34 +2038,94 @@ static void destroy_worker(struct worker *worker)
pool->nr_workers--;
pool->nr_idle--;
- list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
- wake_up_process(worker->task);
+
+ list_move(&worker->entry, list);
+ list_move(&worker->node, &pool->dying_workers);
}
+/**
+ * idle_worker_timeout - check if some idle workers can now be deleted.
+ * @t: The pool's idle_timer that just expired
+ *
+ * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
+ * worker_leave_idle(), as a worker flicking between idle and active while its
+ * pool is at the too_many_workers() tipping point would cause too much timer
+ * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
+ * it expire and re-evaluate things from there.
+ */
static void idle_worker_timeout(struct timer_list *t)
{
struct worker_pool *pool = from_timer(pool, t, idle_timer);
+ bool do_cull = false;
+
+ if (work_pending(&pool->idle_cull_work))
+ return;
raw_spin_lock_irq(&pool->lock);
- while (too_many_workers(pool)) {
+ if (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
/* idle_list is kept in LIFO order, check the last one */
worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
+ do_cull = !time_before(jiffies, expires);
+
+ if (!do_cull)
+ mod_timer(&pool->idle_timer, expires);
+ }
+ raw_spin_unlock_irq(&pool->lock);
+
+ if (do_cull)
+ queue_work(system_unbound_wq, &pool->idle_cull_work);
+}
+
+/**
+ * idle_cull_fn - cull workers that have been idle for too long.
+ * @work: the pool's work for handling these idle workers
+ *
+ * This goes through a pool's idle workers and gets rid of those that have been
+ * idle for at least IDLE_WORKER_TIMEOUT seconds.
+ *
+ * We don't want to disturb isolated CPUs because of a pcpu kworker being
+ * culled, so this also resets worker affinity. This requires a sleepable
+ * context, hence the split between timer callback and work item.
+ */
+static void idle_cull_fn(struct work_struct *work)
+{
+ struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
+ struct list_head cull_list;
+
+ INIT_LIST_HEAD(&cull_list);
+ /*
+ * Grabbing wq_pool_attach_mutex here ensures an already-running worker
+ * cannot proceed beyong worker_detach_from_pool() in its self-destruct
+ * path. This is required as a previously-preempted worker could run after
+ * set_worker_dying() has happened but before wake_dying_workers() did.
+ */
+ mutex_lock(&wq_pool_attach_mutex);
+ raw_spin_lock_irq(&pool->lock);
+
+ while (too_many_workers(pool)) {
+ struct worker *worker;
+ unsigned long expires;
+
+ worker = list_entry(pool->idle_list.prev, struct worker, entry);
+ expires = worker->last_active + IDLE_WORKER_TIMEOUT;
if (time_before(jiffies, expires)) {
mod_timer(&pool->idle_timer, expires);
break;
}
- destroy_worker(worker);
+ set_worker_dying(worker, &cull_list);
}
raw_spin_unlock_irq(&pool->lock);
+ wake_dying_workers(&cull_list);
+ mutex_unlock(&wq_pool_attach_mutex);
}
static void send_mayday(struct work_struct *work)
@@ -2388,12 +2489,12 @@ woke_up:
/* am I supposed to die? */
if (unlikely(worker->flags & WORKER_DIE)) {
raw_spin_unlock_irq(&pool->lock);
- WARN_ON_ONCE(!list_empty(&worker->entry));
set_pf_worker(false);
set_task_comm(worker->task, "kworker/dying");
ida_free(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker);
+ WARN_ON_ONCE(!list_empty(&worker->entry));
kfree(worker);
return 0;
}
@@ -3462,10 +3563,12 @@ static int init_worker_pool(struct worker_pool *pool)
hash_init(pool->busy_hash);
timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
+ INIT_WORK(&pool->idle_cull_work, idle_cull_fn);
timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
INIT_LIST_HEAD(&pool->workers);
+ INIT_LIST_HEAD(&pool->dying_workers);
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
@@ -3540,18 +3643,6 @@ static void rcu_free_pool(struct rcu_head *rcu)
kfree(pool);
}
-/* This returns with the lock held on success (pool manager is inactive). */
-static bool wq_manager_inactive(struct worker_pool *pool)
-{
- raw_spin_lock_irq(&pool->lock);
-
- if (pool->flags & POOL_MANAGER_ACTIVE) {
- raw_spin_unlock_irq(&pool->lock);
- return false;
- }
- return true;
-}
-
/**
* put_unbound_pool - put a worker_pool
* @pool: worker_pool to put
@@ -3566,8 +3657,11 @@ static bool wq_manager_inactive(struct worker_pool *pool)
static void put_unbound_pool(struct worker_pool *pool)
{
DECLARE_COMPLETION_ONSTACK(detach_completion);
+ struct list_head cull_list;
struct worker *worker;
+ INIT_LIST_HEAD(&cull_list);
+
lockdep_assert_held(&wq_pool_mutex);
if (--pool->refcnt)
@@ -3587,20 +3681,38 @@ static void put_unbound_pool(struct worker_pool *pool)
* Become the manager and destroy all workers. This prevents
* @pool's workers from blocking on attach_mutex. We're the last
* manager and @pool gets freed with the flag set.
- * Because of how wq_manager_inactive() works, we will hold the
- * spinlock after a successful wait.
+ *
+ * Having a concurrent manager is quite unlikely to happen as we can
+ * only get here with
+ * pwq->refcnt == pool->refcnt == 0
+ * which implies no work queued to the pool, which implies no worker can
+ * become the manager. However a worker could have taken the role of
+ * manager before the refcnts dropped to 0, since maybe_create_worker()
+ * drops pool->lock
*/
- rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool),
- TASK_UNINTERRUPTIBLE);
- pool->flags |= POOL_MANAGER_ACTIVE;
+ while (true) {
+ rcuwait_wait_event(&manager_wait,
+ !(pool->flags & POOL_MANAGER_ACTIVE),
+ TASK_UNINTERRUPTIBLE);
+
+ mutex_lock(&wq_pool_attach_mutex);
+ raw_spin_lock_irq(&pool->lock);
+ if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
+ pool->flags |= POOL_MANAGER_ACTIVE;
+ break;
+ }
+ raw_spin_unlock_irq(&pool->lock);
+ mutex_unlock(&wq_pool_attach_mutex);
+ }
while ((worker = first_idle_worker(pool)))
- destroy_worker(worker);
+ set_worker_dying(worker, &cull_list);
WARN_ON(pool->nr_workers || pool->nr_idle);
raw_spin_unlock_irq(&pool->lock);
- mutex_lock(&wq_pool_attach_mutex);
- if (!list_empty(&pool->workers))
+ wake_dying_workers(&cull_list);
+
+ if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
pool->detach_completion = &detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
@@ -3609,6 +3721,7 @@ static void put_unbound_pool(struct worker_pool *pool)
/* shut down the timers */
del_timer_sync(&pool->idle_timer);
+ cancel_work_sync(&pool->idle_cull_work);
del_timer_sync(&pool->mayday_timer);
/* RCU protected to allow dereferences from get_work_pool() */
@@ -3952,7 +4065,8 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
/* allocate the attrs and pwqs for later installation */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
- const struct workqueue_attrs *attrs)
+ const struct workqueue_attrs *attrs,
+ const cpumask_var_t unbound_cpumask)
{
struct apply_wqattrs_ctx *ctx;
struct workqueue_attrs *new_attrs, *tmp_attrs;
@@ -3968,14 +4082,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
goto out_free;
/*
- * Calculate the attrs of the default pwq.
+ * Calculate the attrs of the default pwq with unbound_cpumask
+ * which is wq_unbound_cpumask or to set to wq_unbound_cpumask.
* If the user configured cpumask doesn't overlap with the
* wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
*/
copy_workqueue_attrs(new_attrs, attrs);
- cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+ cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask);
if (unlikely(cpumask_empty(new_attrs->cpumask)))
- cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
+ cpumask_copy(new_attrs->cpumask, unbound_cpumask);
/*
* We may create multiple pwqs with differing cpumasks. Make a
@@ -4072,7 +4187,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
wq->flags &= ~__WQ_ORDERED;
}
- ctx = apply_wqattrs_prepare(wq, attrs);
+ ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
if (!ctx)
return -ENOMEM;
@@ -4414,6 +4529,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
*/
workqueue_sysfs_unregister(wq);
+ /* mark the workqueue destruction is in progress */
+ mutex_lock(&wq->mutex);
+ wq->flags |= __WQ_DESTROYING;
+ mutex_unlock(&wq->mutex);
+
/* drain it before proceeding with destruction */
drain_workqueue(wq);
@@ -4709,22 +4829,53 @@ static void pr_cont_pool_info(struct worker_pool *pool)
pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
}
-static void pr_cont_work(bool comma, struct work_struct *work)
+struct pr_cont_work_struct {
+ bool comma;
+ work_func_t func;
+ long ctr;
+};
+
+static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
+{
+ if (!pcwsp->ctr)
+ goto out_record;
+ if (func == pcwsp->func) {
+ pcwsp->ctr++;
+ return;
+ }
+ if (pcwsp->ctr == 1)
+ pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
+ else
+ pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
+ pcwsp->ctr = 0;
+out_record:
+ if ((long)func == -1L)
+ return;
+ pcwsp->comma = comma;
+ pcwsp->func = func;
+ pcwsp->ctr = 1;
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
{
if (work->func == wq_barrier_func) {
struct wq_barrier *barr;
barr = container_of(work, struct wq_barrier, work);
+ pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
pr_cont("%s BAR(%d)", comma ? "," : "",
task_pid_nr(barr->task));
} else {
- pr_cont("%s %ps", comma ? "," : "", work->func);
+ if (!comma)
+ pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
+ pr_cont_work_flush(comma, work->func, pcwsp);
}
}
static void show_pwq(struct pool_workqueue *pwq)
{
+ struct pr_cont_work_struct pcws = { .ctr = 0, };
struct worker_pool *pool = pwq->pool;
struct work_struct *work;
struct worker *worker;
@@ -4757,7 +4908,8 @@ static void show_pwq(struct pool_workqueue *pwq)
worker->rescue_wq ? "(RESCUER)" : "",
worker->current_func);
list_for_each_entry(work, &worker->scheduled, entry)
- pr_cont_work(false, work);
+ pr_cont_work(false, work, &pcws);
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
comma = true;
}
pr_cont("\n");
@@ -4777,9 +4929,10 @@ static void show_pwq(struct pool_workqueue *pwq)
if (get_work_pwq(work) != pwq)
continue;
- pr_cont_work(comma, work);
+ pr_cont_work(comma, work, &pcws);
comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
}
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
pr_cont("\n");
}
@@ -4788,9 +4941,10 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_info(" inactive:");
list_for_each_entry(work, &pwq->inactive_works, entry) {
- pr_cont_work(comma, work);
+ pr_cont_work(comma, work, &pcws);
comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
}
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
pr_cont("\n");
}
}
@@ -5006,13 +5160,8 @@ static void unbind_workers(int cpu)
raw_spin_unlock_irq(&pool->lock);
- for_each_pool_worker(worker, pool) {
- kthread_set_per_cpu(worker->task, -1);
- if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
- else
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
- }
+ for_each_pool_worker(worker, pool)
+ unbind_worker(worker);
mutex_unlock(&wq_pool_attach_mutex);
}
@@ -5334,7 +5483,7 @@ out_unlock:
}
#endif /* CONFIG_FREEZER */
-static int workqueue_apply_unbound_cpumask(void)
+static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
{
LIST_HEAD(ctxs);
int ret = 0;
@@ -5350,7 +5499,7 @@ static int workqueue_apply_unbound_cpumask(void)
if (wq->flags & __WQ_ORDERED)
continue;
- ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
+ ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
if (!ctx) {
ret = -ENOMEM;
break;
@@ -5365,6 +5514,11 @@ static int workqueue_apply_unbound_cpumask(void)
apply_wqattrs_cleanup(ctx);
}
+ if (!ret) {
+ mutex_lock(&wq_pool_attach_mutex);
+ cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
+ mutex_unlock(&wq_pool_attach_mutex);
+ }
return ret;
}
@@ -5383,7 +5537,6 @@ static int workqueue_apply_unbound_cpumask(void)
int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
{
int ret = -EINVAL;
- cpumask_var_t saved_cpumask;
/*
* Not excluding isolated cpus on purpose.
@@ -5397,23 +5550,8 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
goto out_unlock;
}
- if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- /* save the old wq_unbound_cpumask. */
- cpumask_copy(saved_cpumask, wq_unbound_cpumask);
-
- /* update wq_unbound_cpumask at first and apply it to wqs. */
- cpumask_copy(wq_unbound_cpumask, cpumask);
- ret = workqueue_apply_unbound_cpumask();
-
- /* restore the wq_unbound_cpumask when failed. */
- if (ret < 0)
- cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+ ret = workqueue_apply_unbound_cpumask(cpumask);
- free_cpumask_var(saved_cpumask);
out_unlock:
apply_wqattrs_unlock();
}