summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/auditsc.c22
-rw-r--r--kernel/bpf/bpf_lsm.c1
-rw-r--r--kernel/bpf/btf.c4
-rw-r--r--kernel/bpf/hashtab.c4
-rw-r--r--kernel/bpf/inode.c8
-rw-r--r--kernel/bpf/memalloc.c2
-rw-r--r--kernel/bpf/offload.c3
-rw-r--r--kernel/bpf/syscall.c24
-rw-r--r--kernel/bpf/verifier.c35
-rw-r--r--kernel/capability.c10
-rw-r--r--kernel/cgroup/cgroup.c2
-rw-r--r--kernel/cgroup/cpuset.c48
-rw-r--r--kernel/events/core.c39
-rw-r--r--kernel/fork.c2
-rwxr-xr-xkernel/gen_kheaders.sh2
-rw-r--r--kernel/irq/irqdomain.c4
-rw-r--r--kernel/irq/msi.c6
-rw-r--r--kernel/locking/qspinlock.c4
-rw-r--r--kernel/locking/rtmutex.c5
-rw-r--r--kernel/locking/rwsem.c87
-rw-r--r--kernel/module/main.c26
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/sched/core.c15
-rw-r--r--kernel/sched/fair.c48
-rw-r--r--kernel/sched/psi.c7
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/time/alarmtimer.c33
-rw-r--r--kernel/trace/Kconfig8
-rw-r--r--kernel/trace/blktrace.c4
-rw-r--r--kernel/trace/bpf_trace.c6
-rw-r--r--kernel/trace/ftrace.c23
-rw-r--r--kernel/trace/rv/rv.c2
-rw-r--r--kernel/trace/trace.c5
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events.c39
-rw-r--r--kernel/trace/trace_events_filter.c8
-rw-r--r--kernel/trace/trace_events_hist.c2
-rw-r--r--kernel/trace/trace_export.c3
-rw-r--r--kernel/trace/trace_osnoise.c5
-rw-r--r--kernel/trace/trace_output.c3
-rw-r--r--kernel/umh.c20
41 files changed, 377 insertions, 198 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 547c88be8a28..93d0b87f3283 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -64,6 +64,7 @@
#include <uapi/linux/limits.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <uapi/linux/openat2.h> // struct open_how
+#include <uapi/linux/fanotify.h>
#include "audit.h"
@@ -2252,7 +2253,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
if (!dentry)
return 0;
- rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
+ rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps);
if (rc)
return rc;
@@ -2807,7 +2808,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(&init_user_ns,
+ get_vfs_caps_from_disk(&nop_mnt_idmap,
bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
@@ -2877,10 +2878,21 @@ void __audit_log_kern_module(char *name)
context->type = AUDIT_KERN_MODULE;
}
-void __audit_fanotify(unsigned int response)
+void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
- audit_log(audit_context(), GFP_KERNEL,
- AUDIT_FANOTIFY, "resp=%u", response);
+ /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */
+ switch (friar->hdr.type) {
+ case FAN_RESPONSE_INFO_NONE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2",
+ response, FAN_RESPONSE_INFO_NONE);
+ break;
+ case FAN_RESPONSE_INFO_AUDIT_RULE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u",
+ response, friar->hdr.type, friar->rule_number,
+ friar->subj_trust, friar->obj_trust);
+ }
}
void __audit_tk_injoffset(struct timespec64 offset)
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index a4a41ee3e80b..e14c822f8911 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -51,7 +51,6 @@ BTF_SET_END(bpf_lsm_current_hooks)
*/
BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
#ifdef CONFIG_SECURITY_NETWORK
-BTF_ID(func, bpf_lsm_socket_sock_rcv_skb)
BTF_ID(func, bpf_lsm_sock_graft)
BTF_ID(func, bpf_lsm_inet_csk_clone)
BTF_ID(func, bpf_lsm_inet_conn_established)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f7dd8af06413..b7017cae6fd1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7782,9 +7782,9 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
- return 0;
end:
- btf_free_dtor_kfunc_tab(btf);
+ if (ret)
+ btf_free_dtor_kfunc_tab(btf);
btf_put(btf);
return ret;
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 5aa2b5525f79..66bded144377 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -152,7 +152,7 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
{
unsigned long flags;
- hash = hash & HASHTAB_MAP_LOCK_MASK;
+ hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
preempt_disable();
if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
@@ -171,7 +171,7 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
struct bucket *b, u32 hash,
unsigned long flags)
{
- hash = hash & HASHTAB_MAP_LOCK_MASK;
+ hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
raw_spin_unlock_irqrestore(&b->raw_lock, flags);
__this_cpu_dec(*(htab->map_locked[hash]));
preempt_enable();
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 4f841e16779e..9948b542a470 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
inode->i_mtime = inode->i_atime;
inode->i_ctime = inode->i_atime;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
return inode;
}
@@ -152,7 +152,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
dir->i_ctime = dir->i_mtime;
}
-static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct inode *inode;
@@ -382,7 +382,7 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
return simple_lookup(dir, dentry, flags);
}
-static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *target)
{
char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
@@ -559,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(&init_user_ns, inode, MAY_READ);
+ int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index ebcc3dd0fa19..1db156405b68 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -71,7 +71,7 @@ static int bpf_mem_cache_idx(size_t size)
if (size <= 192)
return size_index[(size - 1) / 8] - 1;
- return fls(size - 1) - 1;
+ return fls(size - 1) - 2;
}
#define NUM_CACHES 11
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 13e4efc971e6..190d9f9dc987 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -216,9 +216,6 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
if (offload->dev_state)
offload->offdev->ops->destroy(prog);
- /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
- bpf_prog_free_id(prog, true);
-
list_del_init(&offload->offloads);
kfree(offload);
prog->aux->offload = NULL;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 64131f88c553..ecca9366c7a6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1972,7 +1972,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
return;
if (audit_enabled == AUDIT_OFF)
return;
- if (op == BPF_AUDIT_LOAD)
+ if (!in_irq() && !irqs_disabled())
ctx = audit_context();
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
if (unlikely(!ab))
@@ -2001,7 +2001,7 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
return id > 0 ? 0 : id;
}
-void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+void bpf_prog_free_id(struct bpf_prog *prog)
{
unsigned long flags;
@@ -2013,18 +2013,10 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
if (!prog->aux->id)
return;
- if (do_idr_lock)
- spin_lock_irqsave(&prog_idr_lock, flags);
- else
- __acquire(&prog_idr_lock);
-
+ spin_lock_irqsave(&prog_idr_lock, flags);
idr_remove(&prog_idr, prog->aux->id);
prog->aux->id = 0;
-
- if (do_idr_lock)
- spin_unlock_irqrestore(&prog_idr_lock, flags);
- else
- __release(&prog_idr_lock);
+ spin_unlock_irqrestore(&prog_idr_lock, flags);
}
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
@@ -2067,17 +2059,15 @@ static void bpf_prog_put_deferred(struct work_struct *work)
prog = aux->prog;
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ bpf_prog_free_id(prog);
__bpf_prog_put_noref(prog, true);
}
-static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
+static void __bpf_prog_put(struct bpf_prog *prog)
{
struct bpf_prog_aux *aux = prog->aux;
if (atomic64_dec_and_test(&aux->refcnt)) {
- /* bpf_prog_free_id() must be called first */
- bpf_prog_free_id(prog, do_idr_lock);
-
if (in_irq() || irqs_disabled()) {
INIT_WORK(&aux->work, bpf_prog_put_deferred);
schedule_work(&aux->work);
@@ -2089,7 +2079,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
void bpf_prog_put(struct bpf_prog *prog)
{
- __bpf_prog_put(prog, true);
+ __bpf_prog_put(prog);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 85f96c1e9f62..7ee218827259 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2748,6 +2748,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
*/
if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))
return -ENOTSUPP;
+ /* kfunc with imm==0 is invalid and fixup_kfunc_call will
+ * catch this error later. Make backtracking conservative
+ * with ENOTSUPP.
+ */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
+ return -ENOTSUPP;
/* regular helper call sets R0 */
*reg_mask &= ~1;
if (*reg_mask & 0x3f) {
@@ -3237,13 +3243,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
+/* Copy src state preserving dst->parent and dst->live fields */
+static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
+{
+ struct bpf_reg_state *parent = dst->parent;
+ enum bpf_reg_liveness live = dst->live;
+
+ *dst = *src;
+ dst->parent = parent;
+ dst->live = live;
+}
+
static void save_register_state(struct bpf_func_state *state,
int spi, struct bpf_reg_state *reg,
int size)
{
int i;
- state->stack[spi].spilled_ptr = *reg;
+ copy_register_state(&state->stack[spi].spilled_ptr, reg);
if (size == BPF_REG_SIZE)
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
@@ -3289,7 +3306,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
bool sanitize = reg && is_spillable_regtype(reg->type);
for (i = 0; i < size; i++) {
- if (state->stack[spi].slot_type[i] == STACK_INVALID) {
+ u8 type = state->stack[spi].slot_type[i];
+
+ if (type != STACK_MISC && type != STACK_ZERO) {
sanitize = true;
break;
}
@@ -3569,7 +3588,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
*/
s32 subreg_def = state->regs[dst_regno].subreg_def;
- state->regs[dst_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
} else {
for (i = 0; i < size; i++) {
@@ -3590,7 +3609,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
if (dst_regno >= 0) {
/* restore register state from stack */
- state->regs[dst_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
@@ -9584,7 +9603,7 @@ do_sim:
*/
if (!ptr_is_dst_reg) {
tmp = *dst_reg;
- *dst_reg = *ptr_reg;
+ copy_register_state(dst_reg, ptr_reg);
}
ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
env->insn_idx);
@@ -10837,7 +10856,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* to propagate min/max range.
*/
src_reg->id = ++env->id_gen;
- *dst_reg = *src_reg;
+ copy_register_state(dst_reg, src_reg);
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
@@ -10848,7 +10867,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
- *dst_reg = *src_reg;
+ copy_register_state(dst_reg, src_reg);
/* Make sure ID is cleared otherwise
* dst_reg min/max could be incorrectly
* propagated into src_reg by find_equal_scalars()
@@ -11647,7 +11666,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
+ copy_register_state(reg, known_reg);
}));
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 860fd22117c1..339a44dfe2f4 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -486,11 +486,11 @@ EXPORT_SYMBOL(file_ns_capable);
* Return true if the inode uid and gid are within the namespace.
*/
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
- struct user_namespace *mnt_userns,
+ struct mnt_idmap *idmap,
const struct inode *inode)
{
- return vfsuid_has_mapping(ns, i_uid_into_vfsuid(mnt_userns, inode)) &&
- vfsgid_has_mapping(ns, i_gid_into_vfsgid(mnt_userns, inode));
+ return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) &&
+ vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
}
/**
@@ -502,13 +502,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
-bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
return ns_capable(ns, cap) &&
- privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
+ privileged_wrt_inode_uidgid(ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c099cf3fa02d..935e8121b21e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5065,7 +5065,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
if (!inode)
return -ENOMEM;
- ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
+ ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
iput(inode);
return ret;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a29c0b13706b..ca826bd1eba3 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1205,12 +1205,13 @@ void rebuild_sched_domains(void)
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @new_cpus: the temp variable for the new effective_cpus mask
*
* Iterate through each task of @cs updating its cpus_allowed to the
* effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
-static void update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
struct css_task_iter it;
struct task_struct *task;
@@ -1224,7 +1225,10 @@ static void update_tasks_cpumask(struct cpuset *cs)
if (top_cs && (task->flags & PF_KTHREAD) &&
kthread_is_per_cpu(task))
continue;
- set_cpus_allowed_ptr(task, cs->effective_cpus);
+
+ cpumask_and(new_cpus, cs->effective_cpus,
+ task_cpu_possible_mask(task));
+ set_cpus_allowed_ptr(task, new_cpus);
}
css_task_iter_end(&it);
}
@@ -1346,7 +1350,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
* A parent can be left with no CPU as long as there is no
* task directly associated with the parent partition.
*/
- if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) &&
+ if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) &&
partition_is_populated(parent, cs))
return PERR_NOCPUS;
@@ -1509,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
spin_unlock_irq(&callback_lock);
if (adding || deleting)
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent, tmp->new_cpus);
/*
* Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
@@ -1661,7 +1665,7 @@ update_parent_subparts:
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- update_tasks_cpumask(cp);
+ update_tasks_cpumask(cp, tmp->new_cpus);
/*
* On legacy hierarchy, if the effective cpumask of any non-
@@ -2309,7 +2313,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
}
}
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent, tmpmask.new_cpus);
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmpmask);
@@ -2324,6 +2328,7 @@ out:
new_prs = -new_prs;
spin_lock_irq(&callback_lock);
cs->partition_root_state = new_prs;
+ WRITE_ONCE(cs->prs_err, err);
spin_unlock_irq(&callback_lock);
/*
* Update child cpusets, if present.
@@ -3347,7 +3352,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
* as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
@@ -3384,7 +3389,7 @@ hotplug_update_tasks(struct cpuset *cs,
spin_unlock_irq(&callback_lock);
if (cpus_updated)
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated)
update_tasks_nodemask(cs);
}
@@ -3691,15 +3696,38 @@ void __init cpuset_init_smp(void)
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
* subset of cpu_online_mask, even if this means going outside the
- * tasks cpuset.
+ * tasks cpuset, except when the task is in the top cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
unsigned long flags;
+ struct cpuset *cs;
spin_lock_irqsave(&callback_lock, flags);
- guarantee_online_cpus(tsk, pmask);
+ rcu_read_lock();
+
+ cs = task_cs(tsk);
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(tsk, pmask);
+ /*
+ * Tasks in the top cpuset won't get update to their cpumasks
+ * when a hotplug online/offline event happens. So we include all
+ * offline cpus in the allowed cpu list.
+ */
+ if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+
+ /*
+ * We first exclude cpus allocated to partitions. If there is no
+ * allowable online cpu left, we fall back to all possible cpus.
+ */
+ cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus);
+ if (!cpumask_intersects(pmask, cpu_online_mask))
+ cpumask_copy(pmask, possible_mask);
+ }
+
+ rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 380476a934e8..7099c77bc53b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4813,19 +4813,17 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
epc = &cpc->epc;
-
+ raw_spin_lock_irq(&ctx->lock);
if (!epc->ctx) {
atomic_set(&epc->refcount, 1);
epc->embedded = 1;
- raw_spin_lock_irq(&ctx->lock);
list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
epc->ctx = ctx;
- raw_spin_unlock_irq(&ctx->lock);
} else {
WARN_ON_ONCE(epc->ctx != ctx);
atomic_inc(&epc->refcount);
}
-
+ raw_spin_unlock_irq(&ctx->lock);
return epc;
}
@@ -4896,33 +4894,30 @@ static void free_epc_rcu(struct rcu_head *head)
static void put_pmu_ctx(struct perf_event_pmu_context *epc)
{
+ struct perf_event_context *ctx = epc->ctx;
unsigned long flags;
- if (!atomic_dec_and_test(&epc->refcount))
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because of the call-site in _free_event()/put_event()
+ * which isn't always called under ctx->mutex.
+ */
+ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
return;
- if (epc->ctx) {
- struct perf_event_context *ctx = epc->ctx;
+ WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
- /*
- * XXX
- *
- * lockdep_assert_held(&ctx->mutex);
- *
- * can't because of the call-site in _free_event()/put_event()
- * which isn't always called under ctx->mutex.
- */
-
- WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
- raw_spin_lock_irqsave(&ctx->lock, flags);
- list_del_init(&epc->pmu_ctx_entry);
- epc->ctx = NULL;
- raw_spin_unlock_irqrestore(&ctx->lock, flags);
- }
+ list_del_init(&epc->pmu_ctx_entry);
+ epc->ctx = NULL;
WARN_ON_ONCE(!list_empty(&epc->pinned_active));
WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
if (epc->embedded)
return;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f7fe3541897..d9c97704b7c9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1044,7 +1044,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
#ifdef CONFIG_BLK_CGROUP
- tsk->throttle_queue = NULL;
+ tsk->throttle_disk = NULL;
tsk->use_memdelay = 0;
#endif
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 473036b43c83..81b97f0f6556 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -14,6 +14,8 @@ include/
arch/$SRCARCH/include/
"
+type cpio > /dev/null
+
# Support incremental builds by skipping archive generation
# if timestamps of files being archived are not changed.
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8fe1da9614ee..798a9042421f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -114,7 +114,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
{
struct irqchip_fwid *fwid;
- if (WARN_ON(!is_fwnode_irqchip(fwnode)))
+ if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode)))
return;
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
@@ -1915,7 +1915,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- debugfs_remove(debugfs_lookup(d->name, domain_dir));
+ debugfs_lookup_and_remove(d->name, domain_dir);
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 955267bbc2be..783a3e6a0b10 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1000,7 +1000,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
fail:
msi_unlock_descs(dev);
free_fwnode:
- kfree(fwnode);
+ irq_domain_free_fwnode(fwnode);
free_bundle:
kfree(bundle);
return false;
@@ -1013,6 +1013,7 @@ free_bundle:
*/
void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
{
+ struct fwnode_handle *fwnode = NULL;
struct msi_domain_info *info;
struct irq_domain *domain;
@@ -1025,7 +1026,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
dev->msi.data->__domains[domid].domain = NULL;
info = domain->host_data;
+ if (irq_domain_is_msi_device(domain))
+ fwnode = domain->fwnode;
irq_domain_remove(domain);
+ irq_domain_free_fwnode(fwnode);
kfree(container_of(info, struct msi_domain_template, info));
unlock:
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 2b23378775fe..ebe6b8ec7cb3 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -371,7 +371,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
/*
* We're pending, wait for the owner to go away.
*
- * 0,1,1 -> 0,1,0
+ * 0,1,1 -> *,1,0
*
* this wait loop must be a load-acquire such that we match the
* store-release that clears the locked bit and create lock
@@ -380,7 +380,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* barriers.
*/
if (val & _Q_LOCKED_MASK)
- atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->locked, !VAL);
/*
* take ownership and clear the pending bit.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 010cf4e6d0b8..728f434de2bb 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -901,8 +901,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* then we need to wake the new top waiter up to try
* to get the lock.
*/
- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_state(waiter->task, waiter->wake_state);
+ top_waiter = rt_mutex_top_waiter(lock);
+ if (prerequeue_top_waiter != top_waiter)
+ wake_up_state(top_waiter->task, top_waiter->wake_state);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 44873594de03..acb5a50309a1 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -256,16 +256,13 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
{
long tmp = RWSEM_UNLOCKED_VALUE;
- bool ret = false;
- preempt_disable();
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
rwsem_set_owner(sem);
- ret = true;
+ return true;
}
- preempt_enable();
- return ret;
+ return false;
}
/*
@@ -624,18 +621,16 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
*/
if (first->handoff_set && (waiter != first))
return false;
-
- /*
- * First waiter can inherit a previously set handoff
- * bit and spin on rwsem if lock acquisition fails.
- */
- if (waiter == first)
- waiter->handoff_set = true;
}
new = count;
if (count & RWSEM_LOCK_MASK) {
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
if (has_handoff || (!rt_task(waiter->task) &&
!time_after(jiffies, waiter->timeout)))
return false;
@@ -651,11 +646,12 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
/*
- * We have either acquired the lock with handoff bit cleared or
- * set the handoff bit.
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
*/
if (new & RWSEM_FLAG_HANDOFF) {
- waiter->handoff_set = true;
+ first->handoff_set = true;
lockevent_inc(rwsem_wlock_handoff);
return false;
}
@@ -717,7 +713,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
return false;
}
- preempt_disable();
/*
* Disable preemption is equal to the RCU read-side crital section,
* thus the task_strcut structure won't go away.
@@ -729,7 +724,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
if ((flags & RWSEM_NONSPINNABLE) ||
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !ret);
return ret;
@@ -829,8 +823,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
int loop = 0;
u64 rspin_threshold = 0;
- preempt_disable();
-
/* sem->wait_lock should not be held when doing optimistic spinning */
if (!osq_lock(&sem->osq))
goto done;
@@ -938,7 +930,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
}
osq_unlock(&sem->osq);
done:
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !taken);
return taken;
}
@@ -1092,7 +1083,7 @@ queue:
/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
break;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_reader);
}
@@ -1179,15 +1170,12 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
if (waiter.handoff_set) {
enum owner_state owner_state;
- preempt_disable();
owner_state = rwsem_spin_on_owner(sem);
- preempt_enable();
-
if (owner_state == OWNER_NULL)
goto trylock_again;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_writer);
set_current_state(state);
trylock_again:
@@ -1254,14 +1242,20 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
*/
static inline int __down_read_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
long count;
+ preempt_disable();
if (!rwsem_read_trylock(sem, &count)) {
- if (IS_ERR(rwsem_down_read_slowpath(sem, count, state)))
- return -EINTR;
+ if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
+ ret = -EINTR;
+ goto out;
+ }
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
}
- return 0;
+out:
+ preempt_enable();
+ return ret;
}
static inline void __down_read(struct rw_semaphore *sem)
@@ -1281,19 +1275,23 @@ static inline int __down_read_killable(struct rw_semaphore *sem)
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
+ int ret = 0;
long tmp;
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ preempt_disable();
tmp = atomic_long_read(&sem->count);
while (!(tmp & RWSEM_READ_FAILED_MASK)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
tmp + RWSEM_READER_BIAS)) {
rwsem_set_reader_owned(sem);
- return 1;
+ ret = 1;
+ break;
}
}
- return 0;
+ preempt_enable();
+ return ret;
}
/*
@@ -1301,12 +1299,15 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
*/
static inline int __down_write_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
+
+ preempt_disable();
if (unlikely(!rwsem_write_trylock(sem))) {
if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
- return -EINTR;
+ ret = -EINTR;
}
-
- return 0;
+ preempt_enable();
+ return ret;
}
static inline void __down_write(struct rw_semaphore *sem)
@@ -1321,8 +1322,14 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
+ int ret;
+
+ preempt_disable();
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
- return rwsem_write_trylock(sem);
+ ret = rwsem_write_trylock(sem);
+ preempt_enable();
+
+ return ret;
}
/*
@@ -1335,6 +1342,7 @@ static inline void __up_read(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ preempt_disable();
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
@@ -1343,6 +1351,7 @@ static inline void __up_read(struct rw_semaphore *sem)
clear_nonspinnable(sem);
rwsem_wake(sem);
}
+ preempt_enable();
}
/*
@@ -1363,9 +1372,9 @@ static inline void __up_write(struct rw_semaphore *sem)
preempt_disable();
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
- preempt_enable();
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
rwsem_wake(sem);
+ preempt_enable();
}
/*
@@ -1383,11 +1392,13 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
* write side. As such, rely on RELEASE semantics.
*/
DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ preempt_disable();
tmp = atomic_long_fetch_add_release(
-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
rwsem_set_reader_owned(sem);
if (tmp & RWSEM_FLAG_WAITERS)
rwsem_downgrade_wake(sem);
+ preempt_enable();
}
#else /* !CONFIG_PREEMPT_RT */
@@ -1662,6 +1673,12 @@ void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
__down_read(sem);
+ /*
+ * The owner value for a reader-owned lock is mostly for debugging
+ * purpose only and is not critical to the correct functioning of
+ * rwsem. So it is perfectly fine to set it in a preempt-enabled
+ * context here.
+ */
__rwsem_set_reader_owned(sem, NULL);
}
EXPORT_SYMBOL(down_read_non_owner);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 48568a0f5651..4ac3fe43e6c8 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2393,7 +2393,8 @@ static bool finished_loading(const char *name)
sched_annotate_sleep();
mutex_lock(&module_mutex);
mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE;
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
mutex_unlock(&module_mutex);
return ret;
@@ -2569,20 +2570,35 @@ static int add_unformed_module(struct module *mod)
mod->state = MODULE_STATE_UNFORMED;
-again:
mutex_lock(&module_mutex);
old = find_module_all(mod->name, strlen(mod->name), true);
if (old != NULL) {
- if (old->state != MODULE_STATE_LIVE) {
+ if (old->state == MODULE_STATE_COMING
+ || old->state == MODULE_STATE_UNFORMED) {
/* Wait in case it fails to load. */
mutex_unlock(&module_mutex);
err = wait_event_interruptible(module_wq,
finished_loading(mod->name));
if (err)
goto out_unlocked;
- goto again;
+
+ /* The module might have gone in the meantime. */
+ mutex_lock(&module_mutex);
+ old = find_module_all(mod->name, strlen(mod->name),
+ true);
}
- err = -EEXIST;
+
+ /*
+ * We are here only when the same module was being loaded. Do
+ * not try to load it again right now. It prevents long delays
+ * caused by serialized module load failures. It might happen
+ * when more devices of the same type trigger load of
+ * a particular module.
+ */
+ if (old && old->state == MODULE_STATE_LIVE)
+ err = -EEXIST;
+ else
+ err = -EBUSY;
goto out;
}
mod_update_bounds(mod);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7decf1e9c486..a5ed2e53547c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -123,6 +123,7 @@ bool console_srcu_read_lock_is_held(void)
{
return srcu_read_lock_held(&console_srcu);
}
+EXPORT_SYMBOL(console_srcu_read_lock_is_held);
#endif
enum devkmsg_log_bits {
@@ -1891,6 +1892,7 @@ static void console_lock_spinning_enable(void)
/**
* console_lock_spinning_disable_and_check - mark end of code where another
* thread was able to busy wait and check if there is a waiter
+ * @cookie: cookie returned from console_srcu_read_lock()
*
* This is called at the end of the section where spinning is allowed.
* It has two functions. First, it is a signal that it is no longer
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bb1ee6d7bdde..2a4918a1faa9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2951,8 +2951,11 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
}
if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
- if (cpumask_equal(&p->cpus_mask, ctx->new_mask))
+ if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
goto out;
+ }
if (WARN_ON_ONCE(p == current &&
is_migration_disabled(p) &&
@@ -8290,12 +8293,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
if (retval)
goto out_put_task;
+ /*
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
+ * alloc_user_cpus_ptr() returns NULL.
+ */
user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
- if (IS_ENABLED(CONFIG_SMP) && !user_mask) {
+ if (user_mask) {
+ cpumask_copy(user_mask, in_mask);
+ } else if (IS_ENABLED(CONFIG_SMP)) {
retval = -ENOMEM;
goto out_put_task;
}
- cpumask_copy(user_mask, in_mask);
+
ac = (struct affinity_context){
.new_mask = in_mask,
.user_mask = user_mask,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c36aa54ae071..0f8736991427 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7229,10 +7229,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
eenv_task_busy_time(&eenv, p, prev_cpu);
for (; pd; pd = pd->next) {
+ unsigned long util_min = p_util_min, util_max = p_util_max;
unsigned long cpu_cap, cpu_thermal_cap, util;
unsigned long cur_delta, max_spare_cap = 0;
unsigned long rq_util_min, rq_util_max;
- unsigned long util_min, util_max;
unsigned long prev_spare_cap = 0;
int max_spare_cap_cpu = -1;
unsigned long base_energy;
@@ -7251,6 +7251,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
eenv.pd_cap = 0;
for_each_cpu(cpu, cpus) {
+ struct rq *rq = cpu_rq(cpu);
+
eenv.pd_cap += cpu_thermal_cap;
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
@@ -7269,24 +7271,19 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* much capacity we can get out of the CPU; this is
* aligned with sched_cpu_util().
*/
- if (uclamp_is_used()) {
- if (uclamp_rq_is_idle(cpu_rq(cpu))) {
- util_min = p_util_min;
- util_max = p_util_max;
- } else {
- /*
- * Open code uclamp_rq_util_with() except for
- * the clamp() part. Ie: apply max aggregation
- * only. util_fits_cpu() logic requires to
- * operate on non clamped util but must use the
- * max-aggregated uclamp_{min, max}.
- */
- rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
- rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
-
- util_min = max(rq_util_min, p_util_min);
- util_max = max(rq_util_max, p_util_max);
- }
+ if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
+ /*
+ * Open code uclamp_rq_util_with() except for
+ * the clamp() part. Ie: apply max aggregation
+ * only. util_fits_cpu() logic requires to
+ * operate on non clamped util but must use the
+ * max-aggregated uclamp_{min, max}.
+ */
+ rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
+
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
}
if (!util_fits_cpu(util, util_min, util_max, cpu))
continue;
@@ -8871,16 +8868,23 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
* * Thermal pressure will impact all cpus in this perf domain
* equally.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ if (sched_energy_enabled()) {
unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
- struct perf_domain *pd = rcu_dereference(rq->rd->pd);
+ struct perf_domain *pd;
+ rcu_read_lock();
+
+ pd = rcu_dereference(rq->rd->pd);
rq->cpu_capacity_inverted = 0;
for (; pd; pd = pd->next) {
struct cpumask *pd_span = perf_domain_span(pd);
unsigned long pd_cap_orig, pd_cap;
+ /* We can't be inverted against our own pd */
+ if (cpumask_test_cpu(cpu_of(rq), pd_span))
+ continue;
+
cpu = cpumask_any(pd_span);
pd_cap_orig = arch_scale_cpu_capacity(cpu);
@@ -8905,6 +8909,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
break;
}
}
+
+ rcu_read_unlock();
}
trace_sched_cpu_capacity_tp(rq);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8ac8b81bfee6..02e011cabe91 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1343,10 +1343,11 @@ void psi_trigger_destroy(struct psi_trigger *t)
group = t->group;
/*
- * Wakeup waiters to stop polling. Can happen if cgroup is deleted
- * from under a polling process.
+ * Wakeup waiters to stop polling and clear the queue to prevent it from
+ * being accessed later. Can happen if cgroup is deleted from under a
+ * polling process.
*/
- wake_up_interruptible(&t->event_wait);
+ wake_up_pollfree(&t->event_wait);
mutex_lock(&group->trigger_lock);
diff --git a/kernel/sys.c b/kernel/sys.c
index 5fd54bf0e886..88b31f096fb2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1442,6 +1442,8 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource,
if (resource >= RLIM_NLIMITS)
return -EINVAL;
+ resource = array_index_nospec(resource, RLIM_NLIMITS);
+
if (new_rlim) {
if (new_rlim->rlim_cur > new_rlim->rlim_max)
return -EINVAL;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5897828b9d7e..7e5dff602585 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -470,11 +470,35 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
}
EXPORT_SYMBOL_GPL(alarm_forward);
-u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle)
{
struct alarm_base *base = &alarm_bases[alarm->type];
+ ktime_t now = base->get_ktime();
+
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) {
+ /*
+ * Same issue as with posix_timer_fn(). Timers which are
+ * periodic but the signal is ignored can starve the system
+ * with a very small interval. The real fix which was
+ * promised in the context of posix_timer_fn() never
+ * materialized, but someone should really work on it.
+ *
+ * To prevent DOS fake @now to be 1 jiffie out which keeps
+ * the overrun accounting correct but creates an
+ * inconsistency vs. timer_gettime(2).
+ */
+ ktime_t kj = NSEC_PER_SEC / HZ;
+
+ if (interval < kj)
+ now = ktime_add(now, kj);
+ }
+
+ return alarm_forward(alarm, now, interval);
+}
- return alarm_forward(alarm, base->get_ktime(), interval);
+u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+{
+ return __alarm_forward_now(alarm, interval, false);
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
@@ -551,9 +575,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
/*
* Handle ignored signals and rearm the timer. This will go
- * away once we handle ignored signals proper.
+ * away once we handle ignored signals proper. Ensure that
+ * small intervals cannot starve the system.
*/
- ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval);
+ ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true);
++ptr->it_requeue_pending;
ptr->it_active = 1;
result = ALARMTIMER_RESTART;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 197545241ab8..d7043043f59c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -933,8 +933,8 @@ config RING_BUFFER_RECORD_RECURSION
default y
help
The ring buffer has its own internal recursion. Although when
- recursion happens it wont cause harm because of the protection,
- but it does cause an unwanted overhead. Enabling this option will
+ recursion happens it won't cause harm because of the protection,
+ but it does cause unwanted overhead. Enabling this option will
place where recursion was detected into the ftrace "recursed_functions"
file.
@@ -1017,8 +1017,8 @@ config RING_BUFFER_STARTUP_TEST
The test runs for 10 seconds. This will slow your boot time
by at least 10 more seconds.
- At the end of the test, statics and more checks are done.
- It will output the stats of each per cpu buffer. What
+ At the end of the test, statistics and more checks are done.
+ It will output the stats of each per cpu buffer: What
was written, the sizes, what was read, what was lost, and
other similar details.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 918a7d12df8f..5743be559415 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -320,8 +320,8 @@ static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
* under 'q->debugfs_dir', thus lookup and remove them.
*/
if (!bt->dir) {
- debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir));
- debugfs_remove(debugfs_lookup("msg", q->debugfs_dir));
+ debugfs_lookup_and_remove("dropped", q->debugfs_dir);
+ debugfs_lookup_and_remove("msg", q->debugfs_dir);
} else {
debugfs_remove(bt->dir);
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ad37608afc35..b8ac8b09c86f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -831,6 +831,7 @@ static void do_bpf_send_signal(struct irq_work *entry)
work = container_of(entry, struct send_signal_irq_work, irq_work);
group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
+ put_task_struct(work->task);
}
static int bpf_send_signal_common(u32 sig, enum pid_type type)
@@ -846,6 +847,9 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
+ /* Task should not be pid=1 to avoid kernel panic. */
+ if (unlikely(is_global_init(current)))
+ return -EPERM;
if (irqs_disabled()) {
/* Do an early check on signal validity. Otherwise,
@@ -862,7 +866,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
* to the irq_work. The current task may change when queued
* irq works get executed.
*/
- work->task = current;
+ work->task = get_task_struct(current);
work->sig = sig;
work->type = type;
irq_work_queue(&work->irq_work);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 442438b93fe9..750aa3f08b25 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1248,12 +1248,17 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
call_rcu(&hash->rcu, __free_ftrace_hash_rcu);
}
+/**
+ * ftrace_free_filter - remove all filters for an ftrace_ops
+ * @ops - the ops to remove the filters from
+ */
void ftrace_free_filter(struct ftrace_ops *ops)
{
ftrace_ops_init(ops);
free_ftrace_hash(ops->func_hash->filter_hash);
free_ftrace_hash(ops->func_hash->notrace_hash);
}
+EXPORT_SYMBOL_GPL(ftrace_free_filter);
static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
{
@@ -5839,6 +5844,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi);
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ip is NULL, it fails to update filter.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
@@ -5858,7 +5867,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ips array or any ip specified within is NULL , it fails to update filter.
- */
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
+*/
int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,
unsigned int cnt, int remove, int reset)
{
@@ -5900,6 +5913,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
*
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
@@ -5919,6 +5936,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
* Notrace Filters denote which functions should not be enabled when tracing
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
* for tracing.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 6c97cc2d754a..7e9061828c24 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -516,7 +516,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
struct rv_monitor_def *mdef;
int retval = -EINVAL;
bool enable = true;
- char *ptr = buff;
+ char *ptr;
int len;
if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a555a861b978..c9e40f692650 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9148,9 +9148,6 @@ buffer_percent_write(struct file *filp, const char __user *ubuf,
if (val > 100)
return -EINVAL;
- if (!val)
- val = 1;
-
tr->buffer_percent = val;
(*ppos)++;
@@ -10295,6 +10292,8 @@ void __init early_trace_init(void)
static_key_enable(&tracepoint_printk_key.key);
}
tracer_alloc_buffers();
+
+ init_events();
}
void __init trace_init(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e46a49269be2..085a31b978a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1282,6 +1282,7 @@ struct ftrace_event_field {
int offset;
int size;
int is_signed;
+ int len;
};
struct prog_entry;
@@ -1490,6 +1491,7 @@ extern void trace_event_enable_cmd_record(bool enable);
extern void trace_event_enable_tgid_record(bool enable);
extern int event_trace_init(void);
+extern int init_events(void);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
extern void __trace_early_add_events(struct trace_array *tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 33e0b4f8ebe6..6a942fa275c7 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -114,7 +114,7 @@ trace_find_event_field(struct trace_event_call *call, char *name)
static int __trace_define_field(struct list_head *head, const char *type,
const char *name, int offset, int size,
- int is_signed, int filter_type)
+ int is_signed, int filter_type, int len)
{
struct ftrace_event_field *field;
@@ -133,6 +133,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
field->offset = offset;
field->size = size;
field->is_signed = is_signed;
+ field->len = len;
list_add(&field->link, head);
@@ -150,14 +151,28 @@ int trace_define_field(struct trace_event_call *call, const char *type,
head = trace_get_fields(call);
return __trace_define_field(head, type, name, offset, size,
- is_signed, filter_type);
+ is_signed, filter_type, 0);
}
EXPORT_SYMBOL_GPL(trace_define_field);
+static int trace_define_field_ext(struct trace_event_call *call, const char *type,
+ const char *name, int offset, int size, int is_signed,
+ int filter_type, int len)
+{
+ struct list_head *head;
+
+ if (WARN_ON(!call->class))
+ return 0;
+
+ head = trace_get_fields(call);
+ return __trace_define_field(head, type, name, offset, size,
+ is_signed, filter_type, len);
+}
+
#define __generic_field(type, item, filter_type) \
ret = __trace_define_field(&ftrace_generic_fields, #type, \
#item, 0, 0, is_signed_type(type), \
- filter_type); \
+ filter_type, 0); \
if (ret) \
return ret;
@@ -166,7 +181,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
"common_" #item, \
offsetof(typeof(ent), item), \
sizeof(ent.item), \
- is_signed_type(type), FILTER_OTHER); \
+ is_signed_type(type), FILTER_OTHER, 0); \
if (ret) \
return ret;
@@ -1588,12 +1603,17 @@ static int f_show(struct seq_file *m, void *v)
seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
field->type, field->name, field->offset,
field->size, !!field->is_signed);
- else
- seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ else if (field->len)
+ seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
(int)(array_descriptor - field->type),
field->type, field->name,
- array_descriptor, field->offset,
+ field->len, field->offset,
field->size, !!field->is_signed);
+ else
+ seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ (int)(array_descriptor - field->type),
+ field->type, field->name,
+ field->offset, field->size, !!field->is_signed);
return 0;
}
@@ -2379,9 +2399,10 @@ event_define_fields(struct trace_event_call *call)
}
offset = ALIGN(offset, field->align);
- ret = trace_define_field(call, field->type, field->name,
+ ret = trace_define_field_ext(call, field->type, field->name,
offset, field->size,
- field->is_signed, field->filter_type);
+ field->is_signed, field->filter_type,
+ field->len);
if (WARN_ON_ONCE(ret)) {
pr_err("error code is %d\n", ret);
break;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 96acc2b71ac7..e095c3b3a50d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -128,7 +128,7 @@ static bool is_not(const char *str)
}
/**
- * prog_entry - a singe entry in the filter program
+ * struct prog_entry - a singe entry in the filter program
* @target: Index to jump to on a branch (actually one minus the index)
* @when_to_branch: The value of the result of the predicate to do a branch
* @pred: The predicate to execute.
@@ -140,16 +140,16 @@ struct prog_entry {
};
/**
- * update_preds- assign a program entry a label target
+ * update_preds - assign a program entry a label target
* @prog: The program array
* @N: The index of the current entry in @prog
- * @when_to_branch: What to assign a program entry for its branch condition
+ * @invert: What to assign a program entry for its branch condition
*
* The program entry at @N has a target that points to the index of a program
* entry that can have its target and when_to_branch fields updated.
* Update the current program entry denoted by index @N target field to be
* that of the updated entry. This will denote the entry to update if
- * we are processing an "||" after an "&&"
+ * we are processing an "||" after an "&&".
*/
static void update_preds(struct prog_entry *prog, int N, int invert)
{
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index fcaf226b7744..5edbf6b1da3f 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1988,6 +1988,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 :
HIST_FIELD_FN_BUCKET;
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
+ if (!hist_field->operands[0])
+ goto free;
hist_field->size = hist_field->operands[0]->size;
hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
if (!hist_field->type)
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d960f6b11b5e..58f3946081e2 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -111,7 +111,8 @@ static void __always_unused ____ftrace_check_##name(void) \
#define __array(_type, _item, _len) { \
.type = #_type"["__stringify(_len)"]", .name = #_item, \
.size = sizeof(_type[_len]), .align = __alignof__(_type), \
- is_signed_type(_type), .filter_type = FILTER_OTHER },
+ is_signed_type(_type), .filter_type = FILTER_OTHER, \
+ .len = _len },
#undef __array_desc
#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 94c1b5eb1dc0..210e1f168392 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -147,9 +147,8 @@ static void osnoise_unregister_instance(struct trace_array *tr)
* register/unregister serialization is provided by trace's
* trace_types_lock.
*/
- lockdep_assert_held(&trace_types_lock);
-
- list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ list_for_each_entry_rcu(inst, &osnoise_instances, list,
+ lockdep_is_held(&trace_types_lock)) {
if (inst->tr == tr) {
list_del_rcu(&inst->list);
found = 1;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57a13b61f186..bd475a00f96d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1535,7 +1535,7 @@ static struct trace_event *events[] __initdata = {
NULL
};
-__init static int init_events(void)
+__init int init_events(void)
{
struct trace_event *event;
int i, ret;
@@ -1548,4 +1548,3 @@ __init static int init_events(void)
return 0;
}
-early_initcall(init_events);
diff --git a/kernel/umh.c b/kernel/umh.c
index 850631518665..fbf872c624cb 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -438,21 +438,27 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
- if (wait & UMH_KILLABLE)
- state |= TASK_KILLABLE;
-
if (wait & UMH_FREEZABLE)
state |= TASK_FREEZABLE;
- retval = wait_for_completion_state(&done, state);
- if (!retval)
- goto wait_done;
-
if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_state(&done, state | TASK_KILLABLE);
+ if (!retval)
+ goto wait_done;
+
/* umh_complete() will see NULL and free sub_info */
if (xchg(&sub_info->complete, NULL))
goto unlock;
+
+ /*
+ * fallthrough; in case of -ERESTARTSYS now do uninterruptible
+ * wait_for_completion_state(). Since umh_complete() shall call
+ * complete() in a moment if xchg() above returned NULL, this
+ * uninterruptible wait_for_completion_state() will not block
+ * SIGKILL'ed processes for long.
+ */
}
+ wait_for_completion_state(&done, state);
wait_done:
retval = sub_info->retval;