Diffstat (limited to 'kernel')
37 files changed, 594 insertions, 377 deletions
diff --git a/kernel/async.c b/kernel/async.c index b8d7a663497f..b2c4ba5686ee 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* mark that this task has queued an async job, used by module init */ - current->flags |= PF_USED_ASYNC; - /* schedule for execution */ queue_work_node(node, system_unbound_wq, &entry->work); diff --git a/kernel/audit.c b/kernel/audit.c index e4bbe2c70c26..7690c29d4ee4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -541,20 +541,22 @@ static void kauditd_printk_skb(struct sk_buff *skb) /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record + * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ -static void kauditd_rehold_skb(struct sk_buff *skb) +static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { - /* put the record back in the queue at the same place */ - skb_queue_head(&audit_hold_queue, skb); + /* put the record back in the queue */ + skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record + * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this @@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb) * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ -static void kauditd_hold_skb(struct sk_buff *skb) +static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ - if (!audit_default) { - kfree_skb(skb); - return; + if (!audit_default) + goto drop; + + /* the hold queue is only for when the daemon goes away completely, + * not -EAGAIN failures; if we are in a -EAGAIN state requeue the + * record on the retry queue unless it's full, in which case drop it + */ + if (error == -EAGAIN) { + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + audit_log_lost("kauditd retry queue overflow"); + goto drop; } - /* if we have room, queue the message */ + /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); @@ -585,24 +599,32 @@ static void kauditd_hold_skb(struct sk_buff *skb) /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); +drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record + * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. 
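The kauditd_hold_skb() and kauditd_retry_skb() changes above make every kauditd queue honour audit_backlog_limit, where a limit of 0 still means "unbounded" and anything that cannot be queued is dropped and counted via audit_log_lost(). A minimal user-space sketch of that admission rule (all names below are stand-ins, not kernel symbols):

    #include <stdio.h>

    #define BACKLOG_LIMIT 3          /* 0 would mean "no limit", like audit_backlog_limit */

    static unsigned int queue_len;   /* stand-in for skb_queue_len(&audit_retry_queue) */
    static unsigned int lost;        /* stand-in for the audit_log_lost() accounting    */

    /* Returns 0 if the record was queued, -1 if it had to be dropped. */
    static int queue_or_drop(const char *record)
    {
        if (!BACKLOG_LIMIT || queue_len < BACKLOG_LIMIT) {
            queue_len++;             /* skb_queue_tail() in the kernel */
            printf("queued: %s\n", record);
            return 0;
        }
        lost++;                      /* audit_log_lost("... overflow") */
        printf("dropped: %s (lost=%u)\n", record, lost);
        return -1;
    }

    int main(void)
    {
        for (int i = 0; i < 5; i++) {
            char buf[32];
            snprintf(buf, sizeof(buf), "record %d", i);
            queue_or_drop(buf);
        }
        return 0;
    }
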
*/ -static void kauditd_retry_skb(struct sk_buff *skb) +static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { - /* NOTE: because records should only live in the retry queue for a - * short period of time, before either being sent or moved to the hold - * queue, we don't currently enforce a limit on this queue */ - skb_queue_tail(&audit_retry_queue, skb); + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + + /* we have to drop the record, send it via printk as a last effort */ + kauditd_printk_skb(skb); + audit_log_lost("kauditd retry queue overflow"); + kfree_skb(skb); } /** @@ -640,7 +662,7 @@ static void auditd_reset(const struct auditd_connection *ac) /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) - kauditd_hold_skb(skb); + kauditd_hold_skb(skb, -ECONNREFUSED); } /** @@ -714,16 +736,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), - void (*err_hook)(struct sk_buff *skb)) + void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; - struct sk_buff *skb; + struct sk_buff *skb = NULL; + struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ - while ((skb = skb_dequeue(queue))) { + skb_tail = skb_peek_tail(queue); + while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); @@ -731,7 +755,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, /* can we send to anyone via unicast? 
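In kauditd_send_queue() above, the error hook may now put a record back on the very queue being drained, so the loop snapshots the current tail with skb_peek_tail() and stops once that record has been dequeued; anything re-queued behind it waits for the next pass instead of livelocking the loop. A small linked-list sketch of the same bounded-drain idea (toy types, not kernel code):

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int val; struct node *next; };
    struct queue { struct node *head, *tail; };

    static void enqueue(struct queue *q, int val)
    {
        struct node *n = malloc(sizeof(*n));
        n->val = val;
        n->next = NULL;
        if (q->tail)
            q->tail->next = n;
        else
            q->head = n;
        q->tail = n;
    }

    static struct node *dequeue(struct queue *q)
    {
        struct node *n = q->head;
        if (n) {
            q->head = n->next;
            if (!q->head)
                q->tail = NULL;
        }
        return n;
    }

    int main(void)
    {
        struct queue q = { 0 };
        for (int i = 0; i < 3; i++)
            enqueue(&q, i);

        /* Snapshot the tail before draining, like skb_peek_tail() above. */
        struct node *stop = q.tail;

        for (;;) {
            struct node *n = dequeue(&q);
            if (!n)
                break;
            int is_last = (n == stop);
            printf("sending %d\n", n->val);
            /* Simulate a transient send failure by requeueing the record;
             * thanks to the tail snapshot, this pass still terminates.   */
            enqueue(&q, n->val);
            free(n);
            if (is_last)
                break;
        }

        int remaining = 0;
        for (struct node *n = q.head; n; n = n->next)
            remaining++;
        printf("left for the next pass: %d records\n", remaining);
        return 0;
    }
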
*/ if (!sk) { if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, -ECONNREFUSED); continue; } @@ -745,7 +769,7 @@ retry: rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fce5d43a933f..a83928cbdcb7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -185,7 +185,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask) case AUDITSC_EXECVE: return mask & AUDIT_PERM_EXEC; case AUDITSC_OPENAT2: - return mask & ACC_MODE((u32)((struct open_how *)ctx->argv[2])->flags); + return mask & ACC_MODE((u32)ctx->openat2.flags); default: return 0; } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 06062370c3b8..9e4ecc990647 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -207,7 +207,7 @@ BTF_ID(func, bpf_lsm_socket_socketpair) BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) -BTF_ID(func, bpf_lsm_task_getsecid_subj) +BTF_ID(func, bpf_lsm_current_getsecid_subj) BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 638d7fd7b375..710ba9de12ce 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -104,7 +104,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) } rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, - VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + VM_MAP | VM_USERMAP, PAGE_KERNEL); if (rb) { kmemleak_not_leak(pages); rb->pages = pages; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 49e567209c6b..22c8ae94e4c1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -472,13 +472,14 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, u32, size, u64, flags) { struct pt_regs *regs; - long res; + long res = -EINVAL; if (!try_get_task_stack(task)) return -EFAULT; regs = task_pt_regs(task); - res = __bpf_get_stack(regs, task, NULL, buf, size, flags); + if (regs) + res = __bpf_get_stack(regs, task, NULL, buf, size, flags); put_task_stack(task); return res; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 4b6974a195c1..5e7edf913060 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -550,11 +550,12 @@ static __always_inline u64 notrace bpf_prog_start_time(void) static void notrace inc_misses_counter(struct bpf_prog *prog) { struct bpf_prog_stats *stats; + unsigned int flags; stats = this_cpu_ptr(prog->stats); - u64_stats_update_begin(&stats->syncp); + flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); - u64_stats_update_end(&stats->syncp); + u64_stats_update_end_irqrestore(&stats->syncp, flags); } /* The logic is similar to bpf_prog_run(), but with an explicit diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 41e0837a5a0b..0e877dbcfeea 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -549,6 +549,14 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. 
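Further up in this hunk, audit_match_perm() for AUDITSC_OPENAT2 now takes the open flags from the context's saved openat2 data instead of re-casting a syscall argument, and masks them through ACC_MODE(). A user-space approximation of that flags-to-permission-mask mapping; the MAY_* values match my reading of include/linux/fs.h, but treat them as illustrative:

    #include <fcntl.h>
    #include <stdio.h>

    /* Kernel-style permission bits (values believed to match fs.h). */
    #define MAY_EXEC  0x1
    #define MAY_WRITE 0x2
    #define MAY_READ  0x4

    /* Approximation of the kernel's ACC_MODE(): O_ACCMODE -> MAY_* bits. */
    static unsigned int acc_mode(unsigned int flags)
    {
        static const unsigned char mode[4] = {
            [O_RDONLY] = MAY_READ,
            [O_WRONLY] = MAY_WRITE,
            [O_RDWR]   = MAY_READ | MAY_WRITE,
            [3]        = MAY_READ | MAY_WRITE,   /* historical quirk */
        };
        return mode[flags & O_ACCMODE];
    }

    int main(void)
    {
        printf("O_RDONLY -> %#x\n", acc_mode(O_RDONLY));
        printf("O_WRONLY -> %#x\n", acc_mode(O_WRONLY));
        printf("O_RDWR   -> %#x\n", acc_mode(O_RDWR));
        return 0;
    }
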
+ */ + if ((of->file->f_cred->user_ns != &init_user_ns) || + !capable(CAP_SYS_ADMIN)) + return -EPERM; + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -954,6 +962,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) + return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b31e1465868a..9d05c3ca2d5e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3643,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); + /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { @@ -3650,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); } - psi_trigger_replace(&ctx->psi.trigger, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; @@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; - psi_trigger_replace(&ctx->psi.trigger, NULL); + psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index dc653ab26e50..4c7254e8f49a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -591,6 +591,35 @@ static inline void free_cpuset(struct cpuset *cs) } /* + * validate_change_legacy() - Validate conditions specific to legacy (v1) + * behavior. + */ +static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} + +/* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. * @@ -614,20 +643,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; - int ret; - - /* The checks don't apply to root cpuset */ - if (cur == &top_cpuset) - return 0; + int ret = 0; rcu_read_lock(); - par = parent_cs(cur); - /* On legacy hierarchy, we must be a subset of our parent cpuset. 
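cgroup_pressure_write() above now allows exactly one PSI trigger per file descriptor (-EBUSY otherwise) and publishes the fully initialised trigger with smp_store_release(), pairing with the smp_load_acquire() readers later in the series. A C11 sketch of that publish-once pattern; unlike the kernel code, which serialises writers via the open-file context, this toy assumes a single publishing thread:

    #include <errno.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct trigger { int threshold_us; int window_us; };

    /* One slot per "file descriptor"; NULL until a trigger is published. */
    static _Atomic(struct trigger *) slot;

    static int publish_trigger(int threshold_us, int window_us)
    {
        /* Allow only one trigger per slot, like the -EBUSY check above. */
        if (atomic_load_explicit(&slot, memory_order_acquire))
            return -EBUSY;

        struct trigger *t = malloc(sizeof(*t));
        if (!t)
            return -ENOMEM;
        t->threshold_us = threshold_us;
        t->window_us = window_us;

        /* Release store: readers that acquire-load the pointer are
         * guaranteed to see the fully initialised fields above.      */
        atomic_store_explicit(&slot, t, memory_order_release);
        return 0;
    }

    static void poll_trigger(void)
    {
        struct trigger *t = atomic_load_explicit(&slot, memory_order_acquire);
        if (t)
            printf("trigger: %d us over %d us\n", t->threshold_us, t->window_us);
    }

    int main(void)
    {
        publish_trigger(150000, 1000000);
        printf("second publish -> %d\n", publish_trigger(1, 1)); /* -EBUSY */
        poll_trigger();
        return 0;
    }
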
*/ - ret = -EACCES; - if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) + if (!is_in_v2_mode()) + ret = validate_change_legacy(cur, trial); + if (ret) + goto out; + + /* Remaining checks don't apply to root cpuset */ + if (cur == &top_cpuset) goto out; + par = parent_cs(cur); + /* * If either I or some sibling (!= me) is exclusive, we can't * overlap @@ -1175,9 +1205,7 @@ enum subparts_cmd { * * Because of the implicit cpu exclusive nature of a partition root, * cpumask changes that violates the cpu exclusivity rule will not be - * permitted when checked by validate_change(). The validate_change() - * function will also prevent any changes to the cpu list if it is not - * a superset of children's cpu lists. + * permitted when checked by validate_change(). */ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpumask *newmask, @@ -1522,10 +1550,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; + percpu_rwsem_assert_held(&cpuset_rwsem); + /* * Check all its siblings and call update_cpumasks_hier() * if their use_parent_ecpus flag is set in order for them * to use the right effective_cpus value. + * + * The update_cpumasks_hier() function may sleep. So we have to + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -1533,8 +1566,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; if (!sibling->use_parent_ecpus) continue; + if (!css_tryget_online(&sibling->css)) + continue; + rcu_read_unlock(); update_cpumasks_hier(sibling, tmp); + rcu_read_lock(); + css_put(&sibling->css); } rcu_read_unlock(); } @@ -1607,8 +1645,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Make sure that subparts_cpus is a subset of cpus_allowed. */ if (cs->nr_subparts_cpus) { - cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus, - cs->cpus_allowed); + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); } spin_unlock_irq(&callback_lock); diff --git a/kernel/cred.c b/kernel/cred.c index 473d17c431f3..933155c96922 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -665,21 +665,16 @@ EXPORT_SYMBOL(cred_fscmp); int set_cred_ucounts(struct cred *new) { - struct task_struct *task = current; - const struct cred *old = task->real_cred; struct ucounts *new_ucounts, *old_ucounts = new->ucounts; - if (new->user == old->user && new->user_ns == old->user_ns) - return 0; - /* * This optimization is needed because alloc_ucounts() uses locks * for table lookups. 
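The update_cpumask() change above swaps cpumask_andnot() for cpumask_and(): the goal is to keep only the subpartition CPUs that remain in the new cpus_allowed, and andnot kept exactly the opposite set. A bitmask toy showing the difference (the masks are made-up values):

    #include <stdio.h>

    int main(void)
    {
        /* Bit i set = CPU i present (toy stand-in for a struct cpumask). */
        unsigned int subparts_cpus = 0x3c;  /* CPUs 2-5 given to subpartitions */
        unsigned int cpus_allowed  = 0x0f;  /* new cpus_allowed: CPUs 0-3      */

        unsigned int old_result = subparts_cpus & ~cpus_allowed; /* andnot */
        unsigned int new_result = subparts_cpus &  cpus_allowed; /* and    */

        printf("andnot kept %#x (CPUs no longer allowed)\n", old_result);
        printf("and    keeps %#x (the subset that must remain)\n", new_result);
        return 0;
    }
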
*/ - if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) + if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid)) return 0; - if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid))) + if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid))) return -EAGAIN; new->ucounts = new_ucounts; diff --git a/kernel/events/core.c b/kernel/events/core.c index fc18664f49b0..6859229497b1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -674,6 +674,23 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state) WRITE_ONCE(event->state, state); } +/* + * UP store-release, load-acquire + */ + +#define __store_release(ptr, val) \ +do { \ + barrier(); \ + WRITE_ONCE(*(ptr), (val)); \ +} while (0) + +#define __load_acquire(ptr) \ +({ \ + __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ + barrier(); \ + ___p; \ +}) + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -719,34 +736,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return t->time; } -static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { - struct perf_cgroup_info *info; - u64 now; - - now = perf_clock(); + struct perf_cgroup_info *t; - info = this_cpu_ptr(cgrp->info); + t = per_cpu_ptr(event->cgrp->info, event->cpu); + if (!__load_acquire(&t->active)) + return t->time; + now += READ_ONCE(t->timeoffset); + return now; +} - info->time += now - info->timestamp; +static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +{ + if (adv) + info->time += now - info->timestamp; info->timestamp = now; + /* + * see update_context_time() + */ + WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; + struct perf_cgroup_info *info; if (cgrp) { + u64 now = perf_clock(); + for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); - __update_cgrp_time(cgrp); + info = this_cpu_ptr(cgrp->info); + + __update_cgrp_time(info, now, true); + if (final) + __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { + struct perf_cgroup_info *info; struct perf_cgroup *cgrp; /* @@ -760,8 +794,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - __update_cgrp_time(event->cgrp); + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) { + info = this_cpu_ptr(event->cgrp->info); + __update_cgrp_time(info, perf_clock(), true); + } } static inline void @@ -785,7 +821,8 @@ perf_cgroup_set_timestamp(struct task_struct *task, for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + __update_cgrp_time(info, ctx->timestamp, false); + __store_release(&info->active, 1); } } @@ -802,7 +839,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); */ static void perf_cgroup_switch(struct task_struct *task, int mode) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_context *cpuctx, *tmp; struct list_head *list; unsigned 
long flags; @@ -813,7 +850,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_save(flags); list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); perf_ctx_lock(cpuctx, cpuctx->task_ctx); @@ -982,14 +1019,6 @@ out: } static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ - struct perf_cgroup_info *t; - t = per_cpu_ptr(event->cgrp->info, event->cpu); - event->shadow_ctx_time = now - t->timestamp; -} - -static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; @@ -1066,7 +1095,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) { } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, + bool final) { } @@ -1098,12 +1128,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +static inline u64 perf_cgroup_event_time(struct perf_event *event) { + return 0; } -static inline u64 perf_cgroup_event_time(struct perf_event *event) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } @@ -1525,22 +1555,59 @@ static void perf_unpin_context(struct perf_event_context *ctx) /* * Update the record of the current time in a context. */ -static void update_context_time(struct perf_event_context *ctx) +static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); - ctx->time += now - ctx->timestamp; + if (adv) + ctx->time += now - ctx->timestamp; ctx->timestamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +} + +static void update_context_time(struct perf_event_context *ctx) +{ + __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + if (unlikely(!ctx)) + return 0; + if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx ? 
ctx->time : 0; + return ctx->time; +} + +static u64 perf_event_time_now(struct perf_event *event, u64 now) +{ + struct perf_event_context *ctx = event->ctx; + + if (unlikely(!ctx)) + return 0; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time_now(event, now); + + if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) + return ctx->time; + + now += READ_ONCE(ctx->timeoffset); + return now; } static enum event_type_t get_event_type(struct perf_event *event) @@ -2350,7 +2417,7 @@ __perf_remove_from_context(struct perf_event *event, if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, false); } event_sched_out(event, cpuctx, ctx); @@ -2361,6 +2428,9 @@ __perf_remove_from_context(struct perf_event *event, list_del_event(event, ctx); if (!ctx->nr_events && ctx->is_active) { + if (ctx == &cpuctx->ctx) + update_cgrp_time_from_cpuctx(cpuctx, true); + ctx->is_active = 0; ctx->rotate_necessary = 0; if (ctx->task) { @@ -2392,7 +2462,11 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); - if (!ctx->is_active) { + /* + * Cgroup events are per-cpu events, and must IPI because of + * cgrp_cpuctx_list. + */ + if (!ctx->is_active && !is_cgroup_event(event)) { __perf_remove_from_context(event, __get_cpu_context(ctx), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); @@ -2482,40 +2556,6 @@ void perf_event_disable_inatomic(struct perf_event *event) irq_work_queue(&event->pending); } -static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx) -{ - /* - * use the correct time source for the time snapshot - * - * We could get by without this by leveraging the - * fact that to get to this function, the caller - * has most likely already called update_context_time() - * and update_cgrp_time_xx() and thus both timestamp - * are identical (or very close). Given that tstamp is, - * already adjusted for cgroup, we could say that: - * tstamp - ctx->timestamp - * is equivalent to - * tstamp - cgrp->timestamp. - * - * Then, in perf_output_read(), the calculation would - * work with no changes because: - * - event is guaranteed scheduled in - * - no scheduled out in between - * - thus the timestamp would be the same - * - * But this is a bit hairy. - * - * So instead, we have an explicit cgroup call to remain - * within the time source all along. We believe it - * is cleaner and simpler to understand. - */ - if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, event->tstamp); - else - event->shadow_ctx_time = event->tstamp - ctx->timestamp; -} - #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -2556,8 +2596,6 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx); - perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { @@ -2861,11 +2899,14 @@ perf_install_in_context(struct perf_event_context *ctx, * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. + * Similarly, cgroup events for the context also needs the IPI to + * manipulate the cgrp_cpuctx_list. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. 
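perf_event_time_now() and __update_context_time() above cache ctx->timeoffset = time - timestamp so an NMI-safe reader can compute the event time as now + timeoffset without taking ctx->lock. A small monotonic-clock sketch of that rearrangement (struct and field names invented for the example):

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    struct ctx_time {
        uint64_t time;        /* accumulated enabled time       */
        uint64_t timestamp;   /* clock value at the last update */
        int64_t  timeoffset;  /* cached: time - timestamp       */
    };

    static uint64_t now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
    }

    /* Writer side: advance the accumulated time and refresh the offset. */
    static void update_time(struct ctx_time *c)
    {
        uint64_t now = now_ns();
        c->time += now - c->timestamp;
        c->timestamp = now;
        /* time' = time + (now - timestamp)  ==  now + (time - timestamp) */
        c->timeoffset = (int64_t)(c->time - c->timestamp);
    }

    /* Lock-free reader side: one load of the offset plus "now". */
    static uint64_t read_time(const struct ctx_time *c)
    {
        return now_ns() + c->timeoffset;
    }

    int main(void)
    {
        struct ctx_time c = { .timestamp = now_ns() };
        for (int i = 0; i < 3; i++) {
            update_time(&c);
            printf("locked view %llu ns, lock-free view %llu ns\n",
                   (unsigned long long)c.time,
                   (unsigned long long)read_time(&c));
        }
        return 0;
    }
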
*/ - if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) { + if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && + ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); @@ -3197,6 +3238,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, return err; } +/* + * Copy event-type-independent attributes that may be modified. + */ +static void perf_event_modify_copy_attr(struct perf_event_attr *to, + const struct perf_event_attr *from) +{ + to->sig_data = from->sig_data; +} + static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { @@ -3219,10 +3269,17 @@ static int perf_event_modify_attr(struct perf_event *event, WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); + /* + * Event-type-independent attributes must be copied before event-type + * modification, which will validate that final attributes match the + * source attributes after all relevant attributes have been copied. + */ + perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { + perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; @@ -3251,16 +3308,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, return; } - ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - - if (ctx->task) { - WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) - cpuctx->task_ctx = NULL; - } - /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last @@ -3274,7 +3321,22 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (is_active & EVENT_TIME) { /* update (and stop) ctx time */ update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + + ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ @@ -3711,13 +3773,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, return 0; } +/* + * Because the userpage is strictly per-event (there is no concept of context, + * so there cannot be a context indirection), every userpage must be updated + * when context time starts :-( + * + * IOW, we must not miss EVENT_TIME edges. 
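The __store_release()/__load_acquire() helpers added earlier, and the bare barrier() calls around the EVENT_TIME/is_active transitions, rely only on compiler ordering because the lock-free reader runs on the same CPU as the writer (an NMI, in perf's case). The closest portable analogue is C11's atomic_signal_fence(), which orders a thread against its own signal handler; a sketch:

    #include <signal.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static int payload;                      /* data published to the "NMI"  */
    static volatile sig_atomic_t active;     /* flag checked in the handler  */
    static volatile sig_atomic_t seen = -1;  /* value observed by the handler */

    static void handler(int sig)
    {
        (void)sig;
        if (active) {
            /* Acquire side: nothing below may be hoisted above the check. */
            atomic_signal_fence(memory_order_acquire);
            seen = payload;
        }
    }

    int main(void)
    {
        signal(SIGUSR1, handler);

        payload = 42;
        /* Release side: the payload store may not sink below the flag store.
         * A compiler-only fence suffices because the handler runs on this
         * same CPU, mirroring barrier() in __store_release() above.        */
        atomic_signal_fence(memory_order_release);
        active = 1;

        raise(SIGUSR1);
        printf("handler saw payload=%d\n", (int)seen);
        return 0;
    }
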
+ */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_update_userpage(event); return true; @@ -3801,13 +3869,23 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task) { int is_active = ctx->is_active; - u64 now; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; + if (is_active ^ EVENT_TIME) { + /* start ctx time */ + __update_context_time(ctx, false); + perf_cgroup_set_timestamp(task, ctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) @@ -3818,13 +3896,6 @@ ctx_sched_in(struct perf_event_context *ctx, is_active ^= ctx->is_active; /* changed bits */ - if (is_active & EVENT_TIME) { - /* start ctx time */ - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); - } - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -4418,6 +4489,18 @@ static inline u64 perf_event_count(struct perf_event *event) return local64_read(&event->count) + atomic64_read(&event->child_count); } +static void calc_timer_values(struct perf_event *event, + u64 *now, + u64 *enabled, + u64 *running) +{ + u64 ctx_time; + + *now = perf_clock(); + ctx_time = perf_event_time_now(event, *now); + __perf_update_times(event, ctx_time, enabled, running); +} + /* * NMI-safe method to read a local event, that is an event that * is: @@ -4477,10 +4560,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value, *value = local64_read(&event->count); if (enabled || running) { - u64 now = event->shadow_ctx_time + perf_clock(); - u64 __enabled, __running; + u64 __enabled, __running, __now;; - __perf_update_times(event, now, &__enabled, &__running); + calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) @@ -5802,18 +5884,6 @@ static int perf_event_index(struct perf_event *event) return event->pmu->event_idx(event); } -static void calc_timer_values(struct perf_event *event, - u64 *now, - u64 *enabled, - u64 *running) -{ - u64 ctx_time; - - *now = perf_clock(); - ctx_time = event->shadow_ctx_time + *now; - __perf_update_times(event, ctx_time, enabled, running); -} - static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; @@ -5938,6 +6008,8 @@ static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *old_rb = NULL; unsigned long flags; + WARN_ON_ONCE(event->parent); + if (event->rb) { /* * Should be impossible, we set this when removing @@ -5995,6 +6067,9 @@ static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -6008,6 +6083,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -6353,7 +6431,6 @@ accounting: ring_buffer_attach(event, rb); perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { @@ -6717,7 +6794,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event, if 
(WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; - rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) goto out; @@ -6783,7 +6860,7 @@ static void perf_aux_sample_output(struct perf_event *event, if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; - rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) return; diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b21..a024bf6254df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2021,18 +2021,18 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif + retval = copy_creds(p, clone_flags); + if (retval < 0) + goto bad_fork_free; + retval = -EAGAIN; if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) - goto bad_fork_free; + goto bad_fork_cleanup_count; } current->flags &= ~PF_NPROC_EXCEEDED; - retval = copy_creds(p, clone_flags); - if (retval < 0) - goto bad_fork_free; - /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there @@ -2267,6 +2267,17 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_put_pidfd; /* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. + * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p, args); + + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. 
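In the copy_process() hunk above, copy_creds() is moved ahead of the RLIMIT_NPROC check so that a failure there unwinds through bad_fork_cleanup_count rather than bad_fork_free, i.e. through the label that undoes what copy_creds() set up. The shape is the usual goto-unwind ladder; a toy version with invented resource names:

    #include <stdio.h>

    /* Toy "resources" standing in for creds and the NPROC accounting. */
    static int  get_creds(void)        { puts("get creds");        return 0;  }
    static void put_creds(void)        { puts("put creds");                   }
    static int  check_nproc_limit(void){ puts("check nproc");      return -1; } /* fails */

    static int copy_process_sketch(void)
    {
        int err;

        err = get_creds();
        if (err)
            goto out_free;           /* nothing else to undo yet */

        err = check_nproc_limit();
        if (err)
            goto out_cleanup_creds;  /* must undo the creds we already hold */

        puts("task created");
        return 0;

    out_cleanup_creds:
        put_creds();
    out_free:
        return err;
    }

    int main(void)
    {
        printf("copy_process_sketch() = %d\n", copy_process_sketch());
        return 0;
    }
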
In particular, we do * not want user-space to be able to predict the process start-time by @@ -2323,10 +2334,6 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } - /* past the last point of failure */ - if (pidfile) - fd_install(pidfd, pidfile); - init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2375,8 +2382,11 @@ static __latent_entropy struct task_struct *copy_process( syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + if (pidfile) + fd_install(pidfd, pidfile); + proc_fork_connector(p); - sched_post_fork(p, args); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4a882f83aeb9..f8a0212189ca 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3462,7 +3462,7 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) u16 chain_hlock = chain_hlocks[chain->base + i]; unsigned int class_idx = chain_hlock_class_idx(chain_hlock); - return lock_classes + class_idx - 1; + return lock_classes + class_idx; } /* @@ -3530,7 +3530,7 @@ static void print_chain_keys_chain(struct lock_chain *chain) hlock_id = chain_hlocks[chain->base + i]; chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1); + print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id)); printk("\n"); } } diff --git a/kernel/module.c b/kernel/module.c index 24dab046e16c..46a5c2ed1928 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3725,12 +3725,6 @@ static noinline int do_init_module(struct module *mod) } freeinit->module_init = mod->init_layout.base; - /* - * We want to find out whether @mod uses async during init. Clear - * PF_USED_ASYNC. async_schedule*() will set it. - */ - current->flags &= ~PF_USED_ASYNC; - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3756,22 +3750,13 @@ static noinline int do_init_module(struct module *mod) /* * We need to finish all async code before the module init sequence - * is done. This has potential to deadlock. For example, a newly - * detected block device can trigger request_module() of the - * default iosched from async probing task. Once userland helper - * reaches here, async_synchronize_full() will wait on the async - * task waiting on request_module() and deadlock. - * - * This deadlock is avoided by perfomring async_synchronize_full() - * iff module init queued any async jobs. This isn't a full - * solution as it will deadlock the same if module loading from - * async jobs nests more than once; however, due to the various - * constraints, this hack seems to be the best option for now. - * Please refer to the following thread for details. + * is done. This has potential to deadlock if synchronous module + * loading is requested from async (which is not allowed!). * - * http://thread.gmane.org/gmane.linux.kernel/1420814 + * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous + * request_module() from async workers") for more details. 
*/ - if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) + if (!mod->async_probe_requested) async_synchronize_full(); ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + diff --git a/kernel/module_decompress.c b/kernel/module_decompress.c index b01c69c2ff99..ffef98a20320 100644 --- a/kernel/module_decompress.c +++ b/kernel/module_decompress.c @@ -250,6 +250,7 @@ void module_decompress_cleanup(struct load_info *info) info->max_pages = info->used_pages = 0; } +#ifdef CONFIG_SYSFS static ssize_t compression_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -269,3 +270,4 @@ static int __init module_decompress_sysfs_init(void) return 0; } late_initcall(module_decompress_sysfs_init); +#endif diff --git a/kernel/power/main.c b/kernel/power/main.c index 44169f3081fd..7e646079fbeb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -504,7 +504,10 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; + if (!pm_wakeup_irq()) + return -ENODATA; + + return sprintf(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); diff --git a/kernel/power/process.c b/kernel/power/process.c index b7e7798637b8..11b570fcf049 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -134,7 +134,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(true); + pm_wakeup_clear(0); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f7a986078213..330d49937692 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -978,8 +978,7 @@ static void memory_bm_recycle(struct memory_bitmap *bm) * Register a range of page frames the contents of which should not be saved * during hibernation (to be used in the early initialization code). 
*/ -void __init __register_nosave_region(unsigned long start_pfn, - unsigned long end_pfn, int use_kmalloc) +void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) { struct nosave_region *region; @@ -995,18 +994,12 @@ void __init __register_nosave_region(unsigned long start_pfn, goto Report; } } - if (use_kmalloc) { - /* During init, this shouldn't fail */ - region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); - BUG_ON(!region); - } else { - /* This allocation cannot fail */ - region = memblock_alloc(sizeof(struct nosave_region), - SMP_CACHE_BYTES); - if (!region) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct nosave_region)); - } + /* This allocation cannot fail */ + region = memblock_alloc(sizeof(struct nosave_region), + SMP_CACHE_BYTES); + if (!region) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 80cc1f0f502b..6fcdee7e87a5 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -136,8 +136,6 @@ static void s2idle_loop(void) break; } - pm_wakeup_clear(false); - s2idle_enter(); } diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 105df4dfc783..52571dcad768 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -39,23 +39,20 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) { struct rb_node *node; struct wakelock *wl; - char *str = buf; - char *end = buf + PAGE_SIZE; + int len = 0; mutex_lock(&wakelocks_lock); for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { wl = rb_entry(node, struct wakelock, node); if (wl->ws->active == show_active) - str += scnprintf(str, end - str, "%s ", wl->name); + len += sysfs_emit_at(buf, len, "%s ", wl->name); } - if (str > buf) - str--; - str += scnprintf(str, end - str, "\n"); + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); - return (str - buf); + return len; } #if CONFIG_PM_WAKELOCKS_LIMIT > 0 diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index 653ae04aab7f..c228343eeb97 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -12,7 +12,7 @@ static const int ten_thousand = 10000; static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 84f1d91604cc..d64f0b1d8cd3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -123,7 +123,7 @@ static struct rcu_tasks rt_name = \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ .name = n, \ - .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS), \ + .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS) + 1, \ .percpu_enqueue_lim = 1, \ .percpu_dequeue_lim = 1, \ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \ @@ -216,6 +216,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int cpu; unsigned long flags; int lim; + int shift; raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rcu_task_enqueue_lim < 0) { @@ -229,7 +230,10 @@ static void cblist_init_generic(struct rcu_tasks *rtp) if (lim > nr_cpu_ids) lim = nr_cpu_ids; - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids / lim)); + shift = ilog2(nr_cpu_ids / lim); + if (((nr_cpu_ids - 1) >> shift) >= lim) + shift++; + 
WRITE_ONCE(rtp->percpu_enqueue_shift, shift); WRITE_ONCE(rtp->percpu_dequeue_lim, lim); smp_store_release(&rtp->percpu_enqueue_lim, lim); for_each_possible_cpu(cpu) { @@ -298,7 +302,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, if (unlikely(needadjust)) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim != nr_cpu_ids) { - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1); WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); @@ -413,7 +417,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim > 1) { - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1); smp_store_release(&rtp->percpu_enqueue_lim, 1); rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e4ae00e52d1..9745613d531c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4424,6 +4424,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); + #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4439,18 +4440,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; -#endif + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. 
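cblist_init_generic() above fixes how percpu_enqueue_shift is derived: ilog2() rounds down, so with a non-power-of-two nr_cpu_ids the highest CPUs could map to a callback-queue index >= lim. Assuming the index is computed as cpu >> shift, as the surrounding code suggests, a quick user-space check of the old and new computations:

    #include <stdio.h>

    /* Round-down integer log2, like the kernel's ilog2() on runtime values. */
    static unsigned int ilog2(unsigned int v)
    {
        unsigned int r = 0;
        while (v >>= 1)
            r++;
        return r;
    }

    static unsigned int needed_shift(unsigned int nr_cpu_ids, unsigned int lim)
    {
        unsigned int shift = ilog2(nr_cpu_ids / lim);

        /* If the highest CPU would still land on an index >= lim, widen. */
        if (((nr_cpu_ids - 1) >> shift) >= lim)
            shift++;
        return shift;
    }

    int main(void)
    {
        unsigned int nr_cpu_ids = 6, lim = 2;
        unsigned int old_shift = ilog2(nr_cpu_ids / lim);
        unsigned int new_shift = needed_shift(nr_cpu_ids, lim);

        printf("old shift %u: cpu 5 -> index %u (out of range for lim=%u)\n",
               old_shift, 5u >> old_shift, lim);
        printf("new shift %u: cpu 5 -> index %u\n", new_shift, 5u >> new_shift);
        return 0;
    }
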
+ */ raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED - tg = container_of(kargs->cset->subsys[cpu_cgrp_id], - struct task_group, css); - p->sched_task_group = autogroup_task_group(p, tg); + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } #endif rseq_migrate(p); /* @@ -4461,7 +4467,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} +void sched_post_fork(struct task_struct *p) +{ uclamp_post_fork(p); } @@ -5822,8 +5831,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } if (schedstat_enabled() && rq->core->core_forceidle_count) { - if (cookie) - rq->core->core_forceidle_start = rq_clock(rq->core); + rq->core->core_forceidle_start = rq_clock(rq->core); rq->core->core_forceidle_occupation = occ; } @@ -8219,9 +8227,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; spin_lock(lock); @@ -8239,9 +8245,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { read_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; read_lock(lock); @@ -8259,9 +8263,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { write_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; write_lock(lock); diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 1fb45672ec85..c8746a9a7ada 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -277,7 +277,7 @@ void __sched_core_account_forceidle(struct rq *rq) rq_i = cpu_rq(i); p = rq_i->core_pick ?: rq_i->curr; - if (!p->core_cookie) + if (p == rq_i->idle) continue; __schedstat_add(p->stats.core_forceidle_sum, delta); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 095b0aa378df..5146163bfabb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3028,9 +3028,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u32 divider = get_pelt_divider(&se->avg); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } #else static inline void @@ -3381,7 +3383,6 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } - /* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant @@ -3449,15 +3450,14 @@ void set_task_rq_fair(struct sched_entity *se, * XXX: only do this for the part of runnable > running ? 
* */ - static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3466,23 +3466,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * divider; + new_sum = se->avg.util_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.util_sum; + se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + add_positive(&cfs_rq->avg.util_avg, delta_avg); + add_positive(&cfs_rq->avg.util_sum, delta_sum); + + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3493,19 +3500,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * divider; + new_sum = se->avg.runnable_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.runnable_sum; + se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + add_positive(&cfs_rq->avg.runnable_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); } static inline void update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; unsigned long load_avg; u64 load_sum = 0; + s64 delta_sum; u32 divider; if (!runnable_sum) @@ -3532,7 +3545,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * assuming all tasks are equally runnable. 
*/ if (scale_load_down(gcfs_rq->load.weight)) { - load_sum = div_s64(gcfs_rq->avg.load_sum, + load_sum = div_u64(gcfs_rq->avg.load_sum, scale_load_down(gcfs_rq->load.weight)); } @@ -3549,19 +3562,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); - load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, divider); - - se->avg.load_sum = runnable_sum; + load_sum = se_weight(se) * runnable_sum; + load_avg = div_u64(load_sum, divider); - delta = load_avg - se->avg.load_avg; - if (!delta) + delta_avg = load_avg - se->avg.load_avg; + if (!delta_avg) return; - se->avg.load_avg = load_avg; + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; - add_positive(&cfs_rq->avg.load_avg, delta); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + se->avg.load_sum = runnable_sum; + se->avg.load_avg = load_avg; + add_positive(&cfs_rq->avg.load_avg, delta_avg); + add_positive(&cfs_rq->avg.load_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3652,7 +3668,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed load. + * Return: true if the load decayed or we removed load. * * Since both these conditions indicate a changed cfs_rq->avg.load we should * call update_tg_load_avg() when this function returns true. @@ -3677,15 +3693,32 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_load; sub_positive(&sa->load_avg, r); - sa->load_sum = sa->load_avg * divider; + sub_positive(&sa->load_sum, r * divider); + /* See sa->util_sum below */ + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); r = removed_util; sub_positive(&sa->util_avg, r); - sa->util_sum = sa->util_avg * divider; + sub_positive(&sa->util_sum, r * divider); + /* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); r = removed_runnable; sub_positive(&sa->runnable_avg, r); - sa->runnable_sum = sa->runnable_avg * divider; + sub_positive(&sa->runnable_sum, r * divider); + /* See sa->util_sum above */ + sa->runnable_sum = max_t(u32, sa->runnable_sum, + sa->runnable_avg * PELT_MIN_DIVIDER); /* * removed_runnable is the unweighted version of removed_load so we @@ -3772,17 +3805,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. 
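The update_cfs_rq_load_avg() and update_tg_cfs_*() hunks above stop rebuilding *_sum from *_avg * divider, subtract the matching sum contribution directly, and clamp the result so it never drops below avg * PELT_MIN_DIVIDER. A toy illustration of why the clamp matters, with made-up numbers (LOAD_AVG_MAX taken as 47742 per my reading of the PELT tables, but treat that as an assumption):

    #include <stdio.h>

    #define PELT_MIN_DIVIDER (47742 - 1024)   /* LOAD_AVG_MAX - 1024 */

    /* Saturating subtraction, like the kernel's sub_positive(). */
    static unsigned long sub_positive(unsigned long a, unsigned long b)
    {
        return a > b ? a - b : 0;
    }

    int main(void)
    {
        /* Made-up running averages for a cfs_rq (not real trace data). */
        unsigned long util_avg = 100, util_sum = 100ul * PELT_MIN_DIVIDER + 500;
        /* Detach an entity whose own sum is, due to rounding, a bit large. */
        unsigned long se_util_avg = 60, se_util_sum = 60ul * PELT_MIN_DIVIDER + 2000;

        util_avg = sub_positive(util_avg, se_util_avg);
        util_sum = sub_positive(util_sum, se_util_sum);

        printf("after sub: avg=%lu sum=%lu (below avg*MIN_DIVIDER=%lu)\n",
               util_avg, util_sum, util_avg * PELT_MIN_DIVIDER);

        /* Clamp, as the patch does, so sum stays consistent with avg. */
        if (util_sum < util_avg * PELT_MIN_DIVIDER)
            util_sum = util_avg * PELT_MIN_DIVIDER;

        printf("after clamp: avg=%lu sum=%lu\n", util_avg, util_sum);
        return 0;
    }
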
- */ - u32 divider = get_pelt_divider(&cfs_rq->avg); - dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -8539,6 +8573,8 @@ group_type group_classify(unsigned int imbalance_pct, * * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings * of @dst_cpu are idle and @sg has lower priority. + * + * Return: true if @dst_cpu can pull tasks, false otherwise. */ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, @@ -8614,6 +8650,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. + * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. * @sg_status: Holds flag indicating the status of the sched_group @@ -9421,12 +9458,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. + * @env: The load balancing environment. * * Also calculates the amount of runnable load which should be moved * to restore balance. * - * @env: The load balancing environment. - * * Return: - The busiest group if imbalance exists. 
*/ static struct sched_group *find_busiest_group(struct lb_env *env) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index b5add64d9698..3d2825408e3a 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -147,11 +147,11 @@ #endif #ifdef CONFIG_RSEQ -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \ +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ - | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK) + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ) #else -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 #endif #define MEMBARRIER_CMD_BITMASK \ @@ -159,7 +159,8 @@ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ - | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) + | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK) static void ipi_mb(void *info) { diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index e06071bf3472..c336f5f481bc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -37,9 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif +#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024) + static inline u32 get_pelt_divider(struct sched_avg *avg) { - return LOAD_AVG_MAX - 1024 + avg->period_contrib; + return PELT_MIN_DIVIDER + avg->period_contrib; } static inline void cfs_se_util_change(struct sched_avg *avg) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a679613a7cb7..e14358178849 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1082,44 +1082,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -static int psi_io_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_IO); -} - -static int psi_memory_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_MEM); -} - -static int psi_cpu_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_CPU); -} - -static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) -{ - if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return single_open(file, psi_show, NULL); -} - -static int psi_io_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_io_show); -} - -static int psi_memory_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_memory_show); -} - -static int psi_cpu_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_cpu_show); -} - struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { @@ -1162,7 +1124,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); mutex_lock(&group->trigger_lock); @@ -1191,15 +1152,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. 
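The membarrier change above adds the renamed MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK to MEMBARRIER_CMD_BITMASK, the mask reported for MEMBARRIER_CMD_QUERY; without it, user space probing for the RSEQ fence commands would wrongly conclude they are unsupported. A sketch of how such a query mask is typically consumed (bit positions here are illustrative, not the real ABI values):

    #include <stdio.h>

    /* Illustrative command bits in the style of MEMBARRIER_CMD_* (the real
     * ABI values live in include/uapi/linux/membarrier.h).                */
    #define CMD_PRIVATE_EXPEDITED               (1 << 3)
    #define CMD_REGISTER_PRIVATE_EXPEDITED      (1 << 4)
    #define CMD_PRIVATE_EXPEDITED_RSEQ          (1 << 7)
    #define CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ (1 << 8)

    /* What the kernel advertises in response to a QUERY. */
    #define CMD_BITMASK_OLD (CMD_PRIVATE_EXPEDITED | CMD_REGISTER_PRIVATE_EXPEDITED)
    #define CMD_BITMASK_NEW (CMD_BITMASK_OLD | CMD_PRIVATE_EXPEDITED_RSEQ | \
                             CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)

    static void probe(const char *label, unsigned int supported)
    {
        if (supported & CMD_PRIVATE_EXPEDITED_RSEQ)
            printf("%s: rseq fence supported\n", label);
        else
            printf("%s: rseq fence NOT advertised, falling back\n", label);
    }

    int main(void)
    {
        probe("old query mask", CMD_BITMASK_OLD);
        probe("new query mask", CMD_BITMASK_NEW);
        return 0;
    }
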
+         */
+        if (!t)
                 return;
 
+        group = t->group;
         /*
          * Wakeup waiters to stop polling. Can happen if cgroup is deleted
          * from under a polling process.
@@ -1235,9 +1200,9 @@ static void psi_trigger_destroy(struct kref *ref)
         mutex_unlock(&group->trigger_lock);
 
         /*
-         * Wait for both *trigger_ptr from psi_trigger_replace and
-         * poll_task RCUs to complete their read-side critical sections
-         * before destroying the trigger and optionally the poll_task
+         * Wait for psi_schedule_poll_work RCU to complete its read-side
+         * critical section before destroying the trigger and optionally the
+         * poll_task.
          */
         synchronize_rcu();
         /*
@@ -1254,18 +1219,6 @@ static void psi_trigger_destroy(struct kref *ref)
         kfree(t);
 }
 
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
-{
-        struct psi_trigger *old = *trigger_ptr;
-
-        if (static_branch_likely(&psi_disabled))
-                return;
-
-        rcu_assign_pointer(*trigger_ptr, new);
-        if (old)
-                kref_put(&old->refcount, psi_trigger_destroy);
-}
-
 __poll_t psi_trigger_poll(void **trigger_ptr,
                                 struct file *file, poll_table *wait)
 {
@@ -1275,27 +1228,57 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
         if (static_branch_likely(&psi_disabled))
                 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
 
-        rcu_read_lock();
-
-        t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
-        if (!t) {
-                rcu_read_unlock();
+        t = smp_load_acquire(trigger_ptr);
+        if (!t)
                 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
-        }
-        kref_get(&t->refcount);
-
-        rcu_read_unlock();
 
         poll_wait(file, &t->event_wait, wait);
 
         if (cmpxchg(&t->event, 1, 0) == 1)
                 ret |= EPOLLPRI;
 
-        kref_put(&t->refcount, psi_trigger_destroy);
-
         return ret;
 }
 
+#ifdef CONFIG_PROC_FS
+static int psi_io_show(struct seq_file *m, void *v)
+{
+        return psi_show(m, &psi_system, PSI_IO);
+}
+
+static int psi_memory_show(struct seq_file *m, void *v)
+{
+        return psi_show(m, &psi_system, PSI_MEM);
+}
+
+static int psi_cpu_show(struct seq_file *m, void *v)
+{
+        return psi_show(m, &psi_system, PSI_CPU);
+}
+
+static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
+{
+        if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+                return -EPERM;
+
+        return single_open(file, psi_show, NULL);
+}
+
+static int psi_io_open(struct inode *inode, struct file *file)
+{
+        return psi_open(file, psi_io_show);
+}
+
+static int psi_memory_open(struct inode *inode, struct file *file)
+{
+        return psi_open(file, psi_memory_show);
+}
+
+static int psi_cpu_open(struct inode *inode, struct file *file)
+{
+        return psi_open(file, psi_cpu_show);
+}
+
 static ssize_t psi_write(struct file *file, const char __user *user_buf,
                          size_t nbytes, enum psi_res res)
 {
@@ -1316,14 +1299,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
 
         buf[buf_size - 1] = '\0';
 
-        new = psi_trigger_create(&psi_system, buf, nbytes, res);
-        if (IS_ERR(new))
-                return PTR_ERR(new);
-
         seq = file->private_data;
+
+        /* Take seq->lock to protect seq->private from concurrent writes */
         mutex_lock(&seq->lock);
-        psi_trigger_replace(&seq->private, new);
+
+        /* Allow only one trigger per file descriptor */
+        if (seq->private) {
+                mutex_unlock(&seq->lock);
+                return -EBUSY;
+        }
+
+        new = psi_trigger_create(&psi_system, buf, nbytes, res);
+        if (IS_ERR(new)) {
+                mutex_unlock(&seq->lock);
+                return PTR_ERR(new);
+        }
+
+        smp_store_release(&seq->private, new);
         mutex_unlock(&seq->lock);
 
         return nbytes;
@@ -1358,7 +1351,7 @@ static int psi_fop_release(struct inode *inode, struct file *file)
 {
         struct seq_file *seq = file->private_data;
 
-        psi_trigger_replace(&seq->private, NULL);
+        psi_trigger_destroy(seq->private);
         return single_release(inode, file);
 }
 
@@ -1400,3 +1393,5 @@ static int __init psi_proc_init(void)
         return 0;
 }
 module_init(psi_proc_init);
+
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4d8f44a17727..db10e73d06e0 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -29,6 +29,9 @@
 #include <linux/syscalls.h>
 #include <linux/sysctl.h>
 
+/* Not exposed in headers: strictly internal use only. */
+#define SECCOMP_MODE_DEAD        (SECCOMP_MODE_FILTER + 1)
+
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
 #include <asm/syscall.h>
 #endif
@@ -1010,6 +1013,7 @@ static void __secure_computing_strict(int this_syscall)
 #ifdef SECCOMP_DEBUG
         dump_stack();
 #endif
+        current->seccomp.mode = SECCOMP_MODE_DEAD;
         seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
         do_exit(SIGKILL);
 }
@@ -1261,6 +1265,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
         case SECCOMP_RET_KILL_THREAD:
         case SECCOMP_RET_KILL_PROCESS:
         default:
+                current->seccomp.mode = SECCOMP_MODE_DEAD;
                 seccomp_log(this_syscall, SIGSYS, action, true);
                 /* Dump core only if this is the last remaining thread. */
                 if (action != SECCOMP_RET_KILL_THREAD ||
@@ -1309,6 +1314,11 @@ int __secure_computing(const struct seccomp_data *sd)
                 return 0;
         case SECCOMP_MODE_FILTER:
                 return __seccomp_filter(this_syscall, sd, false);
+        /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
+        case SECCOMP_MODE_DEAD:
+                WARN_ON_ONCE(1);
+                do_exit(SIGKILL);
+                return -1;
         default:
                 BUG();
         }
diff --git a/kernel/signal.c b/kernel/signal.c
index 38602738866e..9b04631acde8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1342,9 +1342,10 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
         }
         /*
          * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
-         * debugging to leave init killable.
+         * debugging to leave init killable. But HANDLER_EXIT is always fatal.
          */
-        if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
+        if (action->sa.sa_handler == SIG_DFL &&
+            (!t->ptrace || (handler == HANDLER_EXIT)))
                 t->signal->flags &= ~SIGNAL_UNKILLABLE;
         ret = send_signal(sig, info, t, PIDTYPE_PID);
         spin_unlock_irqrestore(&t->sighand->siglock, flags);
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index 66b8af394e58..ddb5a7f48d69 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -70,7 +70,7 @@ late_initcall(stackleak_sysctls_init);
 #define skip_erasing()        false
 #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
 
-asmlinkage void notrace stackleak_erase(void)
+asmlinkage void noinstr stackleak_erase(void)
 {
         /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
         unsigned long kstack_ptr = current->lowest_stack;
@@ -124,9 +124,8 @@ asmlinkage void notrace stackleak_erase(void)
         /* Reset the 'lowest_stack' value for the next syscall */
         current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
 }
-NOKPROBE_SYMBOL(stackleak_erase);
 
-void __used __no_caller_saved_registers notrace stackleak_track_stack(void)
+void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
 {
         unsigned long sp = current_stack_pointer;
diff --git a/kernel/sys.c b/kernel/sys.c
index ecc4cf019242..97dc9e5d6bf9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -472,6 +472,16 @@ static int set_user(struct cred *new)
         if (!new_user)
                 return -EAGAIN;
 
+        free_uid(new->user);
+        new->user = new_user;
+        return 0;
+}
+
+static void flag_nproc_exceeded(struct cred *new)
+{
+        if (new->ucounts == current_ucounts())
+                return;
+
         /*
          * We don't fail in case of NPROC limit excess here because too many
          * poorly written programs don't check set*uid() return code, assuming
@@ -480,15 +490,10 @@ static int set_user(struct cred *new)
          * failure to the execve() stage.
          */
         if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
-                        new_user != INIT_USER &&
-                        !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
+                        new->user != INIT_USER)
                 current->flags |= PF_NPROC_EXCEEDED;
         else
                 current->flags &= ~PF_NPROC_EXCEEDED;
-
-        free_uid(new->user);
-        new->user = new_user;
-        return 0;
 }
 
 /*
@@ -563,6 +568,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
         if (retval < 0)
                 goto error;
 
+        flag_nproc_exceeded(new);
         return commit_creds(new);
 
 error:
@@ -625,6 +631,7 @@ long __sys_setuid(uid_t uid)
         if (retval < 0)
                 goto error;
 
+        flag_nproc_exceeded(new);
         return commit_creds(new);
 
 error:
@@ -704,6 +711,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
         if (retval < 0)
                 goto error;
 
+        flag_nproc_exceeded(new);
         return commit_creds(new);
 
 error:
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 752ed89a293b..a5eb5e7fd624 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -70,10 +70,16 @@ config HAVE_C_RECORDMCOUNT
         help
           C version of recordmcount available?
 
+config HAVE_BUILDTIME_MCOUNT_SORT
+        bool
+        help
+          An architecture selects this if it sorts the mcount_loc section
+          at build time.
+
 config BUILDTIME_MCOUNT_SORT
         bool
         default y
-        depends on BUILDTIME_TABLE_SORT && !S390
+        depends on HAVE_BUILDTIME_MCOUNT_SORT && DYNAMIC_FTRACE
         help
           Sort the mcount_loc section at build time.
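The kernel/sys.c hunks above move the RLIMIT_NPROC bookkeeping out of set_user(): set*uid() itself still succeeds and only raises PF_NPROC_EXCEEDED via flag_nproc_exceeded(), so the failure is expected to surface at a later execve() instead. A minimal userspace sketch of that deferred-failure behaviour (hypothetical demo, not part of the patch; assumes it is run as root, that argv[1] is an unprivileged UID, and that the target user already runs at least one process so the limit is actually exceeded):

/* Hypothetical demo, not from the patch: exercise the deferred RLIMIT_NPROC
 * check.  setuid() should succeed even if the target user is over the limit;
 * the later execl() is expected to fail (typically with EAGAIN). */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        struct rlimit rl = { .rlim_cur = 1, .rlim_max = 1 };
        uid_t uid = (argc > 1) ? (uid_t)atoi(argv[1]) : 65534;

        if (setrlimit(RLIMIT_NPROC, &rl))
                perror("setrlimit");

        if (setuid(uid))                /* NPROC excess is not reported here */
                perror("setuid");

        execl("/bin/true", "true", (char *)NULL);
        fprintf(stderr, "execl: %s\n", strerror(errno));  /* deferred failure */
        return 1;
}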
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a569a0cb81ee..7c2578efde26 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -252,6 +252,10 @@ __setup("trace_clock=", set_trace_boot_clock);
 
 static int __init set_tracepoint_printk(char *str)
 {
+        /* Ignore the "tp_printk_stop_on_boot" param */
+        if (*str == '_')
+                return 0;
+
         if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
                 tracepoint_printk = 1;
         return 1;
@@ -7740,7 +7744,8 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
         err = kzalloc(sizeof(*err), GFP_KERNEL);
         if (!err)
                 err = ERR_PTR(-ENOMEM);
-        tr->n_err_log_entries++;
+        else
+                tr->n_err_log_entries++;
 
         return err;
 }
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5e6a988a8a51..ada87bfb5bb8 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2503,6 +2503,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
                 (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
         expr->fn = hist_field_unary_minus;
         expr->operands[0] = operand1;
+        expr->size = operand1->size;
+        expr->is_signed = operand1->is_signed;
         expr->operator = FIELD_OP_UNARY_MINUS;
         expr->name = expr_str(expr, 0);
         expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
@@ -2719,6 +2721,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
 
         /* The operand sizes should be the same, so just pick one */
         expr->size = operand1->size;
+        expr->is_signed = operand1->is_signed;
 
         expr->operator = field_op;
         expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
@@ -3935,6 +3938,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
 
                 var_ref_idx = find_var_ref_idx(hist_data, var_ref);
                 if (WARN_ON(var_ref_idx < 0)) {
+                        kfree(p);
                         ret = var_ref_idx;
                         goto err;
                 }
@@ -6163,7 +6167,9 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
 
         lockdep_assert_held(&event_mutex);
 
-        if (glob && strlen(glob)) {
+        WARN_ON(!glob);
+
+        if (strlen(glob)) {
                 hist_err_clear();
                 last_cmd_set(file, param);
         }
@@ -6196,7 +6202,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
                         continue;
                 }
                 break;
-        } while (p);
+        } while (1);
 
         if (!p)
                 param = NULL;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 7b32c356ebc5..06ea04d44685 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -190,6 +190,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
                         kfree(new);
                 } else {
                         hlist_add_head(&new->node, hashent);
+                        get_user_ns(new->ns);
                         spin_unlock_irq(&ucounts_lock);
                         return new;
                 }
@@ -210,6 +211,7 @@ void put_ucounts(struct ucounts *ucounts)
         if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
                 hlist_del_init(&ucounts->node);
                 spin_unlock_irqrestore(&ucounts_lock, flags);
+                put_user_ns(ucounts->ns);
                 kfree(ucounts);
         }
 }
@@ -348,7 +350,8 @@ bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsign
         if (rlimit > LONG_MAX)
                 max = LONG_MAX;
         for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-                if (get_ucounts_value(iter, type) > max)
+                long val = get_ucounts_value(iter, type);
+                if (val < 0 || val > max)
                         return true;
                 max = READ_ONCE(iter->ns->ucount_max[type]);
         }
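The last kernel/ucount.c hunk makes is_ucounts_overlimit() treat a negative counter value as exceeding the limit, so a wrapped or mis-accounted count cannot slip past the "> max" comparison. A standalone sketch of the same comparison pattern (illustrative only, not kernel code):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/* Mirrors the clamped check used in is_ucounts_overlimit(): clamp the rlimit
 * to LONG_MAX and report a negative count as "over limit" as well. */
static bool count_exceeds_limit(long count, unsigned long rlimit)
{
        long max = (rlimit > LONG_MAX) ? LONG_MAX : (long)rlimit;

        return count < 0 || count > max;
}

int main(void)
{
        /* A wrapped (negative) counter must be reported as exceeded. */
        printf("%d %d %d\n",
               count_exceeds_limit(10, 4096),    /* 0: within the limit */
               count_exceeds_limit(5000, 4096),  /* 1: over the limit   */
               count_exceeds_limit(-1, 4096));   /* 1: wrapped counter  */
        return 0;
}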