diff options
author | Joel Stanley <joel@jms.id.au> | 2020-08-31 03:46:52 +0300 |
---|---|---|
committer | Joel Stanley <joel@jms.id.au> | 2020-08-31 03:46:57 +0300 |
commit | 0dd0c8c492fa70707ca4f0d36dcb2e3c64105b16 (patch) | |
tree | a420abd8f26264544246602c60d161a7cc4de390 /kernel | |
parent | 31d8605658d37d9197a989838508481d5dc1d8bc (diff) | |
parent | 9ece50d8a470ca7235ffd6ac0f9c5f0f201fe2c8 (diff) | |
download | linux-dev-5.8.tar.xz |
Merge tag 'v5.8.5' into dev-5.8dev-5.8
This is the 5.8.5 stable release
Signed-off-by: Joel Stanley <joel@jms.id.au>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/map_iter.c | 16 | ||||
-rw-r--r-- | kernel/bpf/task_iter.c | 9 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 2 | ||||
-rw-r--r-- | kernel/irq/manage.c | 13 | ||||
-rw-r--r-- | kernel/irq/pm.c | 8 | ||||
-rw-r--r-- | kernel/kprobes.c | 24 | ||||
-rw-r--r-- | kernel/kthread.c | 7 | ||||
-rw-r--r-- | kernel/module.c | 22 | ||||
-rw-r--r-- | kernel/pid.c | 7 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 9 | ||||
-rw-r--r-- | kernel/relay.c | 1 | ||||
-rw-r--r-- | kernel/sched/core.c | 102 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 2 | ||||
-rw-r--r-- | kernel/sched/fair.c | 23 | ||||
-rw-r--r-- | kernel/sched/sched.h | 47 | ||||
-rw-r--r-- | kernel/sched/topology.c | 2 | ||||
-rw-r--r-- | kernel/seccomp.c | 9 | ||||
-rw-r--r-- | kernel/signal.c | 16 | ||||
-rw-r--r-- | kernel/task_work.c | 8 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 22 | ||||
-rw-r--r-- | kernel/trace/blktrace.c | 18 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 18 | ||||
-rw-r--r-- | kernel/trace/trace.c | 12 | ||||
-rw-r--r-- | kernel/trace/trace.h | 9 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_hwlat.c | 5 | ||||
-rw-r--r-- | kernel/watch_queue.c | 8 |
27 files changed, 337 insertions, 86 deletions
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index c69071e334bf..1a04c168563d 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -6,7 +6,7 @@ #include <linux/kernel.h> struct bpf_iter_seq_map_info { - u32 mid; + u32 map_id; }; static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) @@ -14,27 +14,23 @@ static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_map_info *info = seq->private; struct bpf_map *map; - map = bpf_map_get_curr_or_next(&info->mid); + map = bpf_map_get_curr_or_next(&info->map_id); if (!map) return NULL; - ++*pos; + if (*pos == 0) + ++*pos; return map; } static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_map_info *info = seq->private; - struct bpf_map *map; ++*pos; - ++info->mid; + ++info->map_id; bpf_map_put((struct bpf_map *)v); - map = bpf_map_get_curr_or_next(&info->mid); - if (!map) - return NULL; - - return map; + return bpf_map_get_curr_or_next(&info->map_id); } struct bpf_iter__bpf_map { diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 4dbf2b6035f8..a4a0fb4f94cc 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -50,7 +50,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) if (!task) return NULL; - ++*pos; + if (*pos == 0) + ++*pos; return task; } @@ -176,10 +177,11 @@ again: f = fcheck_files(curr_files, curr_fd); if (!f) continue; + if (!get_file_rcu(f)) + continue; /* set info->fd */ info->fd = curr_fd; - get_file(f); rcu_read_unlock(); return f; } @@ -209,7 +211,8 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) return NULL; } - ++*pos; + if (*pos == 0) + ++*pos; info->task = task; info->files = files; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5f8b0c52fd2e..661333c2893d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -205,7 +205,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) munlock_vma_page(old_page); put_page(old_page); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2a9fec53e159..e68a8f993106 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -320,12 +320,16 @@ static bool irq_set_affinity_deactivated(struct irq_data *data, struct irq_desc *desc = irq_data_to_desc(data); /* + * Handle irq chips which can handle affinity only in activated + * state correctly + * * If the interrupt is not yet activated, just store the affinity * mask and do not call the chip driver at all. On activation the * driver has to make sure anyway that the interrupt is in a * useable state so startup works. */ - if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data)) + if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || + irqd_is_activated(data) || !irqd_affinity_on_activate(data)) return false; cpumask_copy(desc->irq_common_data.affinity, mask); @@ -2731,8 +2735,10 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); - if (WARN_ON_ONCE(!chip)) - return -ENODEV; + if (WARN_ON_ONCE(!chip)) { + err = -ENODEV; + goto out_unlock; + } if (chip->irq_set_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -2745,6 +2751,7 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, if (data) err = chip->irq_set_irqchip_state(data, which, val); +out_unlock: irq_put_desc_busunlock(desc, flags); return err; } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 8f557fa1f4fe..c6c7e187ae74 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -185,14 +185,18 @@ void rearm_wake_irq(unsigned int irq) unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - if (!desc || !(desc->istate & IRQS_SUSPENDED) || - !irqd_is_wakeup_set(&desc->irq_data)) + if (!desc) return; + if (!(desc->istate & IRQS_SUSPENDED) || + !irqd_is_wakeup_set(&desc->irq_data)) + goto unlock; + desc->istate &= ~IRQS_SUSPENDED; irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); __enable_irq(desc); +unlock: irq_put_desc_busunlock(desc, flags); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2e97febeef77..72af5d37e9ff 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1079,9 +1079,20 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } #else /* !CONFIG_KPROBES_ON_FTRACE */ -#define prepare_kprobe(p) arch_prepare_kprobe(p) -#define arm_kprobe_ftrace(p) (-ENODEV) -#define disarm_kprobe_ftrace(p) (-ENODEV) +static inline int prepare_kprobe(struct kprobe *p) +{ + return arch_prepare_kprobe(p); +} + +static inline int arm_kprobe_ftrace(struct kprobe *p) +{ + return -ENODEV; +} + +static inline int disarm_kprobe_ftrace(struct kprobe *p) +{ + return -ENODEV; +} #endif /* Arm a kprobe with text_mutex */ @@ -2113,6 +2124,13 @@ static void kill_kprobe(struct kprobe *p) * the original probed function (which will be freed soon) any more. */ arch_remove_kprobe(p); + + /* + * The module is going away. We should disarm the kprobe which + * is using ftrace. + */ + if (kprobe_ftrace(p)) + disarm_kprobe_ftrace(p); } /* Disable one kprobe */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 132f84a5fde3..f481ab35de2f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1239,13 +1239,16 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(tsk->mm); task_lock(tsk); + /* Hold off tlb flush IPIs while switching mm's */ + local_irq_disable(); active_mm = tsk->active_mm; if (active_mm != mm) { mmgrab(mm); tsk->active_mm = mm; } tsk->mm = mm; - switch_mm(active_mm, mm, tsk); + switch_mm_irqs_off(active_mm, mm, tsk); + local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); @@ -1274,9 +1277,11 @@ void kthread_unuse_mm(struct mm_struct *mm) task_lock(tsk); sync_mm_rss(mm); + local_irq_disable(); tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); + local_irq_enable(); task_unlock(tsk); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); diff --git a/kernel/module.c b/kernel/module.c index aa183c9ac0a2..08c46084d8cc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1520,18 +1520,34 @@ struct module_sect_attrs { struct module_sect_attr attrs[]; }; +#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) static ssize_t module_sect_read(struct file *file, struct kobject *kobj, struct bin_attribute *battr, char *buf, loff_t pos, size_t count) { struct module_sect_attr *sattr = container_of(battr, struct module_sect_attr, battr); + char bounce[MODULE_SECT_READ_SIZE + 1]; + size_t wrote; if (pos != 0) return -EINVAL; - return sprintf(buf, "0x%px\n", - kallsyms_show_value(file->f_cred) ? (void *)sattr->address : NULL); + /* + * Since we're a binary read handler, we must account for the + * trailing NUL byte that sprintf will write: if "buf" is + * too small to hold the NUL, or the NUL is exactly the last + * byte, the read will look like it got truncated by one byte. + * Since there is no way to ask sprintf nicely to not write + * the NUL, we have to use a bounce buffer. + */ + wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", + kallsyms_show_value(file->f_cred) + ? (void *)sattr->address : NULL); + count = min(count, wrote); + memcpy(buf, bounce, count); + + return count; } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -1580,7 +1596,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) goto out; sect_attrs->nsections++; sattr->battr.read = module_sect_read; - sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4); + sattr->battr.size = MODULE_SECT_READ_SIZE; sattr->battr.attr.mode = 0400; *(gattr++) = &(sattr++)->battr; } diff --git a/kernel/pid.c b/kernel/pid.c index f1496b757162..ee58530d1aca 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> +#include <net/sock.h> struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -642,10 +643,12 @@ static int pidfd_getfd(struct pid *pid, int fd) } ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) + if (ret < 0) { fput(file); - else + } else { + __receive_sock(file); fd_install(ret, file); + } return ret; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c6569e0586c..1e9e500ff790 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3105,7 +3105,7 @@ static void kfree_rcu_work(struct work_struct *work) static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; - bool queued = false; + bool repeat = false; int i; lockdep_assert_held(&krcp->lock); @@ -3143,11 +3143,14 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) * been detached following each other, one by one. */ queue_rcu_work(system_wq, &krwp->rcu_work); - queued = true; } + + /* Repeat if any "free" corresponding channel is still busy. */ + if (krcp->bhead || krcp->head) + repeat = true; } - return queued; + return !repeat; } static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, diff --git a/kernel/relay.c b/kernel/relay.c index 72fe443ea78f..fb4e0c530c08 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -197,6 +197,7 @@ free_buf: static void relay_destroy_channel(struct kref *kref) { struct rchan *chan = container_of(kref, struct rchan, kref); + free_percpu(chan->buf); kfree(chan); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2142c6767682..f788cd61df21 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -794,6 +794,26 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; +/* + * This static key is used to reduce the uclamp overhead in the fast path. It + * primarily disables the call to uclamp_rq_{inc, dec}() in + * enqueue/dequeue_task(). + * + * This allows users to continue to enable uclamp in their kernel config with + * minimum uclamp overhead in the fast path. + * + * As soon as userspace modifies any of the uclamp knobs, the static key is + * enabled, since we have an actual users that make use of uclamp + * functionality. + * + * The knobs that would enable this static key are: + * + * * A task modifying its uclamp value with sched_setattr(). + * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs. + * * An admin modifying the cgroup cpu.uclamp.{min, max} + */ +DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); + /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -990,10 +1010,38 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, lockdep_assert_held(&rq->lock); + /* + * If sched_uclamp_used was enabled after task @p was enqueued, + * we could end up with unbalanced call to uclamp_rq_dec_id(). + * + * In this case the uc_se->active flag should be false since no uclamp + * accounting was performed at enqueue time and we can just return + * here. + * + * Need to be careful of the following enqeueue/dequeue ordering + * problem too + * + * enqueue(taskA) + * // sched_uclamp_used gets enabled + * enqueue(taskB) + * dequeue(taskA) + * // Must not decrement bukcet->tasks here + * dequeue(taskB) + * + * where we could end up with stale data in uc_se and + * bucket[uc_se->bucket_id]. + * + * The following check here eliminates the possibility of such race. + */ + if (unlikely(!uc_se->active)) + return; + bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* @@ -1021,6 +1069,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1036,6 +1093,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1144,8 +1210,10 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, update_root_tg = true; } - if (update_root_tg) + if (update_root_tg) { + static_branch_enable(&sched_uclamp_used); uclamp_update_root_tg(); + } /* * We update all RUNNABLE tasks only when task groups are in use. @@ -1180,6 +1248,15 @@ static int uclamp_validate(struct task_struct *p, if (upper_bound > SCHED_CAPACITY_SCALE) return -EINVAL; + /* + * We have valid uclamp attributes; make sure uclamp is enabled. + * + * We need to do that here, because enabling static branches is a + * blocking operation which obviously cannot be done while holding + * scheduler locks. + */ + static_branch_enable(&sched_uclamp_used); + return 0; } @@ -1237,6 +1314,20 @@ static void uclamp_fork(struct task_struct *p) } } +static void __init init_uclamp_rq(struct rq *rq) +{ + enum uclamp_id clamp_id; + struct uclamp_rq *uc_rq = rq->uclamp; + + for_each_clamp_id(clamp_id) { + uc_rq[clamp_id] = (struct uclamp_rq) { + .value = uclamp_none(clamp_id) + }; + } + + rq->uclamp_flags = 0; +} + static void __init init_uclamp(void) { struct uclamp_se uc_max = {}; @@ -1245,11 +1336,8 @@ static void __init init_uclamp(void) mutex_init(&uclamp_mutex); - for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, - sizeof(struct uclamp_rq)*UCLAMP_CNT); - cpu_rq(cpu)->uclamp_flags = 0; - } + for_each_possible_cpu(cpu) + init_uclamp_rq(cpu_rq(cpu)); for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], @@ -7431,6 +7519,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; + static_branch_enable(&sched_uclamp_used); + mutex_lock(&uclamp_mutex); rcu_read_lock(); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7fbaee24c824..dc6835bc6490 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -210,7 +210,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, unsigned long dl_util, util, irq; struct rq *rq = cpu_rq(cpu); - if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && + if (!uclamp_is_used() && type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { return max; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04fa8dbcfa4d..6b3b59cc51d6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10027,7 +10027,12 @@ static void kick_ilb(unsigned int flags) { int ilb_cpu; - nohz.next_balance++; + /* + * Increase nohz.next_balance only when if full ilb is triggered but + * not if we only update stats. + */ + if (flags & NOHZ_BALANCE_KICK) + nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); @@ -10348,6 +10353,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, } } + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; + /* Newly idle CPU doesn't need an update */ if (idle != CPU_NEWLY_IDLE) { update_blocked_averages(this_cpu); @@ -10368,14 +10381,6 @@ abort: if (has_blocked_load) WRITE_ONCE(nohz.has_blocked, 1); - /* - * next_balance will be updated only when there is a need. - * When the CPU is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) - nohz.next_balance = next_balance; - return ret; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 877fb08eb1b0..c82857e2e288 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -862,6 +862,8 @@ struct uclamp_rq { unsigned int value; struct uclamp_bucket bucket[UCLAMP_BUCKETS]; }; + +DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ /* @@ -2349,12 +2351,35 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +/** + * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ static __always_inline unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, struct task_struct *p) { - unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + unsigned long min_util; + unsigned long max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); if (p) { min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); @@ -2371,6 +2396,19 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, return clamp(util, min_util, max_util); } + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, @@ -2378,6 +2416,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, { return util; } + +static inline bool uclamp_is_used(void) +{ + return false; +} #endif /* CONFIG_UCLAMP_TASK */ #ifdef arch_scale_freq_capacity diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ba81187bb7af..9079d865a935 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1328,7 +1328,7 @@ sd_init(struct sched_domain_topology_level *tl, sd_flags = (*tl->sd_flags)(); if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; + sd_flags &= TOPOLOGY_SD_FLAGS; /* Apply detected topology flags */ sd_flags |= dflags; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d653d8426de9..c461ba992513 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -42,6 +42,14 @@ #include <linux/uaccess.h> #include <linux/anon_inodes.h> +/* + * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the + * wrong direction flag in the ioctl number. This is the broken one, + * which the kernel needs to keep supporting until all userspaces stop + * using the wrong command number. + */ +#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64) + enum notify_state { SECCOMP_NOTIFY_INIT, SECCOMP_NOTIFY_SENT, @@ -1186,6 +1194,7 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, return seccomp_notify_recv(filter, buf); case SECCOMP_IOCTL_NOTIF_SEND: return seccomp_notify_send(filter, buf); + case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); default: diff --git a/kernel/signal.c b/kernel/signal.c index 6f16f7c5d375..42b67d2cea37 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2541,7 +2541,21 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); - current->jobctl &= ~JOBCTL_TASK_WORK; + /* + * Make sure we can safely read ->jobctl() in task_work add. As Oleg + * states: + * + * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we + * roughly have + * + * task_work_add: get_signal: + * STORE(task->task_works, new_work); STORE(task->jobctl); + * mb(); mb(); + * LOAD(task->jobctl); LOAD(task->task_works); + * + * and we can rely on STORE-MB-LOAD [ in task_work_add]. + */ + smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK); if (unlikely(current->task_works)) { spin_unlock_irq(&sighand->siglock); task_work_run(); diff --git a/kernel/task_work.c b/kernel/task_work.c index 5c0848ca1287..613b2d634af8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -42,7 +42,13 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) set_notify_resume(task); break; case TWA_SIGNAL: - if (lock_task_sighand(task, &flags)) { + /* + * Only grab the sighand lock if we don't already have some + * task_work pending. This pairs with the smp_store_mb() + * in get_signal(), see comment there. + */ + if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && + lock_task_sighand(task, &flags)) { task->jobctl |= JOBCTL_TASK_WORK; signal_wake_up(task, 0); unlock_task_sighand(task, &flags); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e2dc9b8858c..f0199a4ba1ad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* - * Set a per-task tick dependency. Posix CPU timers need this in order to elapse - * per task timers. + * Set a per-task tick dependency. RCU need this. Also posix CPU timers + * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { - /* - * We could optimize this with just kicking the target running the task - * if that noise matters for nohz full users. - */ - tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); + if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) { + if (tsk == current) { + preempt_disable(); + tick_nohz_full_kick(); + preempt_enable(); + } else { + /* + * Some future tick_nohz_full_kick_task() + * should optimize this. + */ + tick_nohz_full_kick_all(); + } + } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5ef0484513ec..588e8e396019 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -522,10 +522,18 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->msg_data) goto err; - ret = -ENOENT; - - dir = debugfs_lookup(buts->name, blk_debugfs_root); - if (!dir) +#ifdef CONFIG_BLK_DEBUG_FS + /* + * When tracing whole make_request drivers (multiqueue) block devices, + * reuse the existing debugfs directory created by the block layer on + * init. For request-based block devices, all partitions block devices, + * and scsi-generic block devices we create a temporary new debugfs + * directory that will be removed once the trace ends. + */ + if (queue_is_mq(q) && bdev && bdev == bdev->bd_contains) + dir = q->debugfs_dir; + else +#endif bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); bt->dev = dev; @@ -563,8 +571,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = 0; err: - if (dir && !bt->dir) - dput(dir); if (ret) blk_trace_free(bt); return ret; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1903b80db6eb..b5cb5be3ca6f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -139,9 +139,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) #endif } -#define FTRACE_PID_IGNORE -1 -#define FTRACE_PID_TRACE -2 - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { @@ -6190,8 +6187,11 @@ static int referenced_filters(struct dyn_ftrace *rec) int cnt = 0; for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) - cnt++; + if (ops_references_rec(ops, rec)) { + cnt++; + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; + } } return cnt; @@ -6370,8 +6370,8 @@ void ftrace_module_enable(struct module *mod) if (ftrace_start_up) cnt += referenced_filters(rec); - /* This clears FTRACE_FL_DISABLED */ - rec->flags = cnt; + rec->flags &= ~FTRACE_FL_DISABLED; + rec->flags += cnt; if (ftrace_start_up && cnt) { int failed = __ftrace_replace_code(rec, 1); @@ -6969,12 +6969,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) if (enable) { register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, tr); - register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit, tr); } else { unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, tr); - unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit, tr); } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb62269724d5..6fc6da55b94e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5887,7 +5887,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) } /* If trace pipe files are being read, we can't change the tracer */ - if (tr->current_trace->ref) { + if (tr->trace_ref) { ret = -EBUSY; goto out; } @@ -6103,7 +6103,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) nonseekable_open(inode, filp); - tr->current_trace->ref++; + tr->trace_ref++; out: mutex_unlock(&trace_types_lock); return ret; @@ -6122,7 +6122,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - tr->current_trace->ref--; + tr->trace_ref--; if (iter->trace->pipe_close) iter->trace->pipe_close(iter); @@ -7424,7 +7424,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) filp->private_data = info; - tr->current_trace->ref++; + tr->trace_ref++; mutex_unlock(&trace_types_lock); @@ -7525,7 +7525,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - iter->tr->current_trace->ref--; + iter->tr->trace_ref--; __trace_array_put(iter->tr); @@ -8733,7 +8733,7 @@ static int __remove_instance(struct trace_array *tr) int i; /* Reference counter for a newly created trace array = 1. */ - if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref)) + if (tr->ref > 1 || (tr->current_trace && tr->trace_ref)) return -EBUSY; list_del(&tr->list); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 13db4000af3f..610d21355526 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -356,6 +356,7 @@ struct trace_array { struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; + int trace_ref; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; @@ -547,7 +548,6 @@ struct tracer { struct tracer *next; struct tracer_flags *flags; int enabled; - int ref; bool print_max; bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE @@ -1103,6 +1103,10 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER + +#define FTRACE_PID_IGNORE -1 +#define FTRACE_PID_TRACE -2 + struct ftrace_func_command { struct list_head list; char *name; @@ -1114,7 +1118,8 @@ struct ftrace_func_command { extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { - return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid); + return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) != + FTRACE_PID_IGNORE; } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f6f55682d3e2..a85effb2373b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -538,12 +538,12 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable) if (enable) { register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, tr, INT_MIN); - register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit, + register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit, tr, INT_MAX); } else { unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, tr); - unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit, + unregister_trace_sched_process_free(event_filter_pid_sched_process_exit, tr); } } diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index e2be7bb7ef7e..17e1e49e5b93 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -283,6 +283,7 @@ static bool disable_migrate; static void move_to_next_cpu(void) { struct cpumask *current_mask = &save_cpumask; + struct trace_array *tr = hwlat_trace; int next_cpu; if (disable_migrate) @@ -296,7 +297,7 @@ static void move_to_next_cpu(void) goto disable; get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); next_cpu = cpumask_next(smp_processor_id(), current_mask); put_online_cpus(); @@ -373,7 +374,7 @@ static int start_kthread(struct trace_array *tr) /* Just pick the first CPU on first iteration */ current_mask = &save_cpumask; get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); put_online_cpus(); next_cpu = cpumask_first(current_mask); diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index f74020f6bd9d..0ef8f65bd2d7 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -393,6 +393,7 @@ static void free_watch(struct rcu_head *rcu) struct watch *watch = container_of(rcu, struct watch, rcu); put_watch_queue(rcu_access_pointer(watch->queue)); + atomic_dec(&watch->cred->user->nr_watches); put_cred(watch->cred); } @@ -452,6 +453,13 @@ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) watch->cred = get_current_cred(); rcu_assign_pointer(watch->watch_list, wlist); + if (atomic_inc_return(&watch->cred->user->nr_watches) > + task_rlimit(current, RLIMIT_NOFILE)) { + atomic_dec(&watch->cred->user->nr_watches); + put_cred(watch->cred); + return -EAGAIN; + } + spin_lock_bh(&wqueue->lock); kref_get(&wqueue->usage); kref_get(&watch->usage); |