From 97c79a38cd454602645f0470ffb444b3b75ce574 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 28 Apr 2016 13:16:33 -0300 Subject: perf core: Per event callchain limit Additionally to being able to control the system wide maximum depth via /proc/sys/kernel/perf_event_max_stack, now we are able to ask for different depths per event, using perf_event_attr.sample_max_stack for that. This uses an u16 hole at the end of perf_event_attr, that, when perf_event_attr.sample_type has the PERF_SAMPLE_CALLCHAIN, if sample_max_stack is zero, means use perf_event_max_stack, otherwise it'll be bounds checked under callchain_mutex. Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: David Ahern Cc: Frederic Weisbecker Cc: He Kuang Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: Zefan Li Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/bpf/stackmap.c | 2 +- kernel/events/callchain.c | 14 ++++++++++++-- kernel/events/core.c | 5 ++++- 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a82d7605db3f..f1de5c1a2af6 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (err) goto free_smap; - err = get_callchain_buffers(); + err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) goto free_smap; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 179ef4640964..e9fdb5203de5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -104,7 +104,7 @@ fail: return -ENOMEM; } -int get_callchain_buffers(void) +int get_callchain_buffers(int event_max_stack) { int err = 0; int count; @@ -121,6 +121,15 @@ int 
get_callchain_buffers(void) /* If the allocation failed, give up */ if (!callchain_cpus_entries) err = -ENOMEM; + /* + * If requesting per event more than the global cap, + * return a different error to help userspace figure + * this out. + * + * And also do it here so that we have &callchain_mutex held. + */ + if (event_max_stack > sysctl_perf_event_max_stack) + err = -EOVERFLOW; goto exit; } @@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) bool user = !event->attr.exclude_callchain_user; /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; if (!kernel && !user) return NULL; - return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); + return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); } struct perf_callchain_entry * diff --git a/kernel/events/core.c b/kernel/events/core.c index 050a290c72c7..79363f298445 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8843,7 +8843,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { - err = get_callchain_buffers(); + err = get_callchain_buffers(attr->sample_max_stack); if (err) goto err_addr_filters; } @@ -9165,6 +9165,9 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + if (!attr.sample_max_stack) + attr.sample_max_stack = sysctl_perf_event_max_stack; + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. 
The cpu argument -- cgit v1.2.3 From 1a99ae3f00d3c7c7885ee529ac9a874b19caa0cf Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Tue, 10 May 2016 21:03:18 +0800 Subject: sched/fair: Fix the wrong throttled clock time for cfs_rq_clock_task() Two minor fixes for cfs_rq_clock_task(): 1) If cfs_rq is currently being throttled, we need to subtract the cfs throttled clock time. 2) Make "throttled_clock_task_time" update SMP unrelated. Now UP cases need it as well. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1462885398-14724-1-git-send-email-xlpang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..1e87bb633d43 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3688,7 +3688,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task; + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; } @@ -3826,13 +3826,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; cfs_rq->throttle_count--; -#ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; } -#endif return 0; } -- cgit v1.2.3 From df55f462b905f3b2d40ec3fb865891382a6ebfb1 Mon Sep 17 00:00:00 2001 From: "Gaurav Jindal (Gaurav Jindal)" Date: Thu, 12 May 2016 10:13:33 +0000 Subject: sched/idle: Optimize the generic idle loop Currently, smp_processor_id() is used to fetch the 
current CPU in cpu_idle_loop(). Every time the idle thread runs, it fetches the current CPU using smp_processor_id(). Since the idle thread is per CPU, the current CPU is constant, so we can lift the load out of the loop, saving execution cycles/time in the loop. x86-64: Before patch (execution in loop): 148: 0f ae e8 lfence 14b: 65 8b 04 25 00 00 00 00 mov %gs:0x0,%eax 152: 00 153: 89 c0 mov %eax,%eax 155: 49 0f a3 04 24 bt %rax,(%r12) After patch (execution in loop): 150: 0f ae e8 lfence 153: 4d 0f a3 34 24 bt %r14,(%r12) ARM64: Before patch (execution in loop): 168: d5033d9f dsb ld 16c: b9405661 ldr w1,[x19,#84] 170: 1100fc20 add w0,w1,#0x3f 174: 6b1f003f cmp w1,wzr 178: 1a81b000 csel w0,w0,w1,lt 17c: 130c7000 asr w0,w0,#6 180: 937d7c00 sbfiz x0,x0,#3,#32 184: f8606aa0 ldr x0,[x21,x0] 188: 9ac12401 lsr x1,x0,x1 18c: 36000e61 tbz w1,#0,358 After patch (execution in loop): 1a8: d50339df dsb ld 1ac: f8776ac0 ldr x0,[x22,x23] 1b0: ea18001f tst x0,x24 1b4: 54000ea0 b.eq 388 Further observation on ARM64 for 4 seconds shows that cpu_idle_loop is called 8672 times. Shifting the code will save instructions executed in the loop and ultimately time as well. 
Signed-off-by: Gaurav Jindal Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sanjeev Yadav Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160512101330.GA488@gauravjindalubtnb.del.spreadtrum.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index bd12c6c714ec..db4ff7c100b9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -201,6 +201,8 @@ exit_idle: */ static void cpu_idle_loop(void) { + int cpu = smp_processor_id(); + while (1) { /* * If the arch has a polling bit, we maintain an invariant: @@ -219,7 +221,7 @@ static void cpu_idle_loop(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) { + if (cpu_is_offline(cpu)) { cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } -- cgit v1.2.3 From 150593bf869393d10a79f6bd3df2585ecc20a9bb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 18 May 2016 19:02:18 +0200 Subject: sched/api: Introduce task_rcu_dereference() and try_get_task_struct() Generally task_struct is only protected by RCU if it was found on a RCU protected list (say, for_each_process() or find_task_by_vpid()). As Kirill pointed out rq->curr isn't protected by RCU, the scheduler drops the (potentially) last reference without RCU gp, this means that we need to fix the code which uses foreign_rq->curr under rcu_read_lock(). Add a new helper which can be used to dereference rq->curr or any other pointer to task_struct assuming that it should be cleared or updated before the final put_task_struct(). It returns non-NULL only if this task can't go away before rcu_read_unlock(). ( Also add try_get_task_struct() to make it easier to use this API correctly. 
) Suggested-by: Kirill Tkhai Signed-off-by: Oleg Nesterov [ Updated comments; added try_get_task_struct()] Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Link: http://lkml.kernel.org/r/20160518170218.GY3192@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++ kernel/exit.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e42ada26345..dee41bf59e6b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2139,6 +2139,9 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } +struct task_struct *task_rcu_dereference(struct task_struct **ptask); +struct task_struct *try_get_task_struct(struct task_struct **ptask); + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime); diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..2fb4d44c51b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -210,6 +210,82 @@ repeat: goto repeat; } +/* + * Note that if this function returns a valid task_struct pointer (!NULL) + * task->usage must remain >0 for the duration of the RCU critical section. + */ +struct task_struct *task_rcu_dereference(struct task_struct **ptask) +{ + struct sighand_struct *sighand; + struct task_struct *task; + + /* + * We need to verify that release_task() was not called and thus + * delayed_put_task_struct() can't run and drop the last reference + * before rcu_read_unlock(). We check task->sighand != NULL, + * but we can read the already freed and reused memory. 
+ */ +retry: + task = rcu_dereference(*ptask); + if (!task) + return NULL; + + probe_kernel_address(&task->sighand, sighand); + + /* + * Pairs with atomic_dec_and_test() in put_task_struct(). If this task + * was already freed we can not miss the preceding update of this + * pointer. + */ + smp_rmb(); + if (unlikely(task != READ_ONCE(*ptask))) + goto retry; + + /* + * We've re-checked that "task == *ptask", now we have two different + * cases: + * + * 1. This is actually the same task/task_struct. In this case + * sighand != NULL tells us it is still alive. + * + * 2. This is another task which got the same memory for task_struct. + * We can't know this of course, and we can not trust + * sighand != NULL. + * + * In this case we actually return a random value, but this is + * correct. + * + * If we return NULL - we can pretend that we actually noticed that + * *ptask was updated when the previous task has exited. Or pretend + * that probe_slab_address(&sighand) reads NULL. + * + * If we return the new task (because sighand is not NULL for any + * reason) - this is fine too. This (new) task can't go away before + * another gp pass. + * + * And note: We could even eliminate the false positive if re-read + * task->sighand once again to avoid the falsely NULL. But this case + * is very unlikely so we don't care. + */ + if (!sighand) + return NULL; + + return task; +} + +struct task_struct *try_get_task_struct(struct task_struct **ptask) +{ + struct task_struct *task; + + rcu_read_lock(); + task = task_rcu_dereference(ptask); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + return task; +} + /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. 
Orphaned process groups are not to be affected -- cgit v1.2.3 From bac7857319bcf7fed329a10bb760053e761115c0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 18 May 2016 21:57:33 +0200 Subject: sched/fair: Use task_rcu_dereference() Simplify task_numa_compare()'s task reference magic by using task_rcu_dereference(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Kirill Tkhai Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Link: http://lkml.kernel.org/r/20160518195733.GA15914@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e87bb633d43..c6dd8bab010c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1305,6 +1305,8 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); + if (p) + get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1372,31 +1374,11 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; - bool assigned = false; rcu_read_lock(); - - raw_spin_lock_irq(&dst_rq->lock); - cur = dst_rq->curr; - /* - * No need to move the exiting task or idle task. 
- */ - if ((cur->flags & PF_EXITING) || is_idle_task(cur)) + cur = task_rcu_dereference(&dst_rq->curr); + if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; - else { - /* - * The task_struct must be protected here to protect the - * p->numa_faults access in the task_weight since the - * numa_faults could already be freed in the following path: - * finish_task_switch() - * --> put_task_struct() - * --> __put_task_struct() - * --> task_numa_free() - */ - get_task_struct(cur); - } - - raw_spin_unlock_irq(&dst_rq->lock); /* * Because we have preemption enabled we can get migrated around and @@ -1479,7 +1461,6 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; - put_task_struct(cur); cur = NULL; goto assign; } @@ -1505,16 +1486,9 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: - assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); - /* - * The dst_rq->curr isn't assigned. The protection for task_struct is - * finished. - */ - if (cur && !assigned) - put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, -- cgit v1.2.3 From f2fb6bef92514432398a653df1c2f1041d79ac46 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 23 Mar 2016 11:24:37 -0700 Subject: perf/core: Optimize side-band event delivery The perf_event_aux() function iterates all PMUs and all events in their respective per-CPU contexts to find the events to deliver side-band records to. For example, the brk test case in lkp triggers many mmap() operations, which, if we're also running perf, results in many perf_event_aux() invocations. If we enable uncore PMU support (even when uncore events are not used), dozens of uncore PMUs will be iterated, which can significantly decrease brk_test's throughput. For example, the brk throughput: without uncore PMUs: 2647573 ops_per_sec with uncore PMUs: 1768444 ops_per_sec ... a 33% reduction. 
To get at the per-CPU events that need side-band records, this patch puts these events on a per-CPU list, this avoids iterating the PMUs and any events that do not need side-band records. Per task events are unchanged to avoid extra overhead on the context switch paths. Suggested-by: Peter Zijlstra (Intel) Reported-by: Huang, Ying Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1458757477-3781-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 ++++ kernel/events/core.c | 85 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 79 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0e43355c7aad..92e9ce737432 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -517,6 +517,11 @@ struct swevent_hlist { struct perf_cgroup; struct ring_buffer; +struct pmu_event_list { + raw_spinlock_t lock; + struct list_head list; +}; + /** * struct perf_event - performance event kernel representation: */ @@ -675,6 +680,7 @@ struct perf_event { int cgrp_defer_enabled; #endif + struct list_head sb_list; #endif /* CONFIG_PERF_EVENTS */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 79363f298445..6615c8922ee3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -335,6 +335,7 @@ static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); +static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -3665,6 +3666,26 @@ static void free_event_rcu(struct rcu_head *head) static void ring_buffer_attach(struct perf_event *event, struct 
ring_buffer *rb); +static void detach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_del_rcu(&event->sb_list); + raw_spin_unlock(&pel->lock); +} + +static void unaccount_pmu_sb_event(struct perf_event *event) +{ + if (event->parent) + return; + + if (event->attach_state & PERF_ATTACH_TASK) + return; + + detach_sb_event(event); +} + static void unaccount_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -3728,6 +3749,8 @@ static void unaccount_event(struct perf_event *event) } unaccount_event_cpu(event, event->cpu); + + unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) @@ -5888,13 +5911,25 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, rcu_read_unlock(); } +static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data) +{ + struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); + struct perf_event *event; + + list_for_each_entry_rcu(event, &pel->list, sb_list) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + output(event, data); + } +} + static void perf_event_aux(perf_event_aux_output_cb output, void *data, struct perf_event_context *task_ctx) { - struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; int ctxn; /* @@ -5909,20 +5944,15 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, } rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data, false); - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; + preempt_disable(); + perf_event_sb_iterate(output, data); + + for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) perf_event_aux_ctx(ctx, output, data, false); -next: - 
put_cpu_ptr(pmu->pmu_cpu_context); } + preempt_enable(); rcu_read_unlock(); } @@ -8615,6 +8645,32 @@ unlock: return pmu; } +static void attach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_add_rcu(&event->sb_list, &pel->list); + raw_spin_unlock(&pel->lock); +} + +static void account_pmu_sb_event(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + + if (event->parent) + return; + + if (event->attach_state & PERF_ATTACH_TASK) + return; + + if (attr->mmap || attr->mmap_data || attr->mmap2 || + attr->comm || attr->comm_exec || + attr->task || + attr->context_switch) + attach_sb_event(event); +} + static void account_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -8695,6 +8751,8 @@ static void account_event(struct perf_event *event) enabled: account_event_cpu(event, event->cpu); + + account_pmu_sb_event(event); } /* @@ -10203,6 +10261,9 @@ static void __init perf_event_init_all_cpus(void) swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); + + INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); + raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); } } -- cgit v1.2.3 From aab5b71ef2b5c62323b9abe397e2db57b18e1f78 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 May 2016 17:26:46 +0200 Subject: perf/core: Rename the perf_event_aux*() APIs to perf_event_sb*(), to separate them from AUX ring-buffer records There are now two different things called AUX in perf, the infrastructure to deliver the mmap/comm/task records and the AUX part in the mmap buffer (with associated AUX_RECORD). Since the former is internal, rename it to side-band to reduce the confusion factor. No change in functionality. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 69 +++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6615c8922ee3..f54454ea5f31 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5879,11 +5879,11 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } -typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); +typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void -perf_event_aux_ctx(struct perf_event_context *ctx, - perf_event_aux_output_cb output, +perf_iterate_ctx(struct perf_event_context *ctx, + perf_iterate_f output, void *data, bool all) { struct perf_event *event; @@ -5900,18 +5900,7 @@ perf_event_aux_ctx(struct perf_event_context *ctx, } } -static void -perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, - struct perf_event_context *task_ctx) -{ - rcu_read_lock(); - preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data, false); - preempt_enable(); - rcu_read_unlock(); -} - -static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data) +static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) { struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); struct perf_event *event; @@ -5925,33 +5914,40 @@ static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data) } } +/* + * Iterate all events that need to receive side-band events. + * + * For new callers; ensure that account_pmu_sb_event() includes + * your event, otherwise it might not get delivered. 
+ */ static void -perf_event_aux(perf_event_aux_output_cb output, void *data, +perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; int ctxn; + rcu_read_lock(); + preempt_disable(); + /* - * If we have task_ctx != NULL we only notify - * the task context itself. The task_ctx is set - * only for EXIT events before releasing task + * If we have task_ctx != NULL we only notify the task context itself. + * The task_ctx is set only for EXIT events before releasing task * context. */ if (task_ctx) { - perf_event_aux_task_ctx(output, data, task_ctx); - return; + perf_iterate_ctx(task_ctx, output, data, false); + goto done; } - rcu_read_lock(); - preempt_disable(); - perf_event_sb_iterate(output, data); + perf_iterate_sb_cpu(output, data); for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) - perf_event_aux_ctx(ctx, output, data, false); + perf_iterate_ctx(ctx, output, data, false); } +done: preempt_enable(); rcu_read_unlock(); } @@ -6001,7 +5997,7 @@ void perf_event_exec(void) perf_event_enable_on_exec(ctxn); - perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); } rcu_read_unlock(); @@ -6045,9 +6041,9 @@ static int __perf_pmu_output_stop(void *info) }; rcu_read_lock(); - perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); + perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); if (cpuctx->task_ctx) - perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, + perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, &ro, false); rcu_read_unlock(); @@ -6176,7 +6172,7 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_aux(perf_event_task_output, + perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } @@ -6255,7 +6251,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) 
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - perf_event_aux(perf_event_comm_output, + perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } @@ -6486,7 +6482,7 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - perf_event_aux(perf_event_mmap_output, + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -6569,7 +6565,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) if (!ctx) continue; - perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); + perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); } rcu_read_unlock(); } @@ -6756,7 +6752,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - perf_event_aux(perf_event_switch_output, + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } @@ -8654,6 +8650,13 @@ static void attach_sb_event(struct perf_event *event) raw_spin_unlock(&pel->lock); } +/* + * We keep a list of all !task (and therefore per-cpu) events + * that need to receive side-band records. + * + * This avoids having to scan all the various PMU per-cpu contexts + * looking for them. + */ static void account_pmu_sb_event(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; -- cgit v1.2.3 From ab7fdefba68f66c8523571c3b3a940635d781824 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 3 May 2016 00:26:06 -0700 Subject: perf/core: Fix implicitly enable dynamic interrupt throttle This patch fixes an issue which was introduced by commit: 91a612eea9a3 ("perf/core: Fix dynamic interrupt throttle") ... which commit unconditionally sets the perf_sample_allowed_ns value to !0. But that could trigger a bug in the following corner case: The user can disable the dynamic interrupt throttle mechanism by setting perf_cpu_time_max_percent to 0. Then they change perf_event_max_sample_rate. 
For this case, the mechanism will be enabled implicitly, because perf_sample_allowed_ns becomes !0 - which is not what we want. This patch only updates perf_sample_allowed_ns when the dynamic interrupt throttle mechanism is enabled. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Link: http://lkml.kernel.org/r/1462260366-3160-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f54454ea5f31..f94f164b5054 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -397,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* + * If throttling is disabled don't allow the write: + */ + if (sysctl_perf_cpu_time_max_percent == 100 || + sysctl_perf_cpu_time_max_percent == 0) + return -EINVAL; + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); -- cgit v1.2.3 From a1396555abff9ff9b74c2e4da13e27e81fd094b2 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 9 May 2016 15:07:40 +0530 Subject: perf/abi: Change the errno for sampling event not supported in hardware Change the return code for sampling event not supported from -ENOTSUPP to -EOPNOTSUPP. This allows userspace to identify this case specifically, instead of printing the catch-all error message it did previously. Technically this is an ABI change, but we think we can get away with it. Old behavior: ------- | # perf record ls | Error: | The sys_perf_event_open() syscall returned with 524 (Unknown error 524) | for event (cycles:ppp). | /bin/dmesg may provide additional information. 
| No CONFIG_PERF_EVENTS=y kernel support configured? New behavior: ------- | # perf record ls | Error: | PMU Hardware doesn't support sampling/overflow-interrupts. Signed-off-by: Vineet Gupta Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Vineet Gupta Link: http://lkml.kernel.org/r/1462786660-2900-3-git-send-email-vgupta@synopsys.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f94f164b5054..5d48306879d5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9309,7 +9309,7 @@ SYSCALL_DEFINE5(perf_event_open, if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { - err = -ENOTSUPP; + err = -EOPNOTSUPP; goto err_alloc; } } -- cgit v1.2.3 From 133e89ef5ef338e1358b16246521ba17d935c396 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 13 May 2016 11:56:26 -0700 Subject: locking/rwsem: Enable lockless waiter wakeup(s) As wake_qs gain users, we can teach rwsems about them such that waiters can be awoken without the wait_lock. This is for both readers and writer, the former being the most ideal candidate as we can batch the wakeups shortening the critical region that much more -- ie writer task blocking a bunch of tasks waiting to service page-faults (mmap_sem readers). In general applying wake_qs to rwsem (xadd) is not difficult as the wait_lock is intended to be released soon _anyways_, with the exception of when a writer slowpath will proactively wakeup any queued readers if it sees that the lock is owned by a reader, in which we simply do the wakeups with the lock held (see comment in __rwsem_down_write_failed_common()). 
Similar to other locking primitives, delaying the waiter being awoken does allow, at least in theory, the lock to be stolen in the case of writers, however no harm was seen in this (in fact lock stealing tends to be a _good_ thing in most workloads), and this is a tiny window anyways. Some page-fault (pft) and mmap_sem intensive benchmarks show some pretty constant reduction in systime (by up to ~8 and ~10%) on a 2-socket, 12 core AMD box. In addition, on an 8-core Westmere doing page allocations (page_test) aim9: 4.6-rc6 4.6-rc6 rwsemv2 Min page_test 378167.89 ( 0.00%) 382613.33 ( 1.18%) Min exec_test 499.00 ( 0.00%) 502.67 ( 0.74%) Min fork_test 3395.47 ( 0.00%) 3537.64 ( 4.19%) Hmean page_test 395433.06 ( 0.00%) 414693.68 ( 4.87%) Hmean exec_test 499.67 ( 0.00%) 505.30 ( 1.13%) Hmean fork_test 3504.22 ( 0.00%) 3594.95 ( 2.59%) Stddev page_test 17426.57 ( 0.00%) 26649.92 (-52.93%) Stddev exec_test 0.47 ( 0.00%) 1.41 (-199.05%) Stddev fork_test 63.74 ( 0.00%) 32.59 ( 48.86%) Max page_test 429873.33 ( 0.00%) 456960.00 ( 6.30%) Max exec_test 500.33 ( 0.00%) 507.66 ( 1.47%) Max fork_test 3653.33 ( 0.00%) 3650.90 ( -0.07%) 4.6-rc6 4.6-rc6 rwsemv2 User 1.12 0.04 System 0.23 0.04 Elapsed 727.27 721.98 Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hpe.com Cc: dave@stgolabs.net Cc: jason.low2@hp.com Cc: peter@hurleysoftware.com Link: http://lkml.kernel.org/r/1463165787-25937-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 58 ++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09e30c6225e5..80b05ac0f015 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -114,12 +114,16 @@ enum rwsem_wake_type { * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) * - there must be someone on the queue - * - the spinlock must be held by the caller + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + * to actually wakeup the blocked task(s) and drop the reference count, + * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if downgrading is false + * - writers are only marked woken if downgrading is false */ static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +__rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) - /* Wake writer at the front of the queue, but do not - * grant it the lock yet as we want other writers - * to be able to steal it. 
Readers, on the other hand, - * will block as they will notice the queued writer. + if (wake_type == RWSEM_WAKE_ANY) { + /* + * Mark writer at the front of the queue for wakeup. + * Until the task is actually later awoken later by + * the caller, other writers are able to steal it. + * Readers, on the other hand, will block as they + * will notice the queued writer. */ - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); + } goto out; } @@ -196,7 +203,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) */ smp_mb(); waiter->task = NULL; - wake_up_process(tsk); + wake_q_add(wake_q, tsk); put_task_struct(tsk); } while (--loop); @@ -216,6 +223,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; + WAKE_Q(wake_q); /* set up my own style of waitqueue */ waiter.task = tsk; @@ -238,9 +246,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) if (count == RWSEM_WAITING_BIAS || (count > RWSEM_WAITING_BIAS && adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); /* wait to be given the lock */ while (true) { @@ -440,6 +449,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; + WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); @@ -472,8 +482,19 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * no active writers, the lock must be read owned; so we try to * wake any read locks that were queued ahead of us. 
*/ - if (count > RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + if (count > RWSEM_WAITING_BIAS) { + WAKE_Q(wake_q); + + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + /* + * The wakeup is normally called _after_ the wait_lock + * is released, but given that we are proactively waking + * readers we can deal with the wake_q overhead as it is + * similar to releasing and taking the wait_lock again + * for attempting rwsem_try_write_lock(). + */ + wake_up_q(&wake_q); + } } else count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); @@ -509,8 +530,9 @@ out_nolock: if (list_empty(&sem->wait_list)) rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); else - __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); return ERR_PTR(-EINTR); } @@ -537,6 +559,7 @@ __visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); /* * If a spinner is present, it is not necessary to do the wakeup. 
@@ -573,9 +596,10 @@ locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } @@ -590,14 +614,16 @@ __visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } -- cgit v1.2.3 From e38513905eeaae59056eac2c9ac55a43b1fc41b2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 13 May 2016 11:56:27 -0700 Subject: locking/rwsem: Rework zeroing reader waiter->task Readers that are awoken will expect a nil ->task indicating that a wakeup has occurred. Because of the way readers are implemented, there's a small chance that the waiter will never block in the slowpath (rwsem_down_read_failed), and therefore requires some form of reference counting to avoid the following scenario: rwsem_down_read_failed() rwsem_wake() get_task_struct(); spin_lock_irq(&wait_lock); list_add_tail(&waiter.list) spin_unlock_irq(&wait_lock); raw_spin_lock_irqsave(&wait_lock) __rwsem_do_wake() while (1) { set_task_state(TASK_UNINTERRUPTIBLE); waiter->task = NULL if (!waiter.task) // true break; schedule() // never reached __set_task_state(TASK_RUNNING); do_exit(); wake_up_process(tsk); // boom ... and therefore race with do_exit() when the caller returns. There is also a mismatch between the smp_mb() and its documentation, in that the serialization is done between reading the task and the nil store. 
Furthermore, in addition to having the overlapping of loads and stores to waiter->task guaranteed to be ordered within that CPU, both wake_up_process() originally and now wake_q_add() already imply barriers upon successful calls, which serves the comment. Now, as an alternative to perhaps inverting the checks in the blocker side (which has its own penalty in that schedule is unavoidable), with lockless wakeups this situation is naturally addressed and we can just use the refcount held by wake_q_add(), instead of doing so explicitly. Of course, we must guarantee that the nil store is done as the _last_ operation in that the task must already be marked for deletion to not fall into the race above. Spurious wakeups are also handled transparently in that the task's reference is only removed when wake_up_q() is actually called _after_ the nil store. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hpe.com Cc: dave@stgolabs.net Cc: jason.low2@hp.com Cc: peter@hurleysoftware.com Link: http://lkml.kernel.org/r/1463165787-25937-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 80b05ac0f015..fcbf75ac3dcb 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -194,17 +194,15 @@ __rwsem_mark_wake(struct rw_semaphore *sem, waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; + + wake_q_add(wake_q, tsk); /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed().
+ * Ensure that the last operation is setting the reader + * waiter to nil such that rwsem_down_read_failed() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. */ - smp_mb(); - waiter->task = NULL; - wake_q_add(wake_q, tsk); - put_task_struct(tsk); + smp_store_release(&waiter->task, NULL); } while (--loop); sem->wait_list.next = next; @@ -228,7 +226,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) -- cgit v1.2.3 From c0fcb6c2d332041256dc55d8a1ec3c0a2d0befb8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 16 May 2016 17:38:00 -0700 Subject: locking/rwsem: Optimize write lock by reducing operations in slowpath When acquiring the rwsem write lock in the slowpath, we first try to set count to RWSEM_WAITING_BIAS. When that is successful, we then atomically add the RWSEM_WAITING_BIAS in cases where there are other tasks on the wait list. This causes write lock operations to often issue multiple atomic operations. We can instead make the list_is_singular() check first, and then set the count accordingly, so that we issue at most 1 atomic operation when acquiring the write lock and reduce unnecessary cacheline contention. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Fenghua Yu Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Jason Low Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Matt Turner Cc: Paul E. 
McKenney Cc: Peter Hurley Cc: Peter Zijlstra Cc: Richard Henderson Cc: Terry Rudd Cc: Thomas Gleixner Cc: Tim Chen Cc: Tony Luck Link: http://lkml.kernel.org/r/1463445486-16078-2-git-send-email-jason.low2@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index fcbf75ac3dcb..b957da7fcb19 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -261,17 +261,28 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) } EXPORT_SYMBOL(rwsem_down_read_failed); +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + */ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { /* - * Try acquiring the write lock. Check count first in order - * to reduce unnecessary expensive cmpxchg() operations. + * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. */ - if (count == RWSEM_WAITING_BIAS && - cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, - RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { - if (!list_is_singular(&sem->wait_list)) - rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + if (count != RWSEM_WAITING_BIAS) + return false; + + /* + * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there + * are other tasks on the wait list, we need to add on WAITING_BIAS. + */ + count = list_is_singular(&sem->wait_list) ? 
+ RWSEM_ACTIVE_WRITE_BIAS : + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; + + if (cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) { rwsem_set_owner(sem); return true; } -- cgit v1.2.3 From 6e2814745c67ab422b86262b05e6f23a56f28aa3 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 20 May 2016 15:19:36 -0700 Subject: locking/mutex: Set and clear owner using WRITE_ONCE() The mutex owner can get read and written to locklessly. Use WRITE_ONCE when setting and clearing the owner field in order to avoid optimizations such as store tearing. This avoids situations where the owner field gets written to with multiple stores and another thread could concurrently read and use a partially written owner value. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Davidlohr Bueso Acked-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Terry Rudd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463782776.2479.9.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- kernel/locking/mutex-debug.h | 4 ++-- kernel/locking/mutex.h | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..372e6530180d 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -29,12 +29,12 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #define spin_lock_mutex(lock, flags) \ diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..12f96199441c 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -17,14 +17,20 @@ __list_del((waiter)->list.prev, 
(waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * The mutex owner can get read and written to locklessly. + * We should use WRITE_ONCE when writing the owner value to + * avoid store tearing, otherwise, a thread could potentially + * read a partially written and incomplete owner value. + */ static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #else static inline void mutex_set_owner(struct mutex *lock) -- cgit v1.2.3 From a4f144ebbdf6f7807c477bce8e136047ed27321f Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Wed, 1 Jun 2016 12:33:05 -0700 Subject: perf/core: Fix crash due to account/unaccount_sb_event() inconsistency unaccount_pmu_sb_event() did not check for attributes in event->attr before calling detach_sb_event(), while account_pmu_event() did. This caused NULL pointer reference in cgroup events that did not have any of the attributes checked by account_pmu_event(). To trigger the bug just wait for a cgroup event to terminate, e.g.: $ mkdir /dev/cgroup/devices/test $ perf stat -e cycles -a -G test sleep 0 ... see crash ... 
Signed-off-by: David Carrillo-Cisneros Reviewed-by: Stephane Eranian Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Zheng Link: http://lkml.kernel.org/r/1464809585-66072-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5d48306879d5..ae081a141a4a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3682,15 +3682,28 @@ static void detach_sb_event(struct perf_event *event) raw_spin_unlock(&pel->lock); } -static void unaccount_pmu_sb_event(struct perf_event *event) +static bool is_sb_event(struct perf_event *event) { + struct perf_event_attr *attr = &event->attr; + if (event->parent) - return; + return false; if (event->attach_state & PERF_ATTACH_TASK) - return; + return false; - detach_sb_event(event); + if (attr->mmap || attr->mmap_data || attr->mmap2 || + attr->comm || attr->comm_exec || + attr->task || + attr->context_switch) + return true; + return false; +} + +static void unaccount_pmu_sb_event(struct perf_event *event) +{ + if (is_sb_event(event)) + detach_sb_event(event); } static void unaccount_event_cpu(struct perf_event *event, int cpu) @@ -8666,18 +8679,7 @@ static void attach_sb_event(struct perf_event *event) */ static void account_pmu_sb_event(struct perf_event *event) { - struct perf_event_attr *attr = &event->attr; - - if (event->parent) - return; - - if (event->attach_state & PERF_ATTACH_TASK) - return; - - if (attr->mmap || attr->mmap_data || attr->mmap2 || - attr->comm || attr->comm_exec || - attr->task || - attr->context_switch) + if (is_sb_event(event)) attach_sb_event(event); } -- cgit v1.2.3 From dfaaf3fa01d65cf6e2072965bb0b7aaa7285344f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 May 2016 18:31:33 +0200 Subject: 
locking/lockdep: Use __jhash_mix() for iterate_chain_key() Use __jhash_mix() to mix the class_idx into the class_key. This function provides better mixing than the previously used, home grown mix function. Leave hashing to the professionals :-) Suggested-by: George Spelvin Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 81f1a7107c0e..589d763a49b3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -46,6 +46,7 @@ #include #include #include +#include #include @@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE]; * It's a 64-bit hash, because it's important for the keys to be * unique. */ -#define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ - (key2)) +static inline u64 iterate_chain_key(u64 key, u32 idx) +{ + u32 k0 = key, k1 = key >> 32; + + __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */ + + return k0 | (u64)k1 << 32; +} void lockdep_off(void) { -- cgit v1.2.3 From a461d58792d4a46b8b10ae50973ec9b2763b694e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 27 May 2016 15:47:18 +0200 Subject: locking/rtmutex: Only warn once on a trylock from bad context One warning should be enough to get one motivated to fix this. It is possible that this happens more than once and that starts flooding the output. Later the prints will be suppressed so we only get half of it. Depending on the console system used it might not be helpful. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464356838-1755-1-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3e746607abe5..1ec0f48962b3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { - if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) return 0; return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); -- cgit v1.2.3 From 03c041c5bf6ed584dff36b7cd509e0146a124277 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 3 Jun 2016 17:58:41 -0500 Subject: sched/debug: Always show 'nr_migrations' The nr_migrations field is updated independently of CONFIG_SCHEDSTATS, so it can be displayed regardless. 
Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5b1b04057ae2b14d73c2d03f56582c1d38cfe066.1464994423.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0368c393a336..2a0a9995256d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; -#ifdef CONFIG_SCHEDSTATS P(se.nr_migrations); +#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; -- cgit v1.2.3 From 8d53fa19041ae65c484d81d75179b4a577e6d8e4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Jun 2016 09:12:30 +0200 Subject: locking/qspinlock: Clarify xchg_tail() ordering While going over the code I noticed that xchg_tail() is a RELEASE but had no obvious pairing commented. It pairs with a somewhat unique address dependency through decode_tail(). So the store-release of xchg_tail() is paired by the address dependency of the load of xchg_tail followed by the dereference from the pointer computed from that load. The @old -> @prev transformation itself is pure, and therefore does not depend on external state, so that is immaterial wrt. ordering. Signed-off-by: Peter Zijlstra (Intel) Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Pan Xinhui Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5fc8c311b8fe..ee7deb08d43d 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); * therefore increment the cpu number by one. */ -static inline u32 encode_tail(int cpu, int idx) +static inline __pure u32 encode_tail(int cpu, int idx) { u32 tail; @@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx) return tail; } -static inline struct mcs_spinlock *decode_tail(u32 tail) +static inline __pure struct mcs_spinlock *decode_tail(u32 tail) { int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; @@ -455,6 +455,8 @@ queue: * pending stuff. * * p,*,* -> n,*,* + * + * RELEASE, such that the stores to @node must be complete. */ old = xchg_tail(lock, tail); next = NULL; @@ -465,6 +467,15 @@ queue: */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); + /* + * The above xchg_tail() is also a load of @lock which generates, + * through decode_tail(), a pointer. + * + * The address dependency matches the RELEASE of xchg_tail() + * such that the access to @prev must happen after. + */ + smp_read_barrier_depends(); + WRITE_ONCE(prev->next, node); pv_wait_node(node, prev); -- cgit v1.2.3 From 055ce0fd1b86c204430cbc0887165599d6e15090 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Jun 2016 10:36:53 +0200 Subject: locking/qspinlock: Add comments I figured we need to document the spin_is_locked() and spin_unlock_wait() constraints somewhere. Ideally 'someone' would rewrite Documentation/atomic_ops.txt and we could find a place in there.
But currently that document is stale to the point of hardly being useful. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Pan Xinhui Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ee7deb08d43d..2f9153b183c9 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -267,6 +267,63 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif +/* + * Various notes on spin_is_locked() and spin_unlock_wait(), which are + * 'interesting' functions: + * + * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE + * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, + * PPC). Also qspinlock has a similar issue per construction, the setting of + * the locked byte can be unordered acquiring the lock proper. + * + * This gets to be 'interesting' in the following cases, where the /should/s + * end up false because of this issue. + * + * + * CASE 1: + * + * So the spin_is_locked() correctness issue comes from something like: + * + * CPU0 CPU1 + * + * global_lock(); local_lock(i) + * spin_lock(&G) spin_lock(&L[i]) + * for (i) if (!spin_is_locked(&G)) { + * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); + * return; + * } + * // deal with fail + * + * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such + * that there is exclusion between the two critical sections. 
+ * + * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from + * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) + * /should/ be constrained by the ACQUIRE from spin_lock(&G). + * + * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. + * + * + * CASE 2: + * + * For spin_unlock_wait() there is a second correctness issue, namely: + * + * CPU0 CPU1 + * + * flag = set; + * smp_mb(); spin_lock(&l) + * spin_unlock_wait(&l); if (!flag) + * // add to lockless list + * spin_unlock(&l); + * // iterate lockless list + * + * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 + * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE + * semantics etc..) + * + * Where flag /should/ be ordered against the locked store of l. + */ + /* * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before * issuing an _unordered_ store to set _Q_LOCKED_VAL. -- cgit v1.2.3 From 8ee62b1870be8e630158701632a533d0378e15b8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 3 Jun 2016 22:26:02 -0700 Subject: locking/rwsem: Convert sem->count to 'atomic_long_t' Convert the rwsem count variable to an atomic_long_t since we use it as an atomic variable. This also allows us to remove the rwsem_atomic_{add,update}() "abstraction" which would now be an unnecessary level of indirection. In follow up patches, we also remove the rwsem_atomic_{add,update}() definitions across the various architectures. Suggested-by: Peter Zijlstra Signed-off-by: Jason Low [ Build warning fixes on various architectures. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Fenghua Yu Cc: Heiko Carstens Cc: Jason Low Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Paul E.
McKenney Cc: Peter Hurley Cc: Terry Rudd Cc: Thomas Gleixner Cc: Tim Chen Cc: Tony Luck Cc: Waiman Long Link: http://lkml.kernel.org/r/1465017963-4839-2-git-send-email-jason.low2@hpe.com Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/rwsem.h | 26 +++++++++++++------------- arch/ia64/include/asm/rwsem.h | 24 ++++++++++++------------ include/asm-generic/rwsem.h | 6 +++--- include/linux/rwsem.h | 8 +++++--- kernel/locking/rwsem-xadd.c | 32 +++++++++++++++++--------------- 5 files changed, 50 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index 0131a7058778..b40021aabb9f 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -25,8 +25,8 @@ static inline void __down_read(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP - oldcount = sem->count; - sem->count += RWSEM_ACTIVE_READ_BIAS; + oldcount = sem->count.counter; + sem->count.counter += RWSEM_ACTIVE_READ_BIAS; #else long temp; __asm__ __volatile__( @@ -52,13 +52,13 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) { long old, new, res; - res = sem->count; + res = atomic_long_read(&sem->count); do { new = res + RWSEM_ACTIVE_READ_BIAS; if (new <= 0) break; old = res; - res = cmpxchg(&sem->count, old, new); + res = atomic_long_cmpxchg(&sem->count, old, new); } while (res != old); return res >= 0 ? 
1 : 0; } @@ -67,8 +67,8 @@ static inline long ___down_write(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP - oldcount = sem->count; - sem->count += RWSEM_ACTIVE_WRITE_BIAS; + oldcount = sem->count.counter; + sem->count.counter += RWSEM_ACTIVE_WRITE_BIAS; #else long temp; __asm__ __volatile__( @@ -106,7 +106,7 @@ static inline int __down_write_killable(struct rw_semaphore *sem) */ static inline int __down_write_trylock(struct rw_semaphore *sem) { - long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, + long ret = atomic_long_cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); if (ret == RWSEM_UNLOCKED_VALUE) return 1; @@ -117,8 +117,8 @@ static inline void __up_read(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP - oldcount = sem->count; - sem->count -= RWSEM_ACTIVE_READ_BIAS; + oldcount = sem->count.counter; + sem->count.counter -= RWSEM_ACTIVE_READ_BIAS; #else long temp; __asm__ __volatile__( @@ -142,8 +142,8 @@ static inline void __up_write(struct rw_semaphore *sem) { long count; #ifndef CONFIG_SMP - sem->count -= RWSEM_ACTIVE_WRITE_BIAS; - count = sem->count; + sem->count.counter -= RWSEM_ACTIVE_WRITE_BIAS; + count = sem->count.counter; #else long temp; __asm__ __volatile__( @@ -171,8 +171,8 @@ static inline void __downgrade_write(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP - oldcount = sem->count; - sem->count -= RWSEM_WAITING_BIAS; + oldcount = sem->count.counter; + sem->count.counter -= RWSEM_WAITING_BIAS; #else long temp; __asm__ __volatile__( diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 8b23e070b844..c5d544f188ed 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -40,7 +40,7 @@ static inline void __down_read (struct rw_semaphore *sem) { - long result = ia64_fetchadd8_acq((unsigned long *)&sem->count, 1); + long result = ia64_fetchadd8_acq((unsigned long *)&sem->count.counter, 1); if (result < 0) rwsem_down_read_failed(sem); @@ 
-55,9 +55,9 @@ ___down_write (struct rw_semaphore *sem) long old, new; do { - old = sem->count; + old = atomic_long_read(&sem->count); new = old + RWSEM_ACTIVE_WRITE_BIAS; - } while (cmpxchg_acq(&sem->count, old, new) != old); + } while (atomic_long_cmpxchg_acquire(&sem->count, old, new) != old); return old; } @@ -85,7 +85,7 @@ __down_write_killable (struct rw_semaphore *sem) static inline void __up_read (struct rw_semaphore *sem) { - long result = ia64_fetchadd8_rel((unsigned long *)&sem->count, -1); + long result = ia64_fetchadd8_rel((unsigned long *)&sem->count.counter, -1); if (result < 0 && (--result & RWSEM_ACTIVE_MASK) == 0) rwsem_wake(sem); @@ -100,9 +100,9 @@ __up_write (struct rw_semaphore *sem) long old, new; do { - old = sem->count; + old = atomic_long_read(&sem->count); new = old - RWSEM_ACTIVE_WRITE_BIAS; - } while (cmpxchg_rel(&sem->count, old, new) != old); + } while (atomic_long_cmpxchg_release(&sem->count, old, new) != old); if (new < 0 && (new & RWSEM_ACTIVE_MASK) == 0) rwsem_wake(sem); @@ -115,8 +115,8 @@ static inline int __down_read_trylock (struct rw_semaphore *sem) { long tmp; - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg_acq(&sem->count, tmp, tmp+1)) { + while ((tmp = atomic_long_read(&sem->count)) >= 0) { + if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, tmp+1)) { return 1; } } @@ -129,8 +129,8 @@ __down_read_trylock (struct rw_semaphore *sem) static inline int __down_write_trylock (struct rw_semaphore *sem) { - long tmp = cmpxchg_acq(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + long tmp = atomic_long_cmpxchg_acquire(&sem->count, + RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); return tmp == RWSEM_UNLOCKED_VALUE; } @@ -143,9 +143,9 @@ __downgrade_write (struct rw_semaphore *sem) long old, new; do { - old = sem->count; + old = atomic_long_read(&sem->count); new = old - RWSEM_WAITING_BIAS; - } while (cmpxchg_rel(&sem->count, old, new) != old); + } while (atomic_long_cmpxchg_release(&sem->count, old, 
new) != old); if (old < 0) rwsem_downgrade_wake(sem); diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h index 3fc94a046bf5..a3a93eca766c 100644 --- a/include/asm-generic/rwsem.h +++ b/include/asm-generic/rwsem.h @@ -41,8 +41,8 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) { long tmp; - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg_acquire(&sem->count, tmp, + while ((tmp = atomic_long_read(&sem->count)) >= 0) { + if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, tmp + RWSEM_ACTIVE_READ_BIAS)) { return 1; } @@ -79,7 +79,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) { long tmp; - tmp = cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, + tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); return tmp == RWSEM_UNLOCKED_VALUE; } diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index d37fbb34d06f..dd1d14250340 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -23,10 +23,11 @@ struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK #include /* use a generic implementation */ +#define __RWSEM_INIT_COUNT(name) .count = RWSEM_UNLOCKED_VALUE #else /* All arch specific implementations share the same struct */ struct rw_semaphore { - long count; + atomic_long_t count; struct list_head wait_list; raw_spinlock_t wait_lock; #ifdef CONFIG_RWSEM_SPIN_ON_OWNER @@ -54,9 +55,10 @@ extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); /* In all implementations count != 0 means locked */ static inline int rwsem_is_locked(struct rw_semaphore *sem) { - return sem->count != 0; + return atomic_long_read(&sem->count) != 0; } +#define __RWSEM_INIT_COUNT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) #endif /* Common initializer macros and functions */ @@ -74,7 +76,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) #endif #define __RWSEM_INITIALIZER(name) \ - { .count = RWSEM_UNLOCKED_VALUE, \ + { 
__RWSEM_INIT_COUNT(name), \ .wait_list = LIST_HEAD_INIT((name).wait_list), \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \ __RWSEM_OPT_INIT(name) \ diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b957da7fcb19..63b40a5c62ec 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif - sem->count = RWSEM_UNLOCKED_VALUE; + atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER @@ -153,10 +153,11 @@ __rwsem_mark_wake(struct rw_semaphore *sem, if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment; + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { /* A writer stole the lock. Undo our reader grant. */ - if (rwsem_atomic_update(-adjustment, sem) & + if (atomic_long_sub_return(adjustment, &sem->count) & RWSEM_ACTIVE_MASK) goto out; /* Last active locker left. Retry waking readers. */ @@ -186,7 +187,7 @@ __rwsem_mark_wake(struct rw_semaphore *sem, adjustment -= RWSEM_WAITING_BIAS; if (adjustment) - rwsem_atomic_add(adjustment, sem); + atomic_long_add(adjustment, &sem->count); next = sem->wait_list.next; loop = woken; @@ -233,7 +234,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + count = atomic_long_add_return(adjustment, &sem->count); /* If there are no active locks, wake the front queued process(es). 
* @@ -282,7 +283,8 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) RWSEM_ACTIVE_WRITE_BIAS : RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; - if (cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) { + if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) + == RWSEM_WAITING_BIAS) { rwsem_set_owner(sem); return true; } @@ -296,13 +298,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = READ_ONCE(sem->count); + long old, count = atomic_long_read(&sem->count); while (true) { if (!(count == 0 || count == RWSEM_WAITING_BIAS)) return false; - old = cmpxchg_acquire(&sem->count, count, + old = atomic_long_cmpxchg_acquire(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); if (old == count) { rwsem_set_owner(sem); @@ -324,7 +326,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) rcu_read_lock(); owner = READ_ONCE(sem->owner); if (!owner) { - long count = READ_ONCE(sem->count); + long count = atomic_long_read(&sem->count); /* * If sem->owner is not set, yet we have just recently entered the * slowpath with the lock being active, then there is a possibility @@ -375,7 +377,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) * held by readers. Check the counter to verify the * state. 
*/ - count = READ_ONCE(sem->count); + count = atomic_long_read(&sem->count); return (count == 0 || count == RWSEM_WAITING_BIAS); } @@ -460,7 +462,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ - count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); /* do optimistic spinning and steal lock if possible */ if (rwsem_optimistic_spin(sem)) @@ -483,7 +485,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) /* we're now waiting on the lock, but no longer actively locking */ if (waiting) { - count = READ_ONCE(sem->count); + count = atomic_long_read(&sem->count); /* * If there were already threads queued before us and there are @@ -505,7 +507,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) } } else - count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); /* wait until we successfully acquire the lock */ set_current_state(state); @@ -521,7 +523,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) schedule(); set_current_state(state); - } while ((count = sem->count) & RWSEM_ACTIVE_MASK); + } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } @@ -536,7 +538,7 @@ out_nolock: raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); if (list_empty(&sem->wait_list)) - rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); else __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3 From 19c5d690e41697fcdd19379ab9d10d8d37818414 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:19 -0400 Subject: locking/rwsem: Add reader-owned state to the owner field Currently, it is not possible to 
determine for sure if a reader owns a rwsem by looking at the content of the rwsem data structure. This patch adds a new state RWSEM_READER_OWNED to the owner field to indicate that readers currently own the lock. This enables us to address the following 2 issues in the rwsem optimistic spinning code: 1) rwsem_can_spin_on_owner() will disallow optimistic spinning if the owner field is NULL which can mean either the readers own the lock or the owning writer hasn't set the owner field yet. In the latter case, we miss the chance to do optimistic spinning. 2) While a writer is waiting in the OSQ and a reader takes the lock, the writer will continue to spin when out of the OSQ in the main rwsem_optimistic_spin() loop as the owner field is NULL wasting CPU cycles if some of the readers are sleeping. Adding the new state will allow optimistic spinning to go forward as long as the owner field is not RWSEM_READER_OWNED and the owner is running, if set, but stop immediately when that state has been reached. On a 4-socket Haswell machine running on a 4.6-rc1 based kernel, the fio test with multithreaded randrw and randwrite tests on the same file on a XFS partition on top of a NVDIMM were run, the aggregated bandwidths before and after the patch were as follows: Test BW before patch BW after patch % change ---- --------------- -------------- -------- randrw 988 MB/s 1192 MB/s +21% randwrite 1513 MB/s 1623 MB/s +7.3% The perf profile of the rwsem_down_write_failed() function in randrw before and after the patch were: 19.95% 5.88% fio [kernel.vmlinux] [k] rwsem_down_write_failed 14.20% 1.52% fio [kernel.vmlinux] [k] rwsem_down_write_failed The actual CPU cycles spent in rwsem_down_write_failed() dropped from 5.88% to 1.52% after the patch. The xfstests suite was also run and no regression was observed. 
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jason Low Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Dave Chinner Cc: Douglas Hatch Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Hurley Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 41 ++++++++++++++++++++++------------------- kernel/locking/rwsem.c | 8 ++++++-- kernel/locking/rwsem.h | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 63b40a5c62ec..6b0d0605910e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -163,6 +163,12 @@ __rwsem_mark_wake(struct rw_semaphore *sem, /* Last active locker left. Retry waking readers. */ goto try_reader_grant; } + /* + * It is not really necessary to set it to reader-owned here, + * but it gives the spinners an early indication that the + * readers now have the lock. + */ + rwsem_set_reader_owned(sem); } /* Grant an infinite number of read locks to the readers at the front @@ -325,16 +331,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) rcu_read_lock(); owner = READ_ONCE(sem->owner); - if (!owner) { - long count = atomic_long_read(&sem->count); + if (!rwsem_owner_is_writer(owner)) { /* - * If sem->owner is not set, yet we have just recently entered the - * slowpath with the lock being active, then there is a possibility - * reader(s) may have the lock. To be safe, bail spinning in these - * situations. + * Don't spin if the rwsem is readers owned. 
*/ - if (count & RWSEM_ACTIVE_MASK) - ret = false; + ret = !rwsem_owner_is_reader(owner); goto done; } @@ -347,8 +348,6 @@ done: static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) { - long count; - rcu_read_lock(); while (sem->owner == owner) { /* @@ -369,16 +368,11 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) } rcu_read_unlock(); - if (READ_ONCE(sem->owner)) - return true; /* new owner, continue spinning */ - /* - * When the owner is not set, the lock could be free or - * held by readers. Check the counter to verify the - * state. + * If there is a new owner or the owner is not set, we continue + * spinning. */ - count = atomic_long_read(&sem->count); - return (count == 0 || count == RWSEM_WAITING_BIAS); + return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) @@ -397,7 +391,16 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) while (true) { owner = READ_ONCE(sem->owner); - if (owner && !rwsem_spin_on_owner(sem, owner)) + /* + * Don't spin if + * 1) the owner is a reader as we we can't determine if the + * reader is actively running or not. + * 2) The rwsem_spin_on_owner() returns false which means + * the owner isn't running. 
+ */ + if (rwsem_owner_is_reader(owner) || + (rwsem_owner_is_writer(owner) && + !rwsem_spin_on_owner(sem, owner))) break; /* wait_lock will be acquired if write_lock is obtained */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2e853ad93a3a..45ba475d4be3 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read); @@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_reader_owned(sem); + } return ret; } @@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ - rwsem_clear_owner(sem); + rwsem_set_reader_owned(sem); __downgrade_write(sem); } @@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_nested); diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 870ed9a5b426..8f43ba234787 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,3 +1,20 @@ +/* + * The owner field of the rw_semaphore structure will be set to + * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear + * the owner field when it unlocks. A reader, on the other hand, will + * not touch the owner field when it unlocks. 
+ * + * In essence, the owner field now has the following 3 states: + * 1) 0 + * - lock is free or the owner hasn't set the field yet + * 2) RWSEM_READER_OWNED + * - lock is currently or previously owned by readers (lock is free + * or not set by owner yet) + * 3) Other non-zero value + * - a writer owns the lock + */ +#define RWSEM_READER_OWNED ((struct task_struct *)1UL) + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER static inline void rwsem_set_owner(struct rw_semaphore *sem) { @@ -9,6 +26,26 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem) sem->owner = NULL; } +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ + /* + * We check the owner value first to make sure that we will only + * do a write to the rwsem cacheline when it is really necessary + * to minimize cacheline contention. + */ + if (sem->owner != RWSEM_READER_OWNED) + sem->owner = RWSEM_READER_OWNED; +} + +static inline bool rwsem_owner_is_writer(struct task_struct *owner) +{ + return owner && owner != RWSEM_READER_OWNED; +} + +static inline bool rwsem_owner_is_reader(struct task_struct *owner) +{ + return owner == RWSEM_READER_OWNED; +} #else static inline void rwsem_set_owner(struct rw_semaphore *sem) { @@ -17,4 +54,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem) static inline void rwsem_clear_owner(struct rw_semaphore *sem) { } + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ +} #endif -- cgit v1.2.3 From fb6a44f33be542fd81575ff93a4e8118d6a58592 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:20 -0400 Subject: locking/rwsem: Protect all writes to owner by WRITE_ONCE() Without using WRITE_ONCE(), the compiler can potentially break a write into multiple smaller ones (store tearing). So a read from the same data by another task concurrently may return a partial result. This can result in a kernel crash if the data is a memory address that is being dereferenced. 
This patch changes all writes to rwsem->owner to use WRITE_ONCE() to make sure that store tearing will not happen. READ_ONCE() may not be needed for rwsem->owner as long as the value is only used for comparison and not dereferencing. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Dave Chinner Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Hurley Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-3-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 8f43ba234787..a699f4048ba1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -16,14 +16,21 @@ #define RWSEM_READER_OWNED ((struct task_struct *)1UL) #ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. + */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { - sem->owner = current; + WRITE_ONCE(sem->owner, current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { - sem->owner = NULL; + WRITE_ONCE(sem->owner, NULL); } static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) @@ -34,7 +41,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) * to minimize cacheline contention. 
*/ if (sem->owner != RWSEM_READER_OWNED) - sem->owner = RWSEM_READER_OWNED; + WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); } static inline bool rwsem_owner_is_writer(struct task_struct *owner) -- cgit v1.2.3 From bf7b4c472db44413251bcef79ca1f6bf1ec81475 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:22 -0400 Subject: locking/rwsem: Improve reader wakeup code In __rwsem_do_wake(), the reader wakeup code will assume a writer has stolen the lock if the active reader/writer count is not 0. However, this is not as reliable an indicator as the original "< RWSEM_WAITING_BIAS" check. If another reader is present, the code will still break out and exit even if the writer is gone. This patch changes it to check the same "< RWSEM_WAITING_BIAS" condition to reduce the chance of false positive. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Peter Hurley Cc: Andrew Morton Cc: Dave Chinner Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-5-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 6b0d0605910e..4f1daf5a472d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -156,9 +156,14 @@ __rwsem_mark_wake(struct rw_semaphore *sem, oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment; if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* A writer stole the lock. Undo our reader grant. */ - if (atomic_long_sub_return(adjustment, &sem->count) & - RWSEM_ACTIVE_MASK) + /* + * If the count is still less than RWSEM_WAITING_BIAS + * after removing the adjustment, it is assumed that + * a writer has stolen the lock. 
We have to undo our + * reader grant. + */ + if (atomic_long_add_return(-adjustment, &sem->count) < + RWSEM_WAITING_BIAS) goto out; /* Last active locker left. Retry waking readers. */ goto try_reader_grant; -- cgit v1.2.3 From ddd0fa73c2b71c35de4fe7ae60a5f1a6cddc2cf0 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:23 -0400 Subject: locking/rwsem: Streamline the rwsem_optimistic_spin() code This patch moves the owner loading and checking code entirely inside of rwsem_spin_on_owner() to simplify the logic of rwsem_optimistic_spin() loop. Suggested-by: Peter Hurley Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Peter Hurley Cc: Andrew Morton Cc: Dave Chinner Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-6-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 4f1daf5a472d..2031281bb940 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -350,9 +350,16 @@ done: return ret; } -static noinline -bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +/* + * Return true only if we can still spin on the owner field of the rwsem. + */ +static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) { + struct task_struct *owner = READ_ONCE(sem->owner); + + if (!rwsem_owner_is_writer(owner)) + goto out; + rcu_read_lock(); while (sem->owner == owner) { /* @@ -372,7 +379,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) cpu_relax_lowlatency(); } rcu_read_unlock(); - +out: /* * If there is a new owner or the owner is not set, we continue * spinning. 
@@ -382,7 +389,6 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { - struct task_struct *owner; bool taken = false; preempt_disable(); @@ -394,21 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) if (!osq_lock(&sem->osq)) goto done; - while (true) { - owner = READ_ONCE(sem->owner); + /* + * Optimistically spin on the owner field and attempt to acquire the + * lock whenever the owner changes. Spinning will be stopped when: + * 1) the owning writer isn't running; or + * 2) readers own the lock as we can't determine if they are + * actively running or not. + */ + while (rwsem_spin_on_owner(sem)) { /* - * Don't spin if - * 1) the owner is a reader as we we can't determine if the - * reader is actively running or not. - * 2) The rwsem_spin_on_owner() returns false which means - * the owner isn't running. + * Try to acquire the lock */ - if (rwsem_owner_is_reader(owner) || - (rwsem_owner_is_writer(owner) && - !rwsem_spin_on_owner(sem, owner))) - break; - - /* wait_lock will be acquired if write_lock is obtained */ if (rwsem_try_write_lock_unqueued(sem)) { taken = true; break; @@ -420,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * we're an RT task that will live-lock because we won't let * the owner complete. */ - if (!owner && (need_resched() || rt_task(current))) + if (!sem->owner && (need_resched() || rt_task(current))) break; /* -- cgit v1.2.3 From 6428671bae97caa7040e24e79e969fd87908f4f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Jun 2016 20:58:15 +0200 Subject: locking/mutex: Optimize mutex_trylock() fast-path A while back Viro posted a number of 'interesting' mutex_is_locked() users on IRC, one of those was RCU. RCU seems to use mutex_is_locked() to avoid doing mutex_trylock(), the regular load before modify pattern. 
While the use isn't wrong per se, it's curious in that it's needed at all; mutex_trylock() should be good enough on its own to avoid the pointless cacheline bounces. So fix those and remove the mutex_is_locked() (ab)use from RCU. Reported-by: Al Viro Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paul McKenney Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Link: http://lkml.kernel.org/r/20160601185815.GW3190@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/mutex.h | 2 +- arch/powerpc/include/asm/mutex.h | 2 +- arch/x86/include/asm/mutex_32.h | 2 +- arch/x86/include/asm/mutex_64.h | 6 +++--- include/asm-generic/mutex-dec.h | 2 +- include/asm-generic/mutex-xchg.h | 6 +++++- kernel/rcu/tree.c | 1 - 7 files changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/mutex.h b/arch/ia64/include/asm/mutex.h index f41e66d65e31..28cb819e0ff9 100644 --- a/arch/ia64/include/asm/mutex.h +++ b/arch/ia64/include/asm/mutex.h @@ -82,7 +82,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - if (cmpxchg_acq(count, 1, 0) == 1) + if (atomic_read(count) == 1 && cmpxchg_acq(count, 1, 0) == 1) return 1; return 0; } diff --git a/arch/powerpc/include/asm/mutex.h b/arch/powerpc/include/asm/mutex.h index 127ab23e1f6c..078155fa1189 100644 --- a/arch/powerpc/include/asm/mutex.h +++ b/arch/powerpc/include/asm/mutex.h @@ -124,7 +124,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1)) + if (likely(atomic_read(count) == 1 && __mutex_cmpxchg_lock(count, 1, 0) == 1)) return 1; return 0; } diff --git a/arch/x86/include/asm/mutex_32.h 
b/arch/x86/include/asm/mutex_32.h index 85e6cda45a02..e9355a84fc67 100644 --- a/arch/x86/include/asm/mutex_32.h +++ b/arch/x86/include/asm/mutex_32.h @@ -101,7 +101,7 @@ static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { /* cmpxchg because it never induces a false contention state. */ - if (likely(atomic_cmpxchg(count, 1, 0) == 1)) + if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) return 1; return 0; diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h index 07537a44216e..d9850758464e 100644 --- a/arch/x86/include/asm/mutex_64.h +++ b/arch/x86/include/asm/mutex_64.h @@ -118,10 +118,10 @@ do { \ static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - if (likely(atomic_cmpxchg(count, 1, 0) == 1)) + if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) return 1; - else - return 0; + + return 0; } #endif /* _ASM_X86_MUTEX_64_H */ diff --git a/include/asm-generic/mutex-dec.h b/include/asm-generic/mutex-dec.h index fd694cfd678a..c54829d3de37 100644 --- a/include/asm-generic/mutex-dec.h +++ b/include/asm-generic/mutex-dec.h @@ -80,7 +80,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - if (likely(atomic_cmpxchg_acquire(count, 1, 0) == 1)) + if (likely(atomic_read(count) == 1 && atomic_cmpxchg_acquire(count, 1, 0) == 1)) return 1; return 0; } diff --git a/include/asm-generic/mutex-xchg.h b/include/asm-generic/mutex-xchg.h index a6b4a7bd6ac9..3269ec4e195f 100644 --- a/include/asm-generic/mutex-xchg.h +++ b/include/asm-generic/mutex-xchg.h @@ -91,8 +91,12 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - int prev = atomic_xchg_acquire(count, 0); + int prev; + if (atomic_read(count) != 1) + return 0; + + 
prev = atomic_xchg_acquire(count, 0); if (unlikely(prev < 0)) { /* * The lock was marked contended so we must restore that diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7f1bc4f817c..b7326893221f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3681,7 +3681,6 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && (rnp == rnp_root || ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && - !mutex_is_locked(&rsp->exp_mutex) && mutex_trylock(&rsp->exp_mutex)) goto fastpath; -- cgit v1.2.3 From b5227d03b7191a9a44bf75a4c228a6a9ddbe781b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 31 May 2016 16:23:02 -0500 Subject: timers: Clarify usleep_range() function comment Update the usleep_range() function comment to make it clear that it can only be used in non-atomic context. Previously we claimed usleep_range() was a drop-in replacement for udelay() where wakeup is flexible. But that's only true in non-atomic contexts, where it's possible to sleep instead of delay. Signed-off-by: Bjorn Helgaas Cc: John Stultz Link: http://lkml.kernel.org/r/20160531212302.28502.44995.stgit@bhelgaas-glaptop2.roam.corp.google.com Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3a95f9728778..67dd6103003a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1702,9 +1702,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) } /** - * usleep_range - Drop in replacement for udelay where wakeup is flexible + * usleep_range - Sleep for an approximate time * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep + * + * In non-atomic context where the exact wakeup time is flexible, use + * usleep_range() instead of udelay(). 
The sleep improves responsiveness + * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces + * power usage by allowing hrtimers to take advantage of an already- + * scheduled interrupt instead of scheduling a new one just for this sleep. */ void __sched usleep_range(unsigned long min, unsigned long max) { -- cgit v1.2.3 From 86721ab63b61ef1dd7305308e4049f644703decf Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Tue, 1 Mar 2016 22:58:49 +0530 Subject: hrtimer: Remove redundant #ifdef block Only need CONFIG_NO_HZ_COMMON as this block is already in a CONFIG_SMP block. Signed-off-by: Pratyush Patel Link: http://lkml.kernel.org/r/20160301172849.GA18152@cyborg Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e99df0ff1d42..d13c9aebf7a3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) -- cgit v1.2.3 From fe3464ca8710012a247bb4586dde21b080f88514 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Sat, 19 Mar 2016 21:59:19 +0800 Subject: genirq: Remove redundant NULL check of irq_desc for_each_irq_desc() macro has already skipped NULL irq_desc, don't bother to check it again. 
Signed-off-by: Jianyu Zhan Cc: mingo@kernel.org Cc: yhlu.kernel@gmail.com Link: http://lkml.kernel.org/r/1458395959-7046-1-git-send-email-nasa4836@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4e1b94726818..50a8f28be247 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -421,12 +421,8 @@ void init_irq_proc(void) /* * Create entries for all existing IRQs. */ - for_each_irq_desc(irq, desc) { - if (!desc) - continue; - + for_each_irq_desc(irq, desc) register_irq_proc(irq, desc); - } } #ifdef CONFIG_GENERIC_IRQ_SHOW -- cgit v1.2.3 From ff5b706f5189fe8d2a6fd576b491b769ec1d29d3 Mon Sep 17 00:00:00 2001 From: Weongyo Jeong Date: Thu, 31 Mar 2016 12:15:03 -0700 Subject: genirq: Remove unnecessary memset() calls The kernel implementations of sprintf() and snprintf() guarantee that the result is terminated with a null byte if size is larger than 0, so we don't need to call memset() at all. 
Signed-off-by: Weongyo Jeong Link: http://lkml.kernel.org/r/1459451703-5744-1-git-send-email-weongyo.linux@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 50a8f28be247..f30425dce9dd 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) !name_unique(irq, action)) return; - memset(name, 0, MAX_NAMELEN); snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ @@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) if (desc->dir) goto out_unlock; - memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ @@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) #endif remove_proc_entry("spurious", desc->dir); - memset(name, 0, MAX_NAMELEN); sprintf(name, "%u", irq); remove_proc_entry(name, root_irq_dir); } -- cgit v1.2.3 From b62b2cf5759b0c2206ddff92226f1eb8ac8f9f13 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 7 Jun 2016 16:12:26 +0100 Subject: irqdomain: Fix handling of type settings for existing mappings When mapping an IRQ, it is possible that a mapping for the IRQ already exists. If mapping does exist then there are the following issues with regard to the handling of the IRQ type settings ... 1. If the domain is part of a hierarchy, then: a. We do not check that the type settings for the existing mapping match those of the new mapping. b. We do not check to see if the type settings have been programmed yet (and they might not have been) and so we may never set the type. 2. If the domain is NOT part of a hierarchy, we will overwrite the current type settings programmed if they are different from the previous mapping. Please note that irq_create_mapping() calls irq_find_mapping() to check if a mapping already exists. 
Although, it may be unlikely that the type settings for a shared interrupt would not match, nonetheless we should check for this. Therefore, to fix this check if a mapping exists (regardless of whether the domain is part of a hierarchy or not) and if it does then: 1. Return the IRQ number if the type settings match or are not specified. 2. Program the type settings and return the IRQ number if the type settings have not been programmed yet. 3. Otherwise if the type settings do not match, then print a warning and don't return the IRQ number. Furthermore, add a warning if the type returned by irq_domain_translate() has bits outside the sense mask set and then clear these bits. If these bits are not cleared then this will cause the comparison of the type settings for an existing mapping to fail with that of the new mapping even if the sense bits themselves match. The reason being is that the existing type settings are read by calling irq_get_trigger_type() which will clear any bits outside the sense mask. This will allow us to detect irqchips that are not correctly clearing these bits and fix them. Signed-off-by: Jon Hunter Reviewed-by: Marc Zyngier Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8798b6c9e945..f3ff1eb8dd09 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -588,15 +588,42 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) if (irq_domain_translate(domain, fwspec, &hwirq, &type)) return 0; - if (irq_domain_is_hierarchy(domain)) { + /* + * WARN if the irqchip returns a type with bits + * outside the sense mask set and clear these bits. + */ + if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) + type &= IRQ_TYPE_SENSE_MASK; + + /* + * If we've already configured this interrupt, + * don't do it again, or hell will break loose.
+ */ + virq = irq_find_mapping(domain, hwirq); + if (virq) { /* - * If we've already configured this interrupt, - * don't do it again, or hell will break loose. + * If the trigger type is not specified or matches the + * current trigger type then we are done so return the + * interrupt number. */ - virq = irq_find_mapping(domain, hwirq); - if (virq) + if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) return virq; + /* + * If the trigger type has not been set yet, then set + * it now and return the interrupt number. + */ + if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { + irq_set_irq_type(virq, type); + return virq; + } + + pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", + hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); + return 0; + } + + if (irq_domain_is_hierarchy(domain)) { virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); if (virq <= 0) return 0; -- cgit v1.2.3 From 4b357daed698c95d6b5eacc1c3c4afa206071ba2 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 7 Jun 2016 16:12:27 +0100 Subject: genirq: Look-up trigger type if not specified by caller For some devices the IRQ trigger type for a device is read from firmware, such as device-tree. The IRQ trigger type is typically read when the mapping for IRQ is created, which is before the IRQ is requested. Hence, the IRQ trigger type is programmed when mapping the IRQ and not when requesting the IRQ. Although this works for most cases, in order to support IRQ chips which require runtime power management, which may not be accessible prior to requesting the IRQ, it is desirable to look-up the IRQ trigger type when it is requested. Therefore, if the IRQ trigger type is not specified when __setup_irq() is called, look-up the saved IRQ trigger type. This will allow us to defer the programming of the trigger type from when the IRQ is mapped to when it is actually requested. 
Signed-off-by: Jon Hunter Reviewed-by: Marc Zyngier Signed-off-by: Marc Zyngier --- kernel/irq/manage.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef0bc02c3a70..eaedeb74b49d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1116,6 +1116,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->irq = irq; + /* + * If the trigger type is not specified by the caller, + * then use the default for this interrupt. + */ + if (!(new->flags & IRQF_TRIGGER_MASK)) + new->flags |= irqd_get_trigger_type(&desc->irq_data); + /* * Check whether the interrupt nests into another interrupt * thread. -- cgit v1.2.3 From f35ad083783e8ed6ac030f5feb209f864875b413 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 13 Jun 2016 10:39:44 +0100 Subject: genirq: Look-up percpu trigger type if not specified by caller As we now do for non-percpu interrupt, perform a lookup of the interrupt trigger if the user doesn't supply one. The difference here is that we can only do it at enable time (trigger configuration can be per-cpu as well). Signed-off-by: Marc Zyngier --- kernel/irq/manage.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index eaedeb74b49d..f78b0846afb1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1737,7 +1737,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) if (!desc) return; + /* + * If the trigger type is not specified by the caller, then + * use the default for this interrupt. 
+ */ type &= IRQ_TYPE_SENSE_MASK; + if (type == IRQ_TYPE_NONE) + type = irqd_get_trigger_type(&desc->irq_data); + if (type != IRQ_TYPE_NONE) { int ret; -- cgit v1.2.3 From 1e2a7d78499ec8859d2b469051b7b80bad3b08aa Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 7 Jun 2016 16:12:28 +0100 Subject: irqdomain: Don't set type when mapping an IRQ Some IRQ chips, such as GPIO controllers or secondary level interrupt controllers, may require additional runtime power management control to ensure they are accessible. For such IRQ chips, it makes sense to enable the IRQ chip when interrupts are requested and disable them again once all interrupts have been freed. When mapping an IRQ, the IRQ type settings are read and then programmed. The mapping of the IRQ happens before the IRQ is requested and so the programming of the type settings occurs before the IRQ is requested. This is a problem for IRQ chips that require additional power management control because they may not be accessible yet. Therefore, when mapping the IRQ, don't program the type settings, just save them and then program these saved settings when the IRQ is requested (so long as they are not overridden via the call to request the IRQ). Add a stub function for irq_domain_free_irqs() to avoid any compilation errors when CONFIG_IRQ_DOMAIN_HIERARCHY is not selected.
Signed-off-by: Jon Hunter Reviewed-by: Marc Zyngier Signed-off-by: Marc Zyngier --- include/linux/irqdomain.h | 3 +++ kernel/irq/irqdomain.c | 23 ++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f1f36e04d885..317503763314 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -452,6 +452,9 @@ static inline int irq_domain_alloc_irqs(struct irq_domain *domain, return -1; } +static inline void irq_domain_free_irqs(unsigned int virq, + unsigned int nr_irqs) { } + static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return false; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f3ff1eb8dd09..caa6a63d26f0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -567,6 +567,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { struct irq_domain *domain; + struct irq_data *irq_data; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; int virq; @@ -614,7 +615,11 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) * it now and return the interrupt number. 
*/ if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { - irq_set_irq_type(virq, type); + irq_data = irq_get_irq_data(virq); + if (!irq_data) + return 0; + + irqd_set_trigger_type(irq_data, type); return virq; } @@ -634,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) return virq; } - /* Set type if specified and different than the current one */ - if (type != IRQ_TYPE_NONE && - type != irq_get_trigger_type(virq)) - irq_set_irq_type(virq, type); + irq_data = irq_get_irq_data(virq); + if (!irq_data) { + if (irq_domain_is_hierarchy(domain)) + irq_domain_free_irqs(virq, 1); + else + irq_dispose_mapping(virq); + return 0; + } + + /* Store trigger type */ + irqd_set_trigger_type(irq_data, type); + return virq; } EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); -- cgit v1.2.3 From be45beb2df6909d42a6b3b0052601b3eef878fc0 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 7 Jun 2016 16:12:29 +0100 Subject: genirq: Add runtime power management support for IRQ chips Some IRQ chips may be located in a power domain outside of the CPU subsystem and hence will require device specific runtime power management. In order to support such IRQ chips, add a pointer for a device structure to the irq_chip structure, and if this pointer is populated by the IRQ chip driver and CONFIG_PM is selected in the kernel configuration, then the pm_runtime_get/put APIs for this chip will be called when an IRQ is requested/freed, respectively. 
Reviewed-by: Kevin Hilman Signed-off-by: Jon Hunter Signed-off-by: Marc Zyngier --- include/linux/irq.h | 4 ++++ kernel/irq/chip.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/irq/internals.h | 1 + kernel/irq/manage.c | 31 ++++++++++++++++++++++++++++++- 4 files changed, 75 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 4d758a7c604a..6c92a847394d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -315,6 +315,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) /** * struct irq_chip - hardware interrupt chip descriptor * + * @parent_device: pointer to parent device for irqchip * @name: name for /proc/interrupts * @irq_startup: start up the interrupt (defaults to ->enable if NULL) * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL) @@ -354,6 +355,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @flags: chip specific flags */ struct irq_chip { + struct device *parent_device; const char *name; unsigned int (*irq_startup)(struct irq_data *data); void (*irq_shutdown)(struct irq_data *data); @@ -488,6 +490,8 @@ extern void handle_bad_irq(struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); +extern int irq_chip_pm_get(struct irq_data *data); +extern int irq_chip_pm_put(struct irq_data *data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2f9f2b0e79f2..ad8131473774 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1093,3 +1093,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return 0; } + +/** + * irq_chip_pm_get - Enable power for an IRQ chip + * @data: Pointer to interrupt specific data + * + * Enable the power to the IRQ chip 
referenced by the interrupt data + * structure. + */ +int irq_chip_pm_get(struct irq_data *data) +{ + int retval; + + if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) { + retval = pm_runtime_get_sync(data->chip->parent_device); + if (retval < 0) { + pm_runtime_put_noidle(data->chip->parent_device); + return retval; + } + } + + return 0; +} + +/** + * irq_chip_pm_put - Disable power for an IRQ chip + * @data: Pointer to interrupt specific data + * + * Disable the power to the IRQ chip referenced by the interrupt data + * structure, belongs. Note that power will only be disabled, once this + * function has been called for all IRQs that have called irq_chip_pm_get(). + */ +int irq_chip_pm_put(struct irq_data *data) +{ + int retval = 0; + + if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) + retval = pm_runtime_put(data->chip->parent_device); + + return (retval < 0) ? retval : 0; +} diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 09be2c903c6d..d5edcdc9382a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -7,6 +7,7 @@ */ #include #include +#include #ifdef CONFIG_SPARSE_IRQ # define IRQ_BITMAP_BITS (NR_IRQS + 8196) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f78b0846afb1..00cfc852cca8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1416,10 +1416,18 @@ int setup_irq(unsigned int irq, struct irqaction *act) if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); + if (retval) + irq_chip_pm_put(&desc->irq_data); + return retval; } EXPORT_SYMBOL_GPL(setup_irq); @@ -1513,6 +1521,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); return action; @@ -1655,11 +1664,16 @@ int 
request_threaded_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->dev_id = dev_id; + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(desc); if (retval) { + irq_chip_pm_put(&desc->irq_data); kfree(action->secondary); kfree(action); } @@ -1836,6 +1850,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ unregister_handler_proc(irq, action); + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; @@ -1898,10 +1913,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) if (!desc || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); + if (retval) + irq_chip_pm_put(&desc->irq_data); + return retval; } @@ -1945,12 +1968,18 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->percpu_dev_id = dev_id; + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(desc); - if (retval) + if (retval) { + irq_chip_pm_put(&desc->irq_data); kfree(action); + } return retval; } -- cgit v1.2.3 From 3d89e5478bf550a50c99e93adf659369798263b0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jun 2016 18:32:45 +0800 Subject: sched/cputime: Fix prev steal time accouting during CPU hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit: e9532e69b8d1 ("sched/cputime: Fix steal time accounting vs. CPU hotplug") ... set rq->prev_* to 0 after a CPU hotplug comes back, in order to fix the case where (after CPU hotplug) steal time is smaller than rq->prev_steal_time. However, this should never happen. 
Steal time was only smaller because of the KVM-specific bug fixed by the previous patch. Worse, the previous patch triggers a bug on CPU hot-unplug/plug operation: because rq->prev_steal_time is cleared, all of the CPU's past steal time will be accounted again on hot-plug. Since the root cause has been fixed, we can just revert commit e9532e69b8d1. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Thomas Gleixner Fixes: 'commit e9532e69b8d1 ("sched/cputime: Fix steal time accounting vs. CPU hotplug")' Link: http://lkml.kernel.org/r/1465813966-3116-3-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - kernel/sched/sched.h | 13 ------------- 2 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13d0896aff87..c1b537bc4b90 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7227,7 +7227,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) struct rq *rq = cpu_rq(cpu); rq->calc_load_update = calc_load_update; - account_reset_rq(rq); update_max_interval(); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..de607e4febd9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1809,16 +1809,3 @@ static inline void cpufreq_trigger_update(u64 time) {} #else /* arch_scale_freq_capacity */ #define arch_scale_freq_invariant() (false) #endif - -static inline void account_reset_rq(struct rq *rq) -{ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - rq->prev_irq_time = 0; -#endif -#ifdef CONFIG_PARAVIRT - rq->prev_steal_time = 0; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - rq->prev_steal_time_rq = 0; -#endif -} -- cgit v1.2.3 From 807e5b80687c06715d62df51a5473b231e3e8b15 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jun 2016 18:32:46 +0800 Subject: sched/cputime: Add steal 
time support to full dynticks CPU time accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds guest steal-time support to full dynticks CPU time accounting. After the following commit: ff9a9b4c4334 ("sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity") ... time sampling became jiffy based, even if we do the sampling from the context tracking code, so steal_account_process_tick() can be reused to account how many 'ticks' are stolen-time, after the last accumulation. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1465813966-3116-4-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c5498d5..3d60e5d76fdb 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } -static __always_inline bool steal_account_process_tick(void) +static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies) { #ifdef CONFIG_PARAVIRT if (static_key_false(&paravirt_steal_enabled)) { @@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void) * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds.
*/ - steal_jiffies = nsecs_to_jiffies(steal); + steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies); this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); account_steal_time(jiffies_to_cputime(steal_jiffies)); return steal_jiffies; } #endif - return false; + return 0; } /* @@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, u64 cputime = (__force u64) cputime_one_jiffy; u64 *cpustat = kcpustat_this_cpu->cpustat; - if (steal_account_process_tick()) + if (steal_account_process_tick(ULONG_MAX)) return; cputime *= ticks; @@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - if (steal_account_process_tick()) + if (steal_account_process_tick(ULONG_MAX)) return; if (user_tick) @@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - unsigned long delta = now - tsk->vtime_snap; + unsigned long delta_jiffies, steal_jiffies; + delta_jiffies = now - tsk->vtime_snap; + steal_jiffies = steal_account_process_tick(delta_jiffies); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; - return jiffies_to_cputime(delta); + return jiffies_to_cputime(delta_jiffies - steal_jiffies); } static void __vtime_account_system(struct task_struct *tsk) -- cgit v1.2.3 From 2c95afc1e83d93fac3be6923465e1753c2c53b0a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jun 2016 06:14:38 -0700 Subject: perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles on x86 The NMI watchdog uses either the fixed cycles or a generic cycles counter. This causes a lot of conflicts with users of the PMU who want to run a full group including the cycles fixed counter, for example the --topdown support recently added to perf stat. The code needs to fall back to not use groups, which can cause measurement inaccuracy due to multiplexing errors. 
This patch switches the NMI watchdog to use reference cycles on Intel systems. This is actually more accurate than cycles, because cycles can tick faster than the measured CPU Frequency due to Turbo mode. The ref cycles always tick at their frequency, or slower when the system is idling. That means the NMI watchdog can never expire too early, unlike with cycles. The reference cycles tick roughly at the frequency of the TSC, so the same period computation can be used. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: http://lkml.kernel.org/r/1465478079-19993-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 8 ++++++++ include/linux/nmi.h | 1 + kernel/watchdog.c | 7 +++++++ 3 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 7788ce643bf4..016f4263fad4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,8 +18,16 @@ #include #include #include +#include #ifdef CONFIG_HARDLOCKUP_DETECTOR +int hw_nmi_get_event(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return PERF_COUNT_HW_REF_CPU_CYCLES; + return PERF_COUNT_HW_CPU_CYCLES; +} + u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 4630eeae18e0..79858af27209 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -66,6 +66,7 @@ static inline bool trigger_allbutself_cpu_backtrace(void) #ifdef CONFIG_LOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh); +int hw_nmi_get_event(void); extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; diff --git 
a/kernel/watchdog.c b/kernel/watchdog.c index 9acb29f280ec..8dd30fcd91be 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -315,6 +315,12 @@ static int is_softlockup(unsigned long touch_ts) #ifdef CONFIG_HARDLOCKUP_DETECTOR +/* Can be overriden by architecture */ +__weak int hw_nmi_get_event(void) +{ + return PERF_COUNT_HW_CPU_CYCLES; +} + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -604,6 +610,7 @@ static int watchdog_nmi_enable(unsigned int cpu) wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + wd_attr->config = hw_nmi_get_event(); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); -- cgit v1.2.3 From 1f03e8d2919270bd6ef64f39a45ce8df8a9f012a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Apr 2016 10:57:12 +0200 Subject: locking/barriers: Replace smp_cond_acquire() with smp_cond_load_acquire() This new form allows using hardware assisted waiting. Some hardware (ARM64 and x86) allow monitoring an address for changes, so by providing a pointer we can use this to replace the cpu_relax() with hardware optimized methods in the future. Requested-by: Will Deacon Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 25 +++++++++++++++++++------ kernel/locking/qspinlock.c | 12 ++++++------ kernel/sched/core.c | 8 ++++---- kernel/sched/sched.h | 2 +- kernel/smp.c | 2 +- 5 files changed, 31 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 06f27fd9d760..2bcaedc0f032 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -305,21 +305,34 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s }) /** - * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering + * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering + * @ptr: pointer to the variable to wait on * @cond: boolean expression to wait for * * Equivalent to using smp_load_acquire() on the condition variable but employs * the control dependency of the wait to reduce the barrier on many platforms. * + * Due to C lacking lambda expressions we load the value of *ptr into a + * pre-named variable @VAL to be used in @cond. + * * The control dependency provides a LOAD->STORE order, the additional RMB * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order, * aka. ACQUIRE. 
*/ -#define smp_cond_acquire(cond) do { \ - while (!(cond)) \ - cpu_relax(); \ - smp_rmb(); /* ctrl + rmb := acquire */ \ -} while (0) +#ifndef smp_cond_load_acquire +#define smp_cond_load_acquire(ptr, cond_expr) ({ \ + typeof(ptr) __PTR = (ptr); \ + typeof(*ptr) VAL; \ + for (;;) { \ + VAL = READ_ONCE(*__PTR); \ + if (cond_expr) \ + break; \ + cpu_relax(); \ + } \ + smp_rmb(); /* ctrl + rmb := acquire */ \ + VAL; \ +}) +#endif #endif /* __KERNEL__ */ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 2f9153b183c9..1b8dda90ebfa 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -475,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. */ - smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); + smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -562,7 +562,7 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_cond_acquire() call. As the next PV queue head hasn't been + * smp_cond_load_acquire() call. As the next PV queue head hasn't been * designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. @@ -573,7 +573,7 @@ queue: if ((val = pv_wait_head_or_lock(lock, node))) goto locked; - smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); + val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); locked: /* @@ -593,9 +593,9 @@ locked: break; } /* - * The smp_cond_acquire() call above has provided the necessary - * acquire semantics required for locking. At most two - * iterations of this loop may be ran. 
+ * The smp_cond_load_acquire() call above has provided the + * necessary acquire semantics required for locking. At most + * two iterations of this loop may be ran. */ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); if (old == val) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 017d5394f5dc..5cd6931cb2cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1935,7 +1935,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * chain to provide order. Instead we do: * * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_acquire(!X->on_cpu) + * 2) smp_cond_load_acquire(!X->on_cpu) * * Example: * @@ -1946,7 +1946,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * sched-out X * smp_store_release(X->on_cpu, 0); * - * smp_cond_acquire(!X->on_cpu); + * smp_cond_load_acquire(&X->on_cpu, !VAL); * X->state = WAKING * set_task_cpu(X,2) * @@ -1972,7 +1972,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * This means that any means of doing remote wakeups must order the CPU doing * the wakeup against the CPU the task is going to end up running on. This, * however, is already required for the regular Program-Order guarantee above, - * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). * */ @@ -2045,7 +2045,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. 
*/ - smp_cond_acquire(!p->on_cpu); + smp_cond_load_acquire(&p->on_cpu, !VAL); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..425bf5ddaa5a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1113,7 +1113,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * In particular, the load of prev->state in finish_task_switch() must * happen before this. * - * Pairs with the smp_cond_acquire() in try_to_wake_up(). + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); #endif diff --git a/kernel/smp.c b/kernel/smp.c index 74165443c240..36552beed397 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -107,7 +107,7 @@ void __init call_function_init(void) */ static __always_inline void csd_lock_wait(struct call_single_data *csd) { - smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); + smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } static __always_inline void csd_lock(struct call_single_data *csd) -- cgit v1.2.3 From 33ac279677dcc2441cb93d8cb9cf7a74df62814d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2016 13:17:12 +0200 Subject: locking/barriers: Introduce smp_acquire__after_ctrl_dep() Introduce smp_acquire__after_ctrl_dep(), this construct is not uncommon, but the lack of this barrier is. Use it to better express smp_rmb() uses in WRITE_ONCE(), the IPC semaphore code and the qspinlock code. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 17 ++++++++++++----- ipc/sem.c | 14 ++------------ kernel/locking/qspinlock.c | 2 +- 3 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 2bcaedc0f032..59a7004fc7dd 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -304,6 +304,17 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __u.__val; \ }) +/** + * smp_acquire__after_ctrl_dep() - Provide ACQUIRE ordering after a control dependency + * + * A control dependency provides a LOAD->STORE order, the additional RMB + * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order, + * aka. (load)-ACQUIRE. + * + * Architectures that do not do load speculation can have this be barrier(). + */ +#define smp_acquire__after_ctrl_dep() smp_rmb() + /** * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering * @ptr: pointer to the variable to wait on @@ -314,10 +325,6 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s * * Due to C lacking lambda expressions we load the value of *ptr into a * pre-named variable @VAL to be used in @cond. - * - * The control dependency provides a LOAD->STORE order, the additional RMB - * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order, - * aka. ACQUIRE. 
*/ #ifndef smp_cond_load_acquire #define smp_cond_load_acquire(ptr, cond_expr) ({ \ @@ -329,7 +336,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s break; \ cpu_relax(); \ } \ - smp_rmb(); /* ctrl + rmb := acquire */ \ + smp_acquire__after_ctrl_dep(); \ VAL; \ }) #endif diff --git a/ipc/sem.c b/ipc/sem.c index b3757ea0694b..84dff3df11a4 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -259,16 +259,6 @@ static void sem_rcu_free(struct rcu_head *head) ipc_rcu_free(head); } -/* - * spin_unlock_wait() and !spin_is_locked() are not memory barriers, they - * are only control barriers. - * The code must pair with spin_unlock(&sem->lock) or - * spin_unlock(&sem_perm.lock), thus just the control barrier is insufficient. - * - * smp_rmb() is sufficient, as writes cannot pass the control barrier. - */ -#define ipc_smp_acquire__after_spin_is_unlocked() smp_rmb() - /* * Wait until all currently ongoing simple ops have completed. * Caller must own sem_perm.lock. @@ -292,7 +282,7 @@ static void sem_wait_array(struct sem_array *sma) sem = sma->sem_base + i; spin_unlock_wait(&sem->lock); } - ipc_smp_acquire__after_spin_is_unlocked(); + smp_acquire__after_ctrl_dep(); } /* @@ -350,7 +340,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, * complex_count++; * spin_unlock(sem_perm.lock); */ - ipc_smp_acquire__after_spin_is_unlocked(); + smp_acquire__after_ctrl_dep(); /* * Now repeat the test of complex_count: diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 1b8dda90ebfa..730655533440 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -379,7 +379,7 @@ void queued_spin_unlock_wait(struct qspinlock *lock) cpu_relax(); done: - smp_rmb(); /* CTRL + RMB -> ACQUIRE */ + smp_acquire__after_ctrl_dep(); } EXPORT_SYMBOL(queued_spin_unlock_wait); #endif -- cgit v1.2.3 From be3e7844980352756de4261b276ee2ba5be7a26b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2016 14:45:21 
+0200 Subject: locking/spinlock: Update spin_unlock_wait() users With the modified semantics of spin_unlock_wait() a number of explicit barriers can be removed. Also update the comment for the do_exit() usecase, as that was somewhat stale/obscure. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- ipc/sem.c | 1 - kernel/exit.c | 8 ++++++-- kernel/task_work.c | 1 - 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/ipc/sem.c b/ipc/sem.c index 84dff3df11a4..ae72b3cddc8d 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -282,7 +282,6 @@ static void sem_wait_array(struct sem_array *sma) sem = sma->sem_base + i; spin_unlock_wait(&sem->lock); } - smp_acquire__after_ctrl_dep(); } /* diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..0b40791b9e70 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -700,10 +700,14 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* - * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. + * Ensure that all new tsk->pi_lock acquisitions must observe + * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). */ smp_mb(); + /* + * Ensure that we must observe the pi_state in exit_mm() -> + * mm_release() -> exit_pi_state_list(). + */ raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) { diff --git a/kernel/task_work.c b/kernel/task_work.c index 53fa971d000d..6ab4842b00e8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -108,7 +108,6 @@ void task_work_run(void) * fail, but it can play with *work and other entries. */ raw_spin_unlock_wait(&task->pi_lock); - smp_mb(); do { next = work->next; -- cgit v1.2.3 From 0d95092ccba1a30b74fa52ff94ec5415e63744a0 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 8 Apr 2016 05:00:03 -0700 Subject: rcu: Fix outdated rcu_scheduler_active comment The comment header for rcu_scheduler_active states that it is used to optimize synchronize_sched() at early boot. This is incorrect. The synchronize_sched() function instead checks the number of online CPUs. This commit therefore replaces the comment's synchronize_sched() with synchronize_rcu(), which really does use rcu_scheduler_active for this purpose. Reported-by: Lihao Liang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7f1bc4f817c..0fa692f1094e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -130,7 +130,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU * can assume that there is but one task, allowing RCU to (for example) - * optimize synchronize_sched() to a simple barrier(). When this variable + * optimize synchronize_rcu() to a simple barrier(). When this variable * is one, RCU must actually do all the hard work required to detect real * grace periods. This variable is also used to suppress boot-time false * positives from lockdep-RCU error checking. -- cgit v1.2.3 From 590d1757b9d177e7fe3707963d0209d6eefbc746 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 10 Apr 2016 08:23:24 -0700 Subject: rcu: Fix outdated hotplug-exclusion comment in rcu_gp_init() In the past, RCU grace-period initialization excluded CPU-hotplug operations, but this is no longer the case. This commit therefore removes an outdated comment in rcu_gp_init() claiming that these are excluded. Reported-by: Lihao Liang Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0fa692f1094e..6043c14165d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1989,8 +1989,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * of the tree within the rsp->node[] array. Note that other CPUs * will access only the leaves of the hierarchy, thus seeing that no * grace period is in progress, at least until the corresponding - * leaf node has been initialized. In addition, we have excluded - * CPU-hotplug operations. + * leaf node has been initialized. * * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. -- cgit v1.2.3 From d3acab65f274800dd0901f0816f8bca9f2a8c8ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Mar 2016 09:49:04 +0100 Subject: rcu: Remove some superfluous lines I think you'll find this condition is superfluous, as the whole function is under #ifdef of that same. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6043c14165d1..4aefeafb9a95 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4363,9 +4363,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ -- cgit v1.2.3 From 3549c2bc2c4ea8ecfeb9d21cb81cb00c6002b011 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 15 Apr 2016 16:35:29 -0700 Subject: rcu: Move expedited code from tree.c to tree_exp.h People have been having some difficulty finding their way around the RCU code. This commit therefore pulls some of the expedited grace-period code from tree.c to a new tree_exp.h file. This commit is strictly code movement, with the exception of a forward declaration that was added for the sync_sched_exp_online_cleanup() function. A subsequent commit will move the remaining expedited grace-period code from tree_plugin.h to tree_exp.h. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 545 +----------------------------------------------- kernel/rcu/tree_exp.h | 564 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 566 insertions(+), 543 deletions(-) create mode 100644 kernel/rcu/tree_exp.h (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4aefeafb9a95..c844b6142a86 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -159,6 +159,7 @@ static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake); +static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -3447,549 +3448,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } -/* Wrapper functions for expedited grace periods. */ -static void rcu_exp_gp_seq_start(struct rcu_state *rsp) -{ - rcu_seq_start(&rsp->expedited_sequence); -} -static void rcu_exp_gp_seq_end(struct rcu_state *rsp) -{ - rcu_seq_end(&rsp->expedited_sequence); - smp_mb(); /* Ensure that consecutive grace periods serialize. */ -} -static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) -{ - unsigned long s; - - smp_mb(); /* Caller's modifications seen first by other CPUs. 
*/ - s = rcu_seq_snap(&rsp->expedited_sequence); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - return s; -} -static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) -{ - return rcu_seq_done(&rsp->expedited_sequence, s); -} - -/* - * Reset the ->expmaskinit values in the rcu_node tree to reflect any - * recent CPU-online activity. Note that these masks are not cleared - * when CPUs go offline, so they reflect the union of all CPUs that have - * ever been online. This means that this function normally takes its - * no-work-to-do fastpath. - */ -static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) -{ - bool done; - unsigned long flags; - unsigned long mask; - unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); - struct rcu_node *rnp; - struct rcu_node *rnp_up; - - /* If no new CPUs onlined since last time, nothing to do. */ - if (likely(ncpus == rsp->ncpus_snap)) - return; - rsp->ncpus_snap = ncpus; - - /* - * Each pass through the following loop propagates newly onlined - * CPUs for the current rcu_node structure up the rcu_node tree. - */ - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - continue; /* No new CPUs, nothing to do. */ - } - - /* Update this node's mask, track old value for propagation. */ - oldmask = rnp->expmaskinit; - rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* If was already nonzero, nothing to propagate. */ - if (oldmask) - continue; - - /* Propagate the new CPU up the tree. 
*/ - mask = rnp->grpmask; - rnp_up = rnp->parent; - done = false; - while (rnp_up) { - raw_spin_lock_irqsave_rcu_node(rnp_up, flags); - if (rnp_up->expmaskinit) - done = true; - rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); - if (done) - break; - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - } - } -} - -/* - * Reset the ->expmask values in the rcu_node tree in preparation for - * a new expedited grace period. - */ -static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_node *rnp; - - sync_exp_reset_tree_hotplug(rsp); - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * Return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the rcu_state's exp_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return rnp->exp_tasks == NULL && - READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. 
- */ -static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - if (!rnp->expmask) - rcu_initiate_boost(rnp, flags); - else - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ - swake_up(&rsp->expedited_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ - WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; - } -} - -/* - * Report expedited quiescent state for specified node. This is a - * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. - */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - __rcu_report_exp_rnp(rsp, rnp, wake, flags); -} - -/* - * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the rcu_state's - * exp_mutex. - */ -static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, - unsigned long mask, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - rnp->expmask &= ~mask; - __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ -} - -/* - * Report expedited quiescent state for specified rcu_data (CPU). 
- */ -static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, - bool wake) -{ - rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); -} - -/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, - unsigned long s) -{ - if (rcu_exp_gp_seq_done(rsp, s)) { - trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(stat); - return true; - } - return false; -} - -/* - * Funnel-lock acquisition for expedited grace periods. Returns true - * if some other task completed an expedited grace period that this task - * can piggy-back on, and with no mutex held. Otherwise, returns false - * with the mutex held, indicating that the caller must actually do the - * expedited grace period. - */ -static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp = rdp->mynode; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - /* Low-contention fastpath. */ - if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && - (rnp == rnp_root || - ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && - !mutex_is_locked(&rsp->exp_mutex) && - mutex_trylock(&rsp->exp_mutex)) - goto fastpath; - - /* - * Each pass through the following loop works its way up - * the rcu_node tree, returning if others have done the work or - * otherwise falls through to acquire rsp->exp_mutex. The mapping - * from CPU to rcu_node structure can be inexact, as it is just - * promoting locality and is not strictly needed for correctness. - */ - for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) - return true; - - /* Work not done, either wait here or go up. 
*/ - spin_lock(&rnp->exp_lock); - if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { - - /* Someone else doing GP, so wait for them. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, - rnp->grplo, rnp->grphi, - TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone2, s)); - return true; - } - rnp->exp_seq_rq = s; /* Followers can wait on us. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, - rnp->grphi, TPS("nxtlvl")); - } - mutex_lock(&rsp->exp_mutex); -fastpath: - if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { - mutex_unlock(&rsp->exp_mutex); - return true; - } - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - return false; -} - -/* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *data) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - struct rcu_state *rsp = data; - - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - if (rcu_is_cpu_rrupt_from_idle()) { - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); - return; - } - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); - resched_cpu(smp_processor_id()); -} - -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_sched_state; - - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) - return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); - WARN_ON_ONCE(ret); -} - -/* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. 
- */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) -{ - int cpu; - unsigned long flags; - unsigned long mask; - unsigned long mask_ofl_test; - unsigned long mask_ofl_ipi; - int ret; - struct rcu_node *rnp; - - sync_exp_reset_tree(rsp); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - mask_ofl_test |= rdp->grpmask; - } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(mask_ofl_ipi & mask)) - continue; -retry_ipi: - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with offline. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - /* Report quiescent states for those that went offline. 
*/ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); - } -} - -static void synchronize_sched_expedited_wait(struct rcu_state *rsp) -{ - int cpu; - unsigned long jiffies_stall; - unsigned long jiffies_start; - unsigned long mask; - int ndetected; - struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(rsp); - int ret; - - jiffies_stall = rcu_jiffies_till_stall_check(); - jiffies_start = jiffies; - - for (;;) { - ret = swait_event_timeout( - rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) - return; - if (ret < 0) { - /* Hit a signal, disable CPU stall warnings. */ - swait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root)); - return; - } - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rsp->name); - ndetected = 0; - rcu_for_each_leaf_node(rsp, rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - struct rcu_data *rdp; - - if (!(rnp->expmask & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(rsp->rda, cpu); - pr_cont(" %d-%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); - } - mask <<= 1; - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rsp->expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (ndetected) { - pr_err("blocking rcu_node structures:"); - rcu_for_each_node_breadth_first(rsp, rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) - continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - rnp->expmask, - ".T"[!!rnp->exp_tasks]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rsp, rnp) { - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if 
(!(rnp->expmask & mask)) - continue; - dump_cpu_task(cpu); - } - } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; - } -} - -/* - * Wait for the current expedited grace period to complete, and then - * wake up everyone who piggybacked on the just-completed expedited - * grace period. Also update all the ->exp_seq_rq counters as needed - * in order to avoid counter-wrap problems. - */ -static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_node *rnp; - - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - - /* - * Switch over to wakeup mode, allowing the next GP, but -only- the - * next GP, to proceed. - */ - mutex_lock(&rsp->exp_wake_mutex); - mutex_unlock(&rsp->exp_mutex); - - rcu_for_each_node_breadth_first(rsp, rnp) { - if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { - spin_lock(&rnp->exp_lock); - /* Recheck, avoid hang in case someone just arrived. */ - if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) - rnp->exp_seq_rq = s; - spin_unlock(&rnp->exp_lock); - } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); - } - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_wake_mutex); -} - -/** - * synchronize_sched_expedited - Brute-force RCU-sched grace period - * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. 
- * - * This implementation can be thought of as an application of sequence - * locking to expedited grace periods, but using the sequence counter to - * determine when someone else has already done the work instead of for - * retrying readers. - */ -void synchronize_sched_expedited(void) -{ - unsigned long s; - struct rcu_state *rsp = &rcu_sched_state; - - /* If only one CPU, this is automatically a grace period. */ - if (rcu_blocking_is_gp()) - return; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_sched); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -4747,4 +4205,5 @@ void __init rcu_init(void) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } +#include "tree_exp.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h new file mode 100644 index 000000000000..db0909cf7fe1 --- /dev/null +++ b/kernel/rcu/tree_exp.h @@ -0,0 +1,564 @@ +/* + * RCU expedited grace periods + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2016 + * + * Authors: Paul E. McKenney + */ + +/* Wrapper functions for expedited grace periods. */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ + rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ + rcu_seq_end(&rsp->expedited_sequence); + smp_mb(); /* Ensure that consecutive grace periods serialize. */ +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ + unsigned long s; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ + return rcu_seq_done(&rsp->expedited_sequence, s); +} + +/* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity. Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online. This means that this function normally takes its + * no-work-to-do fastpath. + */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ + bool done; + unsigned long flags; + unsigned long mask; + unsigned long oldmask; + int ncpus = READ_ONCE(rsp->ncpus); + struct rcu_node *rnp; + struct rcu_node *rnp_up; + + /* If no new CPUs onlined since last time, nothing to do. */ + if (likely(ncpus == rsp->ncpus_snap)) + return; + rsp->ncpus_snap = ncpus; + + /* + * Each pass through the following loop propagates newly onlined + * CPUs for the current rcu_node structure up the rcu_node tree. 
+ */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->expmaskinit == rnp->expmaskinitnext) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; /* No new CPUs, nothing to do. */ + } + + /* Update this node's mask, track old value for propagation. */ + oldmask = rnp->expmaskinit; + rnp->expmaskinit = rnp->expmaskinitnext; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* If was already nonzero, nothing to propagate. */ + if (oldmask) + continue; + + /* Propagate the new CPU up the tree. */ + mask = rnp->grpmask; + rnp_up = rnp->parent; + done = false; + while (rnp_up) { + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); + if (rnp_up->expmaskinit) + done = true; + rnp_up->expmaskinit |= mask; + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); + if (done) + break; + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + } + } +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp; + + sync_exp_reset_tree_hotplug(rsp); + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + WARN_ON_ONCE(rnp->expmask); + rnp->expmask = rnp->expmaskinit; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + +/* + * Return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the rcu_state's exp_mutex. 
+ */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return rnp->exp_tasks == NULL && + READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. + */ +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ + swake_up(&rsp->expedited_wq); + } + break; + } + mask = rnp->grpmask; + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ + WARN_ON_ONCE(!(rnp->expmask & mask)); + rnp->expmask &= ~mask; + } +} + +/* + * Report expedited quiescent state for specified node. This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the rcu_state's exp_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. 
Caller must hold the rcu_state's + * exp_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!(rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + */ +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, + bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) +{ + if (rcu_exp_gp_seq_done(rsp, s)) { + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); + /* Ensure test happens before caller kfree(). */ + smp_mb__before_atomic(); /* ^^^ */ + atomic_long_inc(stat); + return true; + } + return false; +} + +/* + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. + */ +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. 
*/ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + !mutex_is_locked(&rsp->exp_mutex) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; + + /* + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. + */ + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; + } + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); + } + mutex_lock(&rsp->exp_mutex); +fastpath: + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; + } + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + return false; +} + +/* Invoked on each online non-idle CPU for expedited quiescent state. 
*/ +static void sync_sched_exp_handler(void *data) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = data; + + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + resched_cpu(smp_processor_id()); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); + WARN_ON_ONCE(ret); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + int cpu; + unsigned long flags; + unsigned long mask; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + int ret; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. 
Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(mask_ofl_ipi & mask)) + continue; +retry_ipi: + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + } +} + +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ + int cpu; + unsigned long jiffies_stall; + unsigned long jiffies_start; + unsigned long mask; + int ndetected; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); + int ret; + + jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_start = jiffies; + + for (;;) { + ret = swait_event_timeout( + rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root), + jiffies_stall); + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + return; + if (ret < 0) { + /* Hit a signal, disable CPU stall warnings. 
*/ + swait_event(rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root)); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", + rsp->name); + ndetected = 0; + rcu_for_each_leaf_node(rsp, rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + struct rcu_data *rdp; + + if (!(rnp->expmask & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(rsp->rda, cpu); + pr_cont(" %d-%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + } + mask <<= 1; + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } + } + jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + } +} + +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. 
+ */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_wake_mutex); +} + +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. + */ +void synchronize_sched_expedited(void) +{ + unsigned long s; + struct rcu_state *rsp = &rcu_sched_state; + + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. 
*/ + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -- cgit v1.2.3 From 40e0a6cfd53e37d9b8863cdbc0adb1f72e9311e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Apr 2016 16:44:07 -0700 Subject: rcu: Move expedited code from tree_plugin.h to tree_exp.h People have been having some difficulty finding their way around the RCU code. This commit therefore pulls some of the expedited grace-period code from tree_plugin.h to a new tree_exp.h file. This commit is strictly code movement. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 94 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 88 --------------------------------------------- 2 files changed, 94 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index db0909cf7fe1..00a02a231ada 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -562,3 +562,97 @@ void synchronize_sched_expedited(void) rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#ifdef CONFIG_PREEMPT_RCU + +/* + * Remote handler for smp_call_function_single(). If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree. Otherwise, immediately + * report the quiescent state. + */ +static void sync_rcu_exp_handler(void *info) +{ + struct rcu_data *rdp; + struct rcu_state *rsp = info; + struct task_struct *t = current; + + /* + * Within an RCU read-side critical section, request that the next + * rcu_read_unlock() report. Unless this RCU read-side critical + * section has already blocked, in which case it is already set + * up for the expedited grace period to wait on it. 
+ */ + if (t->rcu_read_lock_nesting > 0 && + !t->rcu_read_unlock_special.b.blocked) { + t->rcu_read_unlock_special.b.exp_need_qs = true; + return; + } + + /* + * We are either exiting an RCU read-side critical section (negative + * values of t->rcu_read_lock_nesting) or are not in one at all + * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU + * read-side critical section that blocked before this expedited + * grace period started. Either way, we can immediately report + * the quiescent state. + */ + rdp = this_cpu_ptr(rsp->rda); + rcu_report_exp_rdp(rsp, rdp, true); +} + +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it. The basic + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. + */ +void synchronize_rcu_expedited(void) +{ + struct rcu_state *rsp = rcu_state_p; + unsigned long s; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. 
*/ + sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); + + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptible RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ff1cd4e1188d..695071dd1e9c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -681,84 +681,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -/* - * Remote handler for smp_call_function_single(). If there is an - * RCU read-side critical section in effect, request that the - * next rcu_read_unlock() record the quiescent state up the - * ->expmask fields in the rcu_node tree. Otherwise, immediately - * report the quiescent state. - */ -static void sync_rcu_exp_handler(void *info) -{ - struct rcu_data *rdp; - struct rcu_state *rsp = info; - struct task_struct *t = current; - - /* - * Within an RCU read-side critical section, request that the next - * rcu_read_unlock() report. Unless this RCU read-side critical - * section has already blocked, in which case it is already set - * up for the expedited grace period to wait on it. - */ - if (t->rcu_read_lock_nesting > 0 && - !t->rcu_read_unlock_special.b.blocked) { - t->rcu_read_unlock_special.b.exp_need_qs = true; - return; - } - - /* - * We are either exiting an RCU read-side critical section (negative - * values of t->rcu_read_lock_nesting) or are not in one at all - * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU - * read-side critical section that blocked before this expedited - * grace period started. 
Either way, we can immediately report - * the quiescent state. - */ - rdp = this_cpu_ptr(rsp->rda); - rcu_report_exp_rdp(rsp, rdp, true); -} - -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state. On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. - * - * Although this is a greate improvement over previous expedited - * implementations, it is still unfriendly to real-time workloads, so is - * thus not recommended for any sort of common-case code. In fact, if - * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() - * instead. - */ -void synchronize_rcu_expedited(void) -{ - struct rcu_state *rsp = rcu_state_p; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - - /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ - rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. * @@ -882,16 +804,6 @@ static void rcu_preempt_check_callbacks(void) { } -/* - * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptible RCU does not exist, map to rcu-sched. 
- */ -void synchronize_rcu_expedited(void) -{ - synchronize_sched_expedited(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). -- cgit v1.2.3 From f8cbdee99b161cb08c3fb55200941028c5fe25c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2016 13:03:18 -0700 Subject: torture: Simplify code, eliminate RCU_PERF_TEST_RUNNABLE This commit applies the infamous IS_ENABLED() macro to eliminate a #ifdef. It also eliminates the RCU_PERF_TEST_RUNNABLE Kconfig option in favor of the already-existing rcuperf.perf_runnable kernel boot parameter. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 7 +------ lib/Kconfig.debug | 16 ---------------- 2 files changed, 1 insertion(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cee0d8393ed..afd174e901c3 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -96,12 +96,7 @@ static int rcu_perf_writer_state; #define MAX_MEAS 10000 #define MIN_MEAS 100 -#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) -#define RCUPERF_RUNNABLE_INIT 1 -#else -#define RCUPERF_RUNNABLE_INIT 0 -#endif -static int perf_runnable = RCUPERF_RUNNABLE_INIT; +static int perf_runnable = IS_ENABLED(MODULE); module_param(perf_runnable, int, 0444); MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b9cfdbfae9aa..cf6ddcd8f70c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1307,22 +1307,6 @@ config RCU_PERF_TEST Say M if you want the RCU performance tests to build as a module. Say N if you are unsure. -config RCU_PERF_TEST_RUNNABLE - bool "performance tests for RCU runnable by default" - depends on RCU_PERF_TEST = y - default n - help - This option provides a way to build the RCU performance tests - directly into the kernel without them starting up at boot time. 
- You can use /sys/module to manually override this setting. - This /proc file is available only when the RCU performance - tests have been built into the kernel. - - Say Y here if you want the RCU performance tests to start during - boot (you probably don't). - Say N here if you want the RCU performance tests to start only - after being manually enabled via /sys/module. - config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL -- cgit v1.2.3 From 4e9a073f60367157fd64b65490654c39d4c44321 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2016 14:42:34 -0700 Subject: torture: Remove CONFIG_RCU_TORTURE_TEST_RUNNABLE, simplify code This commit removes CONFIG_RCU_TORTURE_TEST_RUNNABLE in favor of the already-existing rcutorture.torture_runnable kernel boot parameter. It also converts an #ifdef into IS_ENABLED(), saving a few lines of code. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 +------ kernel/rcu/tree_plugin.h | 2 -- lib/Kconfig.debug | 17 ----------------- 3 files changed, 1 insertion(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a732eb..01cb57ff106f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; +static int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ff1cd4e1188d..5b2d723e6568 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void) 
pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) - pr_info("\tRCU torture testing starts during boot.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cf6ddcd8f70c..805b7048a1bd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1324,23 +1324,6 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. -config RCU_TORTURE_TEST_RUNNABLE - bool "torture tests for RCU runnable by default" - depends on RCU_TORTURE_TEST = y - default n - help - This option provides a way to build the RCU torture tests - directly into the kernel without them starting up at boot - time. You can use /proc/sys/kernel/rcutorture_runnable - to manually override this setting. This /proc file is - available only when the RCU torture tests have been built - into the kernel. - - Say Y here if you want the RCU torture tests to start during - boot (you probably don't). - Say N here if you want the RCU torture tests to start only - after being manually enabled via /proc. - config RCU_TORTURE_TEST_SLOW_PREINIT bool "Slow down RCU grace-period pre-initialization to expose races" depends on RCU_TORTURE_TEST -- cgit v1.2.3 From d95f5ba90fa6043a366958897fdef705af968b70 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Apr 2016 17:18:41 -0700 Subject: torture: Break online and offline functions out of torture_onoff() This commit breaks torture_online() and torture_offline() out of torture_onoff() in preparation for allowing waketorture finer-grained control of its CPU-hotplug workload. Signed-off-by: Paul E. 
McKenney --- include/linux/torture.h | 4 ++ kernel/torture.c | 168 ++++++++++++++++++++++++++++++------------------ 2 files changed, 108 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 7759fc3c622d..6685a73736a2 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -50,6 +50,10 @@ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0) /* Definitions for online/offline exerciser. */ +bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, + unsigned long *sum_offl, int *min_offl, int *max_offl); +bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_onl, int *min_onl, int *max_onl); int torture_onoff_init(long ooholdoff, long oointerval); void torture_onoff_stats(void); bool torture_onoff_failures(void); diff --git a/kernel/torture.c b/kernel/torture.c index fa0bdeee17ac..fb39a06bbef5 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -81,6 +81,104 @@ static unsigned long sum_online; static int min_online = -1; static int max_online; +/* + * Attempt to take a CPU offline. Return false if the CPU is already + * offline or if it is not subject to CPU-hotplug operations. The + * caller can detect other failures by looking at the statistics.
+ */
+bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
+		     unsigned long *sum_offl, int *min_offl, int *max_offl)
+{
+	unsigned long delta;
+	int ret;
+	unsigned long starttime;
+
+	if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
+		return false;
+
+	if (verbose)
+		pr_alert("%s" TORTURE_FLAG
+			 "torture_onoff task: offlining %d\n",
+			 torture_type, cpu);
+	starttime = jiffies;
+	(*n_offl_attempts)++;	/* Counted even if cpu_down() fails. */
+	ret = cpu_down(cpu);
+	if (ret) {
+		if (verbose)
+			pr_alert("%s" TORTURE_FLAG
+				 "torture_onoff task: offline %d failed: errno %d\n",
+				 torture_type, cpu, ret);
+	} else {
+		if (verbose)
+			pr_alert("%s" TORTURE_FLAG
+				 "torture_onoff task: offlined %d\n",
+				 torture_type, cpu);
+		(*n_offl_successes)++;
+		delta = jiffies - starttime;
+		/*
+		 * Accumulate through the pointer: "sum_offl += delta" would
+		 * merely advance the local pointer and leave the caller's
+		 * running total untouched.
+		 */
+		*sum_offl += delta;
+		/* A negative minimum is treated as "no sample yet". */
+		if (*min_offl < 0) {
+			*min_offl = delta;
+			*max_offl = delta;
+		}
+		if (*min_offl > delta)
+			*min_offl = delta;
+		if (*max_offl < delta)
+			*max_offl = delta;
+	}
+
+	/* An attempt was made; failures show up only in the statistics. */
+	return true;
+}
+EXPORT_SYMBOL_GPL(torture_offline);
+
+/*
+ * Attempt to bring a CPU online. Return false if the CPU is already
+ * online or if it is not subject to CPU-hotplug operations. The
+ * caller can detect other failures by looking at the statistics.
+ */
+bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
+		    unsigned long *sum_onl, int *min_onl, int *max_onl)
+{
+	unsigned long delta;
+	int ret;
+	unsigned long starttime;
+
+	if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
+		return false;
+
+	if (verbose)
+		pr_alert("%s" TORTURE_FLAG
+			 "torture_onoff task: onlining %d\n",
+			 torture_type, cpu);
+	starttime = jiffies;	/* Start of the latency measurement. */
+	(*n_onl_attempts)++;	/* Counted even if cpu_up() fails. */
+	ret = cpu_up(cpu);
+	if (ret) {	/* Failure: log only, latency stats untouched. */
+		if (verbose)
+			pr_alert("%s" TORTURE_FLAG
+				 "torture_onoff task: online %d failed: errno %d\n",
+				 torture_type, cpu, ret);
+	} else {
+		if (verbose)
+			pr_alert("%s" TORTURE_FLAG
+				 "torture_onoff task: onlined %d\n",
+				 torture_type, cpu);
+		(*n_onl_successes)++;
+		delta = jiffies - starttime;	/* Elapsed jiffies for this cpu_up(). */
+		*sum_onl += delta;	/* Caller's running total of online latencies. */
+		if (*min_onl < 0) {	/* Negative min means no sample yet (min_online starts at -1). */
+			*min_onl = delta;
+			*max_onl = delta;
+		}
+		if (*min_onl > delta)
+			*min_onl = delta;
+		if (*max_onl < delta)
+			*max_onl = delta;
+	}
+
+	return true;	/* An attempt was made, even if cpu_up() failed. */
+}
+EXPORT_SYMBOL_GPL(torture_online);
+
 /*
  * Execute random CPU-hotplug operations at the interval specified
  * by the onoff_interval.
@@ -89,11 +187,8 @@ static int torture_onoff(void *arg) { int cpu; - unsigned long delta; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); - int ret; - unsigned long starttime; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) @@ -106,67 +201,12 @@ torture_onoff(void *arg) } while (!torture_must_stop()) { cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } + if (!torture_offline(cpu, + &n_offline_attempts, &n_offline_successes, + &sum_offline, &min_offline, &max_offline)) + torture_online(cpu, + 
&n_online_attempts, &n_online_successes, + &sum_online, &min_online, &max_online); schedule_timeout_interruptible(onoff_interval); } torture_kthread_stopping("torture_onoff"); -- cgit v1.2.3 From 750db0f5f7d0ff6b86158015f02c275702639b20 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Mon, 2 May 2016 10:30:00 +0800 Subject: torture: Stop onoff task if there is only one cpu If the whole system has only one cpu, that cpu won't be able to be offlined, so there is no need for the onoff task to keep running. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/torture.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index fb39a06bbef5..75961b3decfe 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -194,6 +194,12 @@ torture_onoff(void *arg) for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); + + if (maxcpu == 0) { + VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); + goto stop; + } + if (onoff_holdoff > 0) { VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); schedule_timeout_interruptible(onoff_holdoff); @@ -209,6 +215,8 @@ torture_onoff(void *arg) &sum_online, &min_online, &max_online); schedule_timeout_interruptible(onoff_interval); } + +stop: torture_kthread_stopping("torture_onoff"); return 0; } -- cgit v1.2.3 From af06d4f74a7d2132c805339bfd5ab771b5706f42 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 25 May 2016 09:25:33 +0800 Subject: rcuperf: Don't treat gp_exp mis-setting as a WARN 0day found a boot warning triggered in rcu_perf_writer() on !SMP kernel: WARN_ON(rcu_gp_is_normal() && gp_exp); , the root cause of which is trying to measure expedited grace periods (by setting gp_exp to true by default) when all the grace periods are normal (TINY RCU only has normal grace periods).
However, such a mis-setting would only result in failing to measure the performance for a specific kind of grace period, therefore using a WARN_ON to check this is overkill. We could handle this inside the rcuperf module via some error messages to tell users about the mis-settings. Therefore this patch removes the WARN_ON in rcu_perf_writer() and handles those checks in rcu_perf_init() with plain if() code. Moreover, this patch changes the default value of gp_exp to false in order to 1) align with rcutorture tests and 2) make the default setting work for all RCU implementations. Suggested-by: Paul E. McKenney Signed-off-by: Boqun Feng Fixes: http://lkml.kernel.org/r/57411b10.mFvG0+AgcrMXGtcj%fengguang.wu@intel.com Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index afd174e901c3..7b2dbdffd791 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney "); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!!
%s\n", perf_type, s); } while (0) -torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); @@ -358,8 +358,6 @@ rcu_perf_writer(void *arg) u64 *wdpp = writer_durations[me]; VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); - WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); - WARN_ON(rcu_gp_is_normal() && gp_exp); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sp.sched_priority = 1; @@ -626,6 +624,16 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } + if (rcu_gp_is_normal() && gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } for (i = 0; i < nrealwriters; i++) { writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), -- cgit v1.2.3 From 05dbbfe753792dcebb6b85d84fa5926f09723cfe Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 13 Jun 2016 15:20:39 +0000 Subject: rcutorture: Fix error return code in rcu_perf_init() Fix to return a negative error code -ENOMEM from kcalloc() error handling case instead of 0, as done elsewhere in this function. Signed-off-by: Wei Yongjun Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcuperf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 7b2dbdffd791..d38ab08a3fe7 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -638,8 +638,10 @@ rcu_perf_init(void) writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), GFP_KERNEL); - if (!writer_durations[i]) + if (!writer_durations[i]) { + firsterr = -ENOMEM; goto unwind; + } firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, writer_tasks[i]); if (firsterr) -- cgit v1.2.3 From 4929c913bda505dbe44bb42c00da06011fee6c9d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 May 2016 11:58:56 -0700 Subject: rcu: Make call_rcu_tasks() tolerate first call with irqs disabled Currently, if the very first call to call_rcu_tasks() has irqs disabled, it will create the rcu_tasks_kthread with irqs disabled, which will result in a splat in the memory allocator, which kthread_run() invokes with the expectation that irqs are enabled. This commit fixes this problem by deferring kthread creation if called with irqs disabled. The first call to call_rcu_tasks() that has irqs enabled will create the kthread. This bug was detected by rcutorture changes that were motivated by Iftekhar Ahmed's mutation-testing efforts. Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 1 + kernel/rcu/update.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a8af79738a0e..3bc5de08c0b7 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -45,6 +45,7 @@ #include #include #include +#include #include diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3e888cd5a594..f0d8322bc3ec 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); +static struct task_struct *rcu_tasks_kthread_ptr; /* * Post an RCU-tasks callback. First call must be from process context @@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; + bool havetask = READ_ONCE(rcu_tasks_kthread_ptr); rhp->next = NULL; rhp->func = func; @@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) *rcu_tasks_cbs_tail = rhp; rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - if (needwake) { + /* We can't create the thread unless interrupts are enabled. */ + if ((needwake && havetask) || + (!havetask && !irqs_disabled_flags(flags))) { rcu_spawn_tasks_kthread(); wake_up(&rcu_tasks_cbs_wq); } @@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg) static void rcu_spawn_tasks_kthread(void) { static DEFINE_MUTEX(rcu_tasks_kthread_mutex); - static struct task_struct *rcu_tasks_kthread_ptr; struct task_struct *t; if (READ_ONCE(rcu_tasks_kthread_ptr)) { -- cgit v1.2.3 From aab057382cb9b16249552684c1ebd270f070ec02 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 May 2016 12:20:51 -0700 Subject: rcu: Fix a typo in a comment In the area in hot pursuit of a bug, so might as well clean it up. 
Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a732eb..60bd533902f9 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1476,7 +1476,7 @@ static int rcu_torture_barrier_cbs(void *arg) break; /* * The above smp_load_acquire() ensures barrier_phase load - * is ordered before the folloiwng ->call(). + * is ordered before the following ->call(). */ local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); -- cgit v1.2.3 From 088e9d253d3a4ab7e058dd84bb532c32dadf1882 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 2 Jun 2016 13:51:41 -0300 Subject: rcu: sysctl: Panic on RCU Stall It is not always easy to determine the cause of an RCU stall just by analysing the RCU stall messages, mainly when the problem is caused by the indirect starvation of rcu threads. For example, when preempt_rcu is not awakened due to the starvation of a timer softirq. We have been hard coding panic() in the RCU stall functions for some time while testing the kernel-rt. But this is not possible in some scenarios, like when supporting customers. This patch implements the sysctl kernel.panic_on_rcu_stall. If set to 1, the system will panic() when an RCU stall takes place, enabling the capture of a vmcore. The vmcore provides a way to analyze all kernel/tasks states, helping out to point to the culprit and the solution for the stall. The kernel.panic_on_rcu_stall sysctl is disabled by default. Changes from v1: - Fixed a typo in the git log - The if(sysctl_panic_on_rcu_stall) panic() is in a static function - Fixed the CONFIG_TINY_RCU compilation issue - The var sysctl_panic_on_rcu_stall is now __read_mostly Cc: Jonathan Corbet Cc: "Paul E. 
McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Acked-by: Christian Borntraeger Reviewed-by: Josh Triplett Reviewed-by: Arnaldo Carvalho de Melo Tested-by: "Luis Claudio R. Goncalves" Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Paul E. McKenney --- Documentation/sysctl/kernel.txt | 12 ++++++++++++ include/linux/kernel.h | 1 + kernel/rcu/tree.c | 12 ++++++++++++ kernel/sysctl.c | 11 +++++++++++ 4 files changed, 36 insertions(+) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index a3683ce2a2f3..33204604de6c 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -58,6 +58,7 @@ show up in /proc/sys/kernel: - panic_on_stackoverflow - panic_on_unrecovered_nmi - panic_on_warn +- panic_on_rcu_stall - perf_cpu_time_max_percent - perf_event_paranoid - perf_event_max_stack @@ -618,6 +619,17 @@ a kernel rebuild when attempting to kdump at the location of a WARN(). ============================================================== +panic_on_rcu_stall: + +When set to 1, calls panic() after RCU stall detection messages. This +is useful to define the root cause of RCU stalls using a vmcore. + +0: do not panic() when RCU stall takes place, default behavior. + +1: panic() after printing RCU stall messages. 
+ +============================================================== + perf_cpu_time_max_percent: Hints to the kernel how much CPU time it should be allowed to diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 94aa10ffe156..c42082112ec8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -451,6 +451,7 @@ extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; +extern int sysctl_panic_on_rcu_stall; extern int sysctl_panic_on_stackoverflow; extern bool crash_kexec_post_notifiers; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c844b6142a86..e5ca15a461b9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -125,6 +125,8 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ +/* panic() on RCU Stall sysctl. */ +int sysctl_panic_on_rcu_stall __read_mostly; /* * The rcu_scheduler_active variable transitions from zero to one just @@ -1312,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) } } +static inline void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1391,6 +1399,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rcu_check_gp_kthread_starvation(rsp); + panic_on_rcu_stall(); + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1431,6 +1441,8 @@ static void print_cpu_stall(struct rcu_state *rsp) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + panic_on_rcu_stall(); + /* * Attempt to revive the RCU machinery by forcing a context switch. 
* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87b2fc38398b..35f0dcb1cb4f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1204,6 +1204,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, .extra2 = &one, }, +#endif +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; -- cgit v1.2.3 From bc75e99983df1efd977a5cd468893d55d52b8d70 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 3 Jun 2016 15:20:04 +0100 Subject: rcu: Correctly handle sparse possible cpus In many cases in the RCU tree code, we iterate over the set of cpus for a leaf node described by rcu_node::grplo and rcu_node::grphi, checking per-cpu data for each cpu in this range. However, if the set of possible cpus is sparse, some cpus described in this range are not possible, and thus no per-cpu region will have been allocated (or initialised) for them by the generic percpu code. Erroneous accesses to a per-cpu area for these !possible cpus may fault or may hit other data depending on the address generated when the erroneous per cpu offset is applied. In practice, both cases have been observed on arm64 hardware (the former being silent, but detectable with additional patches). To avoid issues resulting from this, we must iterate over the set of *possible* cpus for a given leaf node. This patch adds a new helper, for_each_leaf_node_possible_cpu, to enable this. As iteration is often intertwined with rcu_node local bitmask manipulation, a new leaf_node_cpu_bit helper is added to make this simpler and more consistent. The RCU tree code is made to use both of these where appropriate.
Without this patch, running reboot at a shell can result in an oops like: [ 3369.075979] Unable to handle kernel paging request at virtual address ffffff8008b21b4c [ 3369.083881] pgd = ffffffc3ecdda000 [ 3369.087270] [ffffff8008b21b4c] *pgd=00000083eca48003, *pud=00000083eca48003, *pmd=0000000000000000 [ 3369.096222] Internal error: Oops: 96000007 [#1] PREEMPT SMP [ 3369.101781] Modules linked in: [ 3369.104825] CPU: 2 PID: 1817 Comm: NetworkManager Tainted: G W 4.6.0+ #3 [ 3369.121239] task: ffffffc0fa13e000 ti: ffffffc3eb940000 task.ti: ffffffc3eb940000 [ 3369.128708] PC is at sync_rcu_exp_select_cpus+0x188/0x510 [ 3369.134094] LR is at sync_rcu_exp_select_cpus+0x104/0x510 [ 3369.139479] pc : [] lr : [] pstate: 200001c5 [ 3369.146860] sp : ffffffc3eb9435a0 [ 3369.150162] x29: ffffffc3eb9435a0 x28: ffffff8008be4f88 [ 3369.155465] x27: ffffff8008b66c80 x26: ffffffc3eceb2600 [ 3369.160767] x25: 0000000000000001 x24: ffffff8008be4f88 [ 3369.166070] x23: ffffff8008b51c3c x22: ffffff8008b66c80 [ 3369.171371] x21: 0000000000000001 x20: ffffff8008b21b40 [ 3369.176673] x19: ffffff8008b66c80 x18: 0000000000000000 [ 3369.181975] x17: 0000007fa951a010 x16: ffffff80086a30f0 [ 3369.187278] x15: 0000007fa9505590 x14: 0000000000000000 [ 3369.192580] x13: ffffff8008b51000 x12: ffffffc3eb940000 [ 3369.197882] x11: 0000000000000006 x10: ffffff8008b51b78 [ 3369.203184] x9 : 0000000000000001 x8 : ffffff8008be4000 [ 3369.208486] x7 : ffffff8008b21b40 x6 : 0000000000001003 [ 3369.213788] x5 : 0000000000000000 x4 : ffffff8008b27280 [ 3369.219090] x3 : ffffff8008b21b4c x2 : 0000000000000001 [ 3369.224406] x1 : 0000000000000001 x0 : 0000000000000140 ... 
[ 3369.972257] [] sync_rcu_exp_select_cpus+0x188/0x510 [ 3369.978685] [] synchronize_rcu_expedited+0x64/0xa8 [ 3369.985026] [] synchronize_net+0x24/0x30 [ 3369.990499] [] dev_deactivate_many+0x28c/0x298 [ 3369.996493] [] __dev_close_many+0x60/0xd0 [ 3370.002052] [] __dev_close+0x28/0x40 [ 3370.007178] [] __dev_change_flags+0x8c/0x158 [ 3370.012999] [] dev_change_flags+0x20/0x60 [ 3370.018558] [] do_setlink+0x288/0x918 [ 3370.023771] [] rtnl_newlink+0x398/0x6a8 [ 3370.029158] [] rtnetlink_rcv_msg+0xe4/0x220 [ 3370.034891] [] netlink_rcv_skb+0xc4/0xf8 [ 3370.040364] [] rtnetlink_rcv+0x2c/0x40 [ 3370.045663] [] netlink_unicast+0x160/0x238 [ 3370.051309] [] netlink_sendmsg+0x2f0/0x358 [ 3370.056956] [] sock_sendmsg+0x18/0x30 [ 3370.062168] [] ___sys_sendmsg+0x26c/0x280 [ 3370.067728] [] __sys_sendmsg+0x44/0x88 [ 3370.073027] [] SyS_sendmsg+0x10/0x20 [ 3370.078153] [] el0_svc_naked+0x24/0x28 Signed-off-by: Mark Rutland Reported-by: Dennis Chen Cc: Catalin Marinas Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: Steve Capper Cc: Steven Rostedt Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 21 +++++++++------------ kernel/rcu/tree.h | 15 +++++++++++++++ kernel/rcu/tree_exp.h | 16 +++++++--------- kernel/rcu/tree_plugin.h | 5 +++-- 4 files changed, 34 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e5ca15a461b9..f433959e9322 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1287,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) - dump_cpu_task(rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -1360,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, - rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(rsp, cpu); ndetected++; } } @@ -2884,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp, unsigned long *maxj), bool *isidle, unsigned long *maxj) { - unsigned long bit; int cpu; unsigned long flags; unsigned long mask; @@ -2919,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp, continue; } } - cpu = rnp->grplo; - bit = 1; - for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; @@ -3750,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up 
local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); - rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e3959f5e6ddf..f714f873bf9d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -253,6 +253,13 @@ struct rcu_node { wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; +/* + * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and + * are indexed relative to this interval rather than the global CPU ID space. + * This generates the bit for a CPU in node-local masks. + */ +#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) + /* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. @@ -280,6 +287,14 @@ struct rcu_node { for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) +/* + * Iterate over all possible CPUs in a leaf RCU node. + */ +#define for_each_leaf_node_possible_cpu(rnp, cpu) \ + for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ + cpu <= rnp->grphi; \ + cpu = cpumask_next((cpu), cpu_possible_mask)) + /* * Union to allow "aggregate OR" operation on the need for a quiescent * state by the normal and expedited grace periods. 
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 00a02a231ada..d400434af6b2 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -344,7 +344,6 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, { int cpu; unsigned long flags; - unsigned long mask; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; @@ -356,7 +355,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); @@ -376,8 +375,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); if (!(mask_ofl_ipi & mask)) continue; retry_ipi: @@ -440,10 +439,10 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { ndetected += rcu_print_task_exp_stall(rnp); - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp; + mask = leaf_node_cpu_bit(rnp, cpu); if (!(rnp->expmask & mask)) continue; ndetected++; @@ -453,7 +452,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } - mask <<= 1; } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rsp->expedited_sequence, @@ -473,8 +471,8 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_cont("\n"); } rcu_for_each_leaf_node(rsp, rnp) { - mask = 
1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); if (!(rnp->expmask & mask)) continue; dump_cpu_task(cpu); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 695071dd1e9c..534c590e8852 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1166,8 +1166,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) - if ((mask & 0x1) && cpu != outgoingcpu) + for_each_leaf_node_possible_cpu(rnp, cpu) + if ((mask & leaf_node_cpu_bit(rnp, cpu)) && + cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); if (cpumask_weight(cm) == 0) cpumask_setall(cm); -- cgit v1.2.3 From e37837fb62f95a81bdcefa86ceea043df84937d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Apr 2016 01:01:27 +0200 Subject: locking/atomic: Remove the deprecated atomic_{set,clear}_mask() functions These functions have been deprecated for a while and there is only the one user left, convert and kill. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/atomic.h | 10 ---------- kernel/locking/qspinlock_paravirt.h | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 0b3802d33125..12d910d61b83 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -480,16 +480,6 @@ static inline int atomic_fetch_andnot_release(int i, atomic_t *v) } #endif -static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v) -{ - atomic_andnot(mask, v); -} - -static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v) -{ - atomic_or(mask, v); -} - /** * atomic_inc_not_zero_hint - increment if not null * @v: pointer of type atomic_t diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 21ede57f68b3..37649e69056c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) { - atomic_set_mask(_Q_PENDING_VAL, &lock->val); + atomic_or(_Q_PENDING_VAL, &lock->val); } static __always_inline void clear_pending(struct qspinlock *lock) { - atomic_clear_mask(_Q_PENDING_VAL, &lock->val); + atomic_andnot(_Q_PENDING_VAL, &lock->val); } static __always_inline int trylock_clear_pending(struct qspinlock *lock) -- cgit v1.2.3 From f9852b74bec0117b888da39d070c323ea1cb7f4c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Apr 2016 01:27:03 +0200 Subject: locking/atomic, arch/qrwlock: Employ atomic_fetch_add_acquire() The only reason for the current code is to make GCC emit only the "LOCK XADD" instruction on x86 (and not do a pointless extra ADD on the result), do so nicer. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qrwlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fec082338668..19248ddf37ce 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. */ - cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); rspin_until_writer_unlock(lock, cnts); /* -- cgit v1.2.3 From 86a3b5f34fc1fb307abef4fde76bebd3edce0324 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 May 2016 12:42:21 +0200 Subject: locking/atomic, arch/rwsem: Employ atomic_long_fetch_add() Now that we have fetch_add() we can stop using add_return() - val. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2031281bb940..447e08de1fab 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -153,7 +153,7 @@ __rwsem_mark_wake(struct rw_semaphore *sem, if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: - oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment; + oldcount = atomic_long_fetch_add(adjustment, &sem->count); if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { /* -- cgit v1.2.3 From d945b5e9f0e35cb56a3783d849b5f0f37da0a7f1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 14:38:42 +0200 Subject: workqueue: Fix setting affinity of unbound worker threads With commit e9d867a67fd03ccc ("sched: Allow per-cpu kernel threads to run on online && !active"), __set_cpus_allowed_ptr() expects that only strict per-cpu kernel threads can have affinity to an online CPU which is not yet active. This assumption is currently broken in the CPU_ONLINE notification handler for the workqueues where restore_unbound_workers_cpumask() calls set_cpus_allowed_ptr() when the first cpu in the unbound worker's pool->attr->cpumask comes online. Since set_cpus_allowed_ptr() is called with pool->attr->cpumask in which only one CPU is online which is not yet active, we get the following WARN_ON during an CPU online operation. 
------------[ cut here ]------------ WARNING: CPU: 40 PID: 248 at kernel/sched/core.c:1166 __set_cpus_allowed_ptr+0x228/0x2e0 Modules linked in: CPU: 40 PID: 248 Comm: cpuhp/40 Not tainted 4.6.0-autotest+ #4 <..snip..> Call Trace: [c000000f273ff920] [c00000000010493c] __set_cpus_allowed_ptr+0x2cc/0x2e0 (unreliable) [c000000f273ffac0] [c0000000000ed4b0] workqueue_cpu_up_callback+0x2c0/0x470 [c000000f273ffb70] [c0000000000f5c58] notifier_call_chain+0x98/0x100 [c000000f273ffbc0] [c0000000000c5ed0] __cpu_notify+0x70/0xe0 [c000000f273ffc00] [c0000000000c6028] notify_online+0x38/0x50 [c000000f273ffc30] [c0000000000c5214] cpuhp_invoke_callback+0x84/0x250 [c000000f273ffc90] [c0000000000c562c] cpuhp_up_callbacks+0x5c/0x120 [c000000f273ffce0] [c0000000000c64d4] cpuhp_thread_fun+0x184/0x1c0 [c000000f273ffd20] [c0000000000fa050] smpboot_thread_fn+0x290/0x2a0 [c000000f273ffd80] [c0000000000f45b0] kthread+0x110/0x130 [c000000f273ffe30] [c000000000009570] ret_from_kernel_thread+0x5c/0x6c ---[ end trace 00f1456578b2a3b2 ]--- This patch fixes this by limiting the mask to the intersection of the pool affinity and online CPUs. Changelog-cribbed-from: Gautham R. Shenoy Reported-by: Abdul Haleem Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e1c0e996b5ae..97e7b793df35 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4600,15 +4600,11 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) return; - /* is @cpu the only online CPU? 
*/ cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); - if (cpumask_weight(&cpumask) != 1) - return; /* as we're called from CPU_ONLINE, the following shouldn't fail */ for_each_pool_worker(worker, pool) - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, - pool->attrs->cpumask) < 0); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); } /* -- cgit v1.2.3 From edd14cfebc4404698544d407ecf8eda6e19aa19e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 17 Jun 2016 16:00:20 -0600 Subject: genirq: Add untracked irq handler This adds a software irq handler for controllers that multiplex interrupts from multiple devices, but don't know which device generated the interrupt. For these devices, the irq handler that demuxes must check every action for every software irq using the same h/w irq in order to find out which device generated the interrupt. This will inevitably trigger spurious interrupt detection if we are noting the irq. The new irq handler does not track the handling for spurious interrupt detection. An irq that uses this also won't get stats tracked since it didn't generate the interrupt, nor added to randomness since they are not random. 
Signed-off-by: Keith Busch Cc: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Cc: Jon Derrick Link: http://lkml.kernel.org/r/1466200821-29159-1-git-send-email-keith.busch@intel.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 + kernel/irq/chip.c | 43 +++++++++++++++++++++++++++++++++++++++++++ kernel/irq/handle.c | 18 ++++++++++++++---- kernel/irq/internals.h | 1 + 4 files changed, 59 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 6c92a847394d..562cef010aa8 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -484,6 +484,7 @@ extern void handle_fasteoi_irq(struct irq_desc *desc); extern void handle_edge_irq(struct irq_desc *desc); extern void handle_edge_eoi_irq(struct irq_desc *desc); extern void handle_simple_irq(struct irq_desc *desc); +extern void handle_untracked_irq(struct irq_desc *desc); extern void handle_percpu_irq(struct irq_desc *desc); extern void handle_percpu_devid_irq(struct irq_desc *desc); extern void handle_bad_irq(struct irq_desc *desc); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ad8131473774..b4c1bc7c9ca2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -426,6 +426,49 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_simple_irq); +/** + * handle_untracked_irq - Simple and software-decoded IRQs. + * @desc: the interrupt description structure for this irq + * + * Untracked interrupts are sent from a demultiplexing interrupt + * handler when the demultiplexer does not know which device it its + * multiplexed irq domain generated the interrupt. IRQ's handled + * through here are not subjected to stats tracking, randomness, or + * spurious interrupt detection. + * + * Note: Like handle_simple_irq, the caller is expected to handle + * the ack, clear, mask and unmask issues if necessary. 
+ */ +void handle_untracked_irq(struct irq_desc *desc) +{ + unsigned int flags = 0; + + raw_spin_lock(&desc->lock); + + if (!irq_may_run(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + goto out_unlock; + } + + desc->istate &= ~IRQS_PENDING; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); + raw_spin_unlock(&desc->lock); + + __handle_irq_event_percpu(desc, &flags); + + raw_spin_lock(&desc->lock); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + +out_unlock: + raw_spin_unlock(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_untracked_irq); + /* * Called unconditionally from handle_level_irq() and only for oneshot * interrupts from handle_fasteoi_irq() diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a15b5485b446..d3f24905852c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } -irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags) { irqreturn_t retval = IRQ_NONE; - unsigned int flags = 0, irq = desc->irq_data.irq; + unsigned int irq = desc->irq_data.irq; struct irqaction *action; for_each_action_of_desc(desc, action) { @@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) /* Fall through to add to randomness */ case IRQ_HANDLED: - flags |= action->flags; + *flags |= action->flags; break; default: @@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) retval |= res; } - add_interrupt_randomness(irq, flags); + return retval; +} + +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) +{ + irqreturn_t retval; + unsigned int flags = 0; + + retval = __handle_irq_event_percpu(desc, &flags); + + add_interrupt_randomness(desc->irq_data.irq, flags); if 
(!noirqdebug) note_interrupt(desc, retval); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index d5edcdc9382a..0c6f35ba9cc0 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -84,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); -- cgit v1.2.3 From 0fb71d340d355156818bb53eb36ae79a3f88bda9 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Mon, 25 Apr 2016 17:20:28 +0800 Subject: clocksource: Make clocksource insert entry more efficient In clocksource_enqueue(), it is unnecessary to continue looping over the list once we find an entry whose rating is smaller than that of the new one. It is safe to break out of the loop, because all entries are inserted in descending order of rating. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Minfei Huang Signed-off-by: John Stultz --- kernel/time/clocksource.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 56ece145a814..6a5a310a1a53 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -669,10 +669,12 @@ static void clocksource_enqueue(struct clocksource *cs) struct list_head *entry = &clocksource_list; struct clocksource *tmp; - list_for_each_entry(tmp, &clocksource_list, list) + list_for_each_entry(tmp, &clocksource_list, list) { /* Keep track of the place, where to insert */ - if (tmp->rating >= cs->rating) - entry = &tmp->list; + if (tmp->rating < cs->rating) + break; + entry = &tmp->list; + } list_add(&cs->list, entry); } -- cgit v1.2.3 From 0209b937569a133dedfe930cdfff3a0d1d68c9e9 Mon Sep 17 00:00:00 2001 From: Thomas Graziadei Date: Tue, 31 May 2016 15:06:06 +0200
Subject: timekeeping: Fix 1ns/tick drift with GENERIC_TIME_VSYSCALL_OLD The user notices the problem in a raw and real time drift, calling clock_gettime with CLOCK_REALTIME / CLOCK_MONOTONIC_RAW on a system with no ntp correction taking place (no ntpd or ptp stuff running). The problem is that old_vsyscall_fixup adds an extra 1ns even though xtime_nsec is already held in full nsecs and the remainder in this case is 0. Do the rounding up business only if needed. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Thomas Graziadei Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25cd3d4f..a196e08324e7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -480,10 +480,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) * users are removed, this can be killed. */ remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); - tk->tkr_mono.xtime_nsec -= remainder; - tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; - tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; + if (remainder != 0) { + tk->tkr_mono.xtime_nsec -= remainder; + tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; + } } #else #define old_vsyscall_fixup(tk) -- cgit v1.2.3 From af4afb40085f04f8b2ea8d28df878f7f0be02f89 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Tue, 14 Jun 2016 11:00:42 +0200 Subject: alarmtimer: Fix comments describing structure fields Updated struct alarm and struct alarm_timer descriptions.
Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Pratyush Patel Signed-off-by: John Stultz --- include/linux/alarmtimer.h | 6 +++--- kernel/time/alarmtimer.c | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 52f3b7da4f2d..9d8031257a90 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -26,10 +26,10 @@ enum alarmtimer_restart { * struct alarm - Alarm timer structure * @node: timerqueue node for adding to the event list this value * also includes the expiration time. - * @period: Period for recuring alarms + * @timer: hrtimer used to schedule events while running * @function: Function pointer to be executed when the timer fires. - * @type: Alarm type (BOOTTIME/REALTIME) - * @enabled: Flag that represents if the alarm is set to fire or not + * @type: Alarm type (BOOTTIME/REALTIME). + * @state: Flag that represents if the alarm is set to fire or not. * @data: Internal data value. */ struct alarm { diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e840ed867a5d..c3aad685bbc0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -30,7 +30,6 @@ * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base * @timerqueue: Timerqueue head managing the list of events - * @timer: hrtimer used to schedule events while running * @gettime: Function to read the time correlating to the base * @base_clockid: clockid for the base */ -- cgit v1.2.3 From e6c2682a1da36a2e79d9bab470412374434ce89e Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 8 Jun 2016 22:04:59 -0700 Subject: time: Add time64_to_tm() time_to_tm() takes time_t as an argument. time_t is not y2038 safe. Add time64_to_tm() that takes time64_t as an argument which is y2038 safe. The plan is to eventually replace all calls to time_to_tm() by time64_to_tm(). 
Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Deepa Dinamani Signed-off-by: John Stultz --- include/linux/time.h | 15 ++++++++++++++- kernel/time/timeconv.c | 11 ++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index 297f09f23896..4cea09d94208 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -205,7 +205,20 @@ struct tm { int tm_yday; }; -void time_to_tm(time_t totalsecs, int offset, struct tm *result); +void time64_to_tm(time64_t totalsecs, int offset, struct tm *result); + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + time64_to_tm(totalsecs, offset, result); +} /** * timespec_to_ns - Convert timespec to nanoseconds diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 86628e755f38..7142580ad94f 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c @@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = { #define SECS_PER_DAY (SECS_PER_HOUR * 24) /** - * time_to_tm - converts the calendar time to local broken-down time + * time64_to_tm - converts the calendar time to local broken-down time * * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). * @offset offset seconds adding to totalsecs. 
* @result pointer to struct tm variable to receive broken-down time */ -void time_to_tm(time_t totalsecs, int offset, struct tm *result) +void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { long days, rem, y; + int remainder; const unsigned short *ip; - days = totalsecs / SECS_PER_DAY; - rem = totalsecs % SECS_PER_DAY; + days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); + rem = remainder; rem += offset; while (rem < 0) { rem += SECS_PER_DAY; @@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result) result->tm_mon = y; result->tm_mday = days + 1; } -EXPORT_SYMBOL(time_to_tm); +EXPORT_SYMBOL(time64_to_tm); -- cgit v1.2.3 From 4a19bd3d22d51a0c89db10879dacaffa0f52aecf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 18:03:02 +0200 Subject: time: Avoid timespec in udelay_test udelay_test_single() uses ktime_get_ts() to get two timespec values and calculate the difference between them, while udelay_test_show() uses the same to printk() the current monotonic time. Both of these are y2038 safe on all machines, but we want to get rid of struct timespec anyway, so this converts the code to use ktime_get_ns() and ktime_get_ts64() respectively. 
Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/test_udelay.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index e622ba365a13..b0928ab3270f 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) int allowed_error_ns = usecs * 5; for (i = 0; i < iters; ++i) { - struct timespec ts1, ts2; + s64 kt1, kt2; int time_passed; - ktime_get_ts(&ts1); + kt1 = ktime_get_ns(); udelay(usecs); - ktime_get_ts(&ts2); - time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); + kt2 = ktime_get_ns(); + time_passed = kt2 - kt1; if (i == 0 || time_passed < min) min = time_passed; @@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v) if (usecs > 0 && iters > 0) { return udelay_test_single(s, usecs, iters); } else if (usecs == 0) { - struct timespec ts; + struct timespec64 ts; - ktime_get_ts(&ts); - seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", - loops_per_jiffy, ts.tv_sec, ts.tv_nsec); + ktime_get_ts64(&ts); + seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n", + loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec); seq_puts(s, "usage:\n"); seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); -- cgit v1.2.3 From 7c71feb0a6766c7c3a262e3cc33ae231f3953cb6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 17:30:47 +0200 Subject: timer: Avoid using timespec The tstats_show() function prints a ktime_t variable by converting it to struct timespec first. The algorithm is ok, but we want to stop using timespec in general because of the 32-bit time_t overflow problem. This changes the code to use struct timespec64, without any functional change. 
Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/timer_stats.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1adecb4b87c8..087204c733eb 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr) static int tstats_show(struct seq_file *m, void *v) { - struct timespec period; + struct timespec64 period; struct entry *entry; unsigned long ms; long events = 0; @@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v) time = ktime_sub(time_stop, time_start); - period = ktime_to_timespec(time); + period = ktime_to_timespec64(time); ms = period.tv_nsec / 1000000; seq_puts(m, "Timer Stats Version: v0.3\n"); - seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); + seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms); if (atomic_read(&overflow_count)) seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); -- cgit v1.2.3 From d16dcd3d18759eb955e0325572d07457f93494f5 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 21 Jun 2016 10:23:22 +0100 Subject: irqdomain: Fix disposal of mappings for interrupt hierarchies The function irq_create_of_mapping() is used to create an interrupt mapping. However, depending on whether the irqdomain, to which the interrupt belongs, is part of a hierarchy, determines whether the mapping is created via calling irq_domain_alloc_irqs() or irq_create_mapping(). To dispose of the interrupt mapping, drivers call irq_dispose_mapping(). 
However, this function does not check to see if the irqdomain is part of a hierarchy or not and simply assumes that it was mapped via calling irq_create_mapping() so calls irq_domain_disassociate() to unmap the interrupt. Fix this by checking to see if the irqdomain is part of a hierarchy and if so call irq_domain_free_irqs() to free/unmap the interrupt. Signed-off-by: Jon Hunter Cc: Marc Zyngier Cc: Jiang Liu Link: http://lkml.kernel.org/r/1466501002-16368-1-git-send-email-jonathanh@nvidia.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index caa6a63d26f0..5d89d724a02a 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -680,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq) if (WARN_ON(domain == NULL)) return; - irq_domain_disassociate(domain, virq); - irq_free_desc(virq); + if (irq_domain_is_hierarchy(domain)) { + irq_domain_free_irqs(virq, 1); + } else { + irq_domain_disassociate(domain, virq); + irq_free_desc(virq); + } } EXPORT_SYMBOL_GPL(irq_dispose_mapping); -- cgit v1.2.3 From 65fe935dd2387a4faf15314c73f5e6d31ef0217e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 13 Jun 2016 15:10:02 -0700 Subject: x86/KASLR, x86/power: Remove x86 hibernation restrictions With the following fix: 70595b479ce1 ("x86/power/64: Fix crash whan the hibernation code passes control to the image kernel") ... there is no longer a problem with hibernation resuming a KASLR-booted kernel image, so remove the restriction. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jonathan Corbet Cc: Len Brown Cc: Linus Torvalds Cc: Linux PM list Cc: Logan Gunthorpe Cc: Pavel Machek Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Yinghai Lu Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20160613221002.GA29719@www.outflux.net Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 10 ++++------ arch/x86/boot/compressed/kaslr.c | 7 ------- kernel/power/hibernate.c | 6 ------ 3 files changed, 4 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 82b42c958d1c..fa8c6d470ad2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1803,12 +1803,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. js= [HW,JOY] Analog joystick See Documentation/input/joystick.txt. - kaslr/nokaslr [X86] - Enable/disable kernel and module base offset ASLR - (Address Space Layout Randomization) if built into - the kernel. When CONFIG_HIBERNATION is selected, - kASLR is disabled by default. When kASLR is enabled, - hibernation will be disabled. + nokaslr [KNL] + When CONFIG_RANDOMIZE_BASE is set, this disables + kernel and module base offset ASLR (Address Space + Layout Randomization). 
keepinitrd [HW,ARM] diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index cfeb0259ed81..dff42177cb0c 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -471,17 +471,10 @@ unsigned char *choose_random_location(unsigned long input, unsigned long choice = output; unsigned long random_addr; -#ifdef CONFIG_HIBERNATION - if (!cmdline_find_option_bool("kaslr")) { - warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected)."); - goto out; - } -#else if (cmdline_find_option_bool("nokaslr")) { warn("KASLR disabled: 'nokaslr' on cmdline."); goto out; } -#endif boot_params->hdr.loadflags |= KASLR_FLAG; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fca9254280ee..9021387c6ff4 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1154,11 +1154,6 @@ static int __init nohibernate_setup(char *str) return 1; } -static int __init kaslr_nohibernate_setup(char *str) -{ - return nohibernate_setup(str); -} - static int __init page_poison_nohibernate_setup(char *str) { #ifdef CONFIG_PAGE_POISONING_ZERO @@ -1182,5 +1177,4 @@ __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); -__setup("kaslr", kaslr_nohibernate_setup); __setup("page_poison=", page_poison_nohibernate_setup); -- cgit v1.2.3 From 7dd4912594daf769a46744848b05bd5bc6d62469 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jun 2016 15:53:54 +0200 Subject: sched/fair: Fix effective_load() to consistently use smoothed load Starting with the following commit: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") calc_tg_weight() doesn't compute the right value as expected by effective_load(). The difference is in the 'correction' term. 
In order to ensure \Sum rw_j >= rw_i we cannot use tg->load_avg directly, since that might be lagging a correction on the current cfs_rq->avg.load_avg value. Therefore we use tg->load_avg - cfs_rq->tg_load_avg_contrib + cfs_rq->avg.load_avg. Now, per the referenced commit, calc_tg_weight() doesn't use cfs_rq->avg.load_avg, as is later used in @w, but uses cfs_rq->load.weight instead. So stop using calc_tg_weight() and do it explicitly. The effects of this bug are wake_affine() making randomly poor choices in cgroup-intense workloads. Signed-off-by: Peter Zijlstra (Intel) Cc: # v4.3+ Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bdcbeea90c95..cc48bef40cca 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -735,8 +735,6 @@ void post_init_entity_util_avg(struct sched_entity *se) } } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { @@ -4946,19 +4944,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long w, W; + struct cfs_rq *cfs_rq = se->my_q; + long W, w = cfs_rq_load_avg(cfs_rq); - tg = se->my_q->tg; + tg = cfs_rq->tg; /* * W = @wg + \Sum rw_j */ - W = wg + calc_tg_weight(tg, se->my_q); + W = wg + atomic_long_read(&tg->load_avg); + + /* Ensure \Sum rw_j >= rw_i */ + W -= cfs_rq->tg_load_avg_contrib; + W += w; /* * w = rw_i + @wl */ - w = cfs_rq_load_avg(se->my_q) + wl; + w += wl; /* * wl = S * s'_i; see (2) -- cgit v1.2.3 From ea1dc6fc6242f991656e35e2ed3d90ec1cd13418 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: 
Fri, 24 Jun 2016 16:11:02 +0200 Subject: sched/fair: Fix calc_cfs_shares() fixed point arithmetics width confusion Commit: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") did something non-obvious but also did it buggy yet latent. The problem was exposed for real by a later commit in the v4.7 merge window: 2159197d6677 ("sched/core: Enable increased load resolution on 64-bit kernels") ... after which tg->load_avg and cfs_rq->load.weight had different units (10 bit fixed point and 20 bit fixed point resp.). Add a comment to explain the use of cfs_rq->load.weight over the 'natural' cfs_rq->avg.load_avg and add scale_load_down() to correct for the difference in unit. Since this is (now, as per a previous commit) the only user of calc_tg_weight(), collapse it. The effects of this bug should be randomly inconsistent SMP-balancing of cgroups workloads. Reported-by: Jirka Hladky Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 2159197d6677 ("sched/core: Enable increased load resolution on 64-bit kernels") Fixes: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cc48bef40cca..c8c5d2d48424 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2497,28 +2497,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { - long tg_weight; + long tg_weight, load, shares; /* - * Use this CPU's real-time load instead of the last load contribution - * as the updating of the contribution is delayed, and we will use the - 
* the real-time load to calc the share. See update_tg_load_avg(). + * This really should be: cfs_rq->avg.load_avg, but instead we use + * cfs_rq->load.weight, which is its upper bound. This helps ramp up + * the shares for small weight interactive tasks. */ - tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq->load.weight; - - return tg_weight; -} + load = scale_load_down(cfs_rq->load.weight); -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - long tg_weight, load, shares; + tg_weight = atomic_long_read(&tg->load_avg); - tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + /* Ensure tg_weight >= load */ + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += load; shares = (tg->shares * load); if (tg_weight) @@ -2537,6 +2531,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return tg->shares; } # endif /* CONFIG_SMP */ + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { -- cgit v1.2.3 From 0dceeaf599e6d9b8bd908ba4bd3dfee84aa26be2 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Tue, 14 Jun 2016 14:37:27 +0800 Subject: locking/qspinlock: Use __this_cpu_dec() instead of full-blown this_cpu_dec() queued_spin_lock_slowpath() should not worry about another queued_spin_lock_slowpath() running in interrupt context and changing node->count by accident, because node->count keeps the same value every time we enter/leave queued_spin_lock_slowpath(). On some architectures this_cpu_dec() will save/restore irq flags, which has high overhead. Use the much cheaper __this_cpu_dec() instead. Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hpe.com Link: http://lkml.kernel.org/r/1465886247-3773-1-git-send-email-xinhui.pan@linux.vnet.ibm.com [ Rewrote changelog. 
] Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 730655533440..b2caec7315af 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -619,7 +619,7 @@ release: /* * release the node */ - this_cpu_dec(mcs_nodes[0].count); + __this_cpu_dec(mcs_nodes[0].count); } EXPORT_SYMBOL(queued_spin_lock_slowpath); -- cgit v1.2.3 From e210bffd39d01b649c94b820c28ff112673266dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 18:51:48 +0200 Subject: sched/fair: Fix and optimize the fork() path The task_fork_fair() callback already calls __set_task_cpu() and takes rq->lock. If we move the sched_class::task_fork callback in sched_fork() under the existing p->pi_lock, right after its set_task_cpu() call, we can avoid doing two such calls and omit the IRQ disabling on the rq->lock. Change to __set_task_cpu() to skip the migration bits, this is a new task, not a migration. Similarly, make wake_up_new_task() use __set_task_cpu() for the same reason, the task hasn't actually migrated as it hasn't ever ran. This cures the problem of calling migrate_task_rq_fair(), which does remove_entity_from_load_avg() on tasks that have never been added to the load avg to begin with. This bug would result in transiently messed up load_avg values, averaged out after a few dozen milliseconds. This is probably the reason why this bug was not found for such a long time. 
Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 16 +++++++++++----- kernel/sched/fair.c | 27 ++++++--------------------- 2 files changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e406ba0c6891..fa3434dffbbb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2383,9 +2383,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() @@ -2394,7 +2391,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + /* + * We're setting the cpu for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, cpu); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO @@ -2534,8 +2537,11 @@ void wake_up_new_task(struct task_struct *p) * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug + * + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. 
*/ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 73063560b9ec..994f5493ee5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4448,7 +4448,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -8289,31 +8289,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. 
- */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -8326,8 +8312,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* -- cgit v1.2.3 From 010114739d294c474764c94196d32fb92e233657 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Jun 2016 11:20:46 +0200 Subject: sched/fair: Fix PELT integrity for new groups Vincent reported that when a new task is moved into a new cgroup it gets attached twice to the load tracking: sched_move_task() task_move_group_fair() detach_task_cfs_rq() set_task_rq() attach_task_cfs_rq() attach_entity_load_avg() se->avg.last_load_update = cfs_rq->avg.last_load_update // == 0 enqueue_entity() enqueue_entity_load_avg() update_cfs_rq_load_avg() now = clock() __update_load_avg(&cfs_rq->avg) cfs_rq->avg.last_load_update = now // ages load/util for: now - 0, load/util -> 0 if (migrated) attach_entity_load_avg() se->avg.last_load_update = cfs_rq->avg.last_load_update; // now != 0 The problem is that we don't update cfs_rq load_avg before all entity attach/detach operations. Only enqueue_task() and migrate_task() do this. By fixing this, the above will not happen, because the sched_move_task() attach will have updated cfs_rq's last_load_update time before attach, and in turn the attach will have set the entity's last_load_update stamp. Note that there is a further problem with sched_move_task() calling detach on a task that hasn't yet been attached; this will be taken care of in a subsequent patch. 
Reported-by: Vincent Guittot Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 994f5493ee5b..dda0ccbd0f3d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3083,6 +3083,12 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ + return 0; +} + static inline void update_load_avg(struct sched_entity *se, int not_used) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -8368,6 +8374,7 @@ static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); if (!vruntime_normalized(p)) { /* @@ -8379,6 +8386,7 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ + update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); } @@ -8386,6 +8394,7 @@ static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8396,6 +8405,7 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); if (!vruntime_normalized(p)) -- cgit v1.2.3 From ea86cb4b7621e1298a37197005bf0abcc86348d4 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 17 Jun 2016 13:38:55 +0200 Subject: sched/cgroup: Fix cpu_cgroup_fork() handling A new fair task is detached and attached from/to task_group with: cgroup_post_fork() 
ss->fork(child) := cpu_cgroup_fork() sched_move_task() task_move_group_fair() Which is wrong, because at this point in fork() the task isn't fully initialized and it cannot 'move' to another group, because it's not attached to any group as yet. In fact, cpu_cgroup_fork() needs a small part of sched_move_task() so we can just call this small part directly instead of sched_move_task(). And the task doesn't really migrate because it is not yet attached so we need the following sequence: do_fork() sched_fork() __set_task_cpu() cgroup_post_fork() set_task_rq() # set task group and runqueue wake_up_new_task() select_task_rq() can select a new cpu __set_task_cpu post_init_entity_util_avg attach_task_cfs_rq() activate_task enqueue_task This patch makes that happen. Signed-off-by: Vincent Guittot [ Added TASK_SET_GROUP to set depth properly. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 63 ++++++++++++++++++++++++++++++++++------------------ kernel/sched/fair.c | 23 ++++++++++++++++++- kernel/sched/sched.h | 5 ++++- 3 files changed, 67 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa3434dffbbb..3d856c46f6d8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7744,27 +7744,9 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. 
- */ -void sched_move_task(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk, int type) { struct task_group *tg; - int queued, running; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(tsk, &rf); - - running = task_current(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); - if (unlikely(running)) - put_prev_task(rq, tsk); /* * All callers are synchronized by task_rq_lock(); we do not use RCU @@ -7777,11 +7759,37 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk); + if (tsk->sched_class->task_change_group) + tsk->sched_class->task_change_group(tsk, type); else #endif set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int queued, running; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + if (queued) + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); + if (unlikely(running)) + put_prev_task(rq, tsk); + + sched_change_group(tsk, TASK_MOVE_GROUP); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); @@ -8209,9 +8217,20 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_free_group(tg); } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. 
+ */ static void cpu_cgroup_fork(struct task_struct *task) { - sched_move_task(task); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(task, &rf); + + sched_change_group(task, TASK_SET_GROUP); + + task_rq_unlock(rq, task, &rf); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dda0ccbd0f3d..64f26bc436eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8466,6 +8466,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -8478,6 +8486,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -8706,7 +8727,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 71ce9862abc3..307bd0418095 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1246,8 +1246,11 @@ struct sched_class { void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p); + void (*task_change_group) (struct task_struct *p, int type); #endif }; -- cgit v1.2.3 From 7dc603c9028ea5d4354e0e317e8481df99b06d7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 
2016 13:29:28 +0200 Subject: sched/fair: Fix PELT integrity for new tasks Vincent and Yuyang found another few scenarios in which entity tracking goes wobbly. The scenarios are basically due to the fact that new tasks are not immediately attached and thereby differ from the normal situation -- a task is always attached to a cfs_rq load average (such that it includes its blocked contribution) and is explicitly detached/attached on migration to another cfs_rq. Scenario 1: switch to fair class p->sched_class = fair_class; if (queued) enqueue_task(p); ... enqueue_entity() enqueue_entity_load_avg() migrated = !sa->last_update_time (true) if (migrated) attach_entity_load_avg() check_class_changed() switched_from() (!fair) switched_to() (fair) switched_to_fair() attach_entity_load_avg() If @p is a new task that hasn't been fair before, it will have !last_update_time and, per the above, end up in attach_entity_load_avg() _twice_. Scenario 2: change between cgroups sched_move_task(p) if (queued) dequeue_task() task_move_group_fair() detach_task_cfs_rq() detach_entity_load_avg() set_task_rq() attach_task_cfs_rq() attach_entity_load_avg() if (queued) enqueue_task(); ... enqueue_entity() enqueue_entity_load_avg() migrated = !sa->last_update_time (true) if (migrated) attach_entity_load_avg() Similar as with scenario 1, if @p is a new task, it will have !last_update_time and we'll end up in attach_entity_load_avg() _twice_. Furthermore, notice how we do a detach_entity_load_avg() on something that wasn't attached to begin with. As stated above; the problem is that the new task isn't yet attached to the load tracking and thereby violates the invariant assumption. This patch remedies this by ensuring a new task is indeed properly attached to the load tracking on creation, through post_init_entity_util_avg(). Of course, this isn't entirely as straightforward as one might think, since the task is hashed before we call wake_up_new_task() and thus can be poked at. 
We avoid this by adding TASK_NEW and teaching cpu_cgroup_can_attach() to refuse such tasks. Reported-by: Yuyang Du Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++-- kernel/sched/core.c | 28 +++++++++++++++++++++++----- kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 63 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b45acfd18f4e..d99218a1e043 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p); #define TASK_WAKING 256 #define TASK_PARKED 512 #define TASK_NOLOAD 1024 -#define TASK_STATE_MAX 2048 +#define TASK_NEW 2048 +#define TASK_STATE_MAX 4096 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3d856c46f6d8..14afa518948c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. 
@@ -2383,6 +2383,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } + init_entity_runnable_average(&p->se); + /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() @@ -2529,9 +2531,8 @@ void wake_up_new_task(struct task_struct *p) struct rq_flags rf; struct rq *rq; - /* Initialize new task's runnable average */ - init_entity_runnable_average(&p->se); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -8237,6 +8238,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -8247,8 +8249,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet. 
+ */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 64f26bc436eb..0c21a12c0205 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,6 +690,10 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); + /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -720,6 +724,7 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + u64 now = cfs_rq_clock_task(cfs_rq); if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -733,16 +738,37 @@ void post_init_entity_util_avg(struct sched_entity *se) } sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. 
+ */ + se->avg.last_update_time = now; + return; + } + } + + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); } -#else +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } void post_init_entity_util_avg(struct sched_entity *se) { } -#endif +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. @@ -2840,8 +2866,6 @@ void set_task_rq_fair(struct sched_entity *se, static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); - static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -2951,6 +2975,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * If we got migrated (either between CPUs or between cgroups) we'll * have aged the average right before clearing @last_update_time. + * + * Or we're fresh through post_init_entity_util_avg(). */ if (se->avg.last_update_time) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -3056,11 +3082,14 @@ void remove_entity_load_avg(struct sched_entity *se) u64 last_update_time; /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. 
*/ - if (se->avg.last_update_time == 0) - return; last_update_time = cfs_rq_last_update_time(cfs_rq); -- cgit v1.2.3 From 3d30544f02120b884bba2a9466c87dba980e3be5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jun 2016 14:27:50 +0200 Subject: sched/fair: Apply more PELT fixes One additional 'rule' for using update_cfs_rq_load_avg() is that one should call update_tg_load_avg() if it returns true. Add a bunch of comments to hopefully clarify some of the rules: o You need to update cfs_rq _before_ any entity attach/detach, this is important, because while for mathematical consistency this isn't strictly needed, it is required for the physical interpretation of the model, you attach/detach _now_. o When you modify the cfs_rq avg, you have to then call update_tg_load_avg() in order to propagate changes upwards. o (Fair) entities are always attached, switched_{to,from}_fair() deal with !fair. This directly follows from the definition of the cfs_rq averages, namely that they are a direct sum of all (runnable or blocked) entities on that rq. It is the second rule that this patch enforces, but it adds comments pertaining to all of them. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0c21a12c0205..781788d54736 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -692,6 +692,7 @@ void init_entity_runnable_average(struct sched_entity *se) static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); /* @@ -725,6 +726,7 @@ void post_init_entity_util_avg(struct sched_entity *se) struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -757,8 +759,10 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - update_cfs_rq_load_avg(now, cfs_rq, false); + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } #else /* !CONFIG_SMP */ @@ -768,6 +772,9 @@ void init_entity_runnable_average(struct sched_entity *se) void post_init_entity_util_avg(struct sched_entity *se) { } +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} #endif /* CONFIG_SMP */ /* @@ -2912,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * 
@cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed utilization. It is expected + * that one calls update_tg_load_avg() on this condition, but after you've + * modified the cfs_rq avg (attach/detach), such that we propagate the new + * avg up. + */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { @@ -2967,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) update_tg_load_avg(cfs_rq, 0); } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!sched_feat(ATTACH_AGE_LOAD)) @@ -2998,6 +3029,14 @@ skip_aging: cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. 
+ */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -8404,6 +8443,7 @@ static void detach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (!vruntime_normalized(p)) { /* @@ -8415,8 +8455,10 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ - update_cfs_rq_load_avg(now, cfs_rq, false); + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) @@ -8424,6 +8466,7 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8434,8 +8477,10 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - update_cfs_rq_load_avg(now, cfs_rq, false); + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; -- cgit v1.2.3 From 8663e24d56dc1f093232783c23ea17f2a6f61c03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jun 2016 14:58:02 +0200 Subject: sched/fair: Reorder cgroup creation code A future patch needs rq->lock held _after_ we link the task_group into the hierarchy. In order to avoid taking every rq->lock twice, reorder things a little and create online_fair_sched_group() to be called after we link the task_group. All this code is still run from css_alloc() so css_online() isn't in fact used for this. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 22 ++++++++++++++++++---- kernel/sched/sched.h | 1 + 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 14afa518948c..4ede4fc65653 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7717,6 +7717,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); + + online_fair_sched_group(tg); } /* rcu callback to free various structures associated with a task group */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 781788d54736..62d5e7dcc7f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8624,10 +8624,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); - - raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); - raw_spin_unlock_irq(&rq->lock); } return 1; @@ -8638,6 +8634,22 @@ err: return 0; } +void online_fair_sched_group(struct task_group *tg) +{ + struct sched_entity *se; + struct rq *rq; + int i; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + se = tg->se[i]; + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); + } +} + void unregister_fair_sched_group(struct task_group *tg) { unsigned long flags; @@ -8742,6 +8754,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } +void online_fair_sched_group(struct task_group *tg) { } + void unregister_fair_sched_group(struct task_group *tg) { } #endif /* 
CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 307bd0418095..28c42b789f70 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, -- cgit v1.2.3 From 599b4840b0ea453c7d11e1798dcc8f494dcfd58a Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Sun, 26 Jun 2016 16:13:23 -0500 Subject: sched/core: Fix sched_getaffinity() return value kerneldoc comment Previous version was probably written referencing the man page for glibc's wrapper, but the wrapper's behavior differs from that of the syscall itself in this case. Signed-off-by: Zev Weiss Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1466975603-25408-1-git-send-email-zev@bewilderbeest.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4ede4fc65653..28da50a5bc76 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4759,7 +4759,8 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask * - * Return: 0 on success. An error code otherwise. + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise. 
*/ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) -- cgit v1.2.3 From 55e16d30bd99510900caec913c90f53bc2b35cba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jun 2016 15:14:26 +0200 Subject: sched/fair: Rework throttle_count sync Since we already take rq->lock when creating a cgroup, use it to also sync the throttle_count and avoid the extra state and enqueue path branch. Signed-off-by: Peter Zijlstra (Intel) Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: linux-kernel@vger.kernel.org [ Fixed build warning. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 39 +++++++++++++++++++-------------------- kernel/sched/sched.h | 2 +- 2 files changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 62d5e7dcc7f8..4088eedea763 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4241,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - /* Synchronize hierarchical throttle counter: */ - if (unlikely(!cfs_rq->throttle_uptodate)) { - struct rq *rq = rq_of(cfs_rq); - struct cfs_rq *pcfs_rq; - struct task_group *tg; - - cfs_rq->throttle_uptodate = 1; - - /* Get closest up-to-date node, because leaves go first: */ - for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { - pcfs_rq = tg->cfs_rq[cpu_of(rq)]; - if (pcfs_rq->throttle_uptodate) - break; - } - if (tg) { - cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(rq); - } - } - /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -4275,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } +static void sync_throttle(struct task_group *tg, int cpu) +{ + struct cfs_rq *pcfs_rq, 
*cfs_rq; + + if (!cfs_bandwidth_used()) + return; + + if (!tg->parent) + return; + + cfs_rq = tg->cfs_rq[cpu]; + pcfs_rq = tg->parent->cfs_rq[cpu]; + + cfs_rq->throttle_count = pcfs_rq->throttle_count; + pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); +} + /* conditionally throttle active cfs_rq's from put_prev_entity() */ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4414,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) @@ -8646,6 +8644,7 @@ void online_fair_sched_group(struct task_group *tg) raw_spin_lock_irq(&rq->lock); post_init_entity_util_avg(se); + sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 28c42b789f70..f44da95c70cd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -438,7 +438,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count, throttle_uptodate; + int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From eaaa7ec71bff4cb34d9025ed89068d4b3cac3df0 Mon Sep 17 00:00:00 2001 From: Gregor Boirie Date: Wed, 9 Mar 2016 19:05:48 +0100 Subject: timekeeping: export get_monotonic_coarse64 symbol EXPORT_SYMBOL() get_monotonic_coarse64 for new IIO timestamping clock selection usage. This provides user apps the ability to request a particular IIO device to timestamp samples using a monotonic coarse clock granularity. 
Signed-off-by: Gregor Boirie Signed-off-by: Jonathan Cameron --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25cd3d4f..255e225393ac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2186,6 +2186,7 @@ struct timespec64 get_monotonic_coarse64(void) return now; } +EXPORT_SYMBOL(get_monotonic_coarse64); /* * Must hold jiffies_lock -- cgit v1.2.3 From 6168f8ed01dc46a277908938294f1132d723f58d Mon Sep 17 00:00:00 2001 From: Wei Jiangang Date: Wed, 29 Jun 2016 12:51:50 +0800 Subject: timers/nohz: Fix several typos Signed-off-by: Wei Jiangang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Link: http://lkml.kernel.org/r/1467175910-2966-2-git-send-email-weijg.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 536ada80f6dd..6d83e9c4a302 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now) if (delta.tv64 < tick_period.tv64) return; - /* Reevalute with jiffies_lock held */ + /* Reevaluate with jiffies_lock held */ write_seqlock(&jiffies_lock); delta = ktime_sub(now, last_jiffies_update); @@ -117,7 +117,7 @@ static void tick_sched_do_timer(ktime_t now) /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to + * into a long sleep. If two cpus happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. */ @@ -571,7 +571,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) * @last_update_time: variable to store update time in. 
Do not update * counters if NULL. * - * Return the cummulative idle time (since boot) for a given + * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, @@ -612,7 +612,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); * @last_update_time: variable to store update time in. Do not update * counters if NULL. * - * Return the cummulative iowait time (since boot) for a given + * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, @@ -733,7 +733,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * do_timer() never invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. If this cpu * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferement value. + * sleep time to the timekeeping max_deferment value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); -- cgit v1.2.3 From 0de7611a1031f25b713fda7d36de44f17c2ed790 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 1 Jul 2016 12:42:35 +0200 Subject: timers/nohz: Capitalize 'CPU' consistently While reviewing another patch I noticed that kernel/time/tick-sched.c had a charmingly (confusingly, annoyingly) rich set of variants for spelling 'CPU': cpu cpus CPU CPUs per CPU per-CPU per cpu ... sometimes these were mixed even within the same comment block! 
Compress these variants down to a single consistent set of: CPU CPUs per-CPU Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6d83e9c4a302..db57d1ba73eb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -31,7 +31,7 @@ #include /* - * Per cpu nohz control structure + * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); @@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now) #ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themselves to + * concurrency: This happens only when the CPU in charge went + * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. */ @@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: - * perf events, posix cpu timers, ... + * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { @@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void) * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy - * value. We do this unconditionally on any cpu, as we don't know whether the - * cpu, which has the update task assigned is in a long sleep. + * value. 
We do this unconditionally on any CPU, as we don't know whether the + * CPU, which has the update task assigned is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { @@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now) } /* - * Updates the per cpu time idle statistics counters + * Updates the per-CPU time idle statistics counters */ static void update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) @@ -566,7 +566,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) } /** - * get_cpu_idle_time_us - get the total idle time of a cpu + * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. @@ -607,7 +607,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** - * get_cpu_iowait_time_us - get the total iowait time of a cpu + * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. @@ -726,12 +726,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } /* - * If this cpu is the one which updates jiffies, then give up - * the assignment and let it be taken by the cpu which runs - * the tick timer next, which might be this cpu as well. If we + * If this CPU is the one which updates jiffies, then give up + * the assignment and let it be taken by the CPU which runs + * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it - * was the one which had the do_timer() duty last. If this cpu + * was the one which had the do_timer() duty last. If this CPU * is the one which had the do_timer() duty last, we limit the * sleep time to the timekeeping max_deferment value. 
* Otherwise we can sleep as long as we want. @@ -841,9 +841,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* - * If this cpu is offline and it is the one which updates + * If this CPU is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop + * the CPU which runs the tick timer next. If we don't drop * this here the jiffies might be stale and do_timer() never * invoked. */ @@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void) WARN_ON_ONCE(irqs_disabled()); /* - * Update the idle state in the scheduler domain hierarchy - * when tick_nohz_stop_sched_tick() is called from the idle loop. - * State will be updated to busy during the first busy tick after - * exiting idle. - */ + * Update the idle state in the scheduler domain hierarchy + * when tick_nohz_stop_sched_tick() is called from the idle loop. + * State will be updated to busy during the first busy tick after + * exiting idle. + */ set_cpu_sd_state_idle(); local_irq_disable(); @@ -1211,7 +1211,7 @@ void tick_setup_sched_timer(void) hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - /* Get the next period (per cpu) */ + /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert jiffies_lock contention. */ -- cgit v1.2.3 From b6140914fd079e43ea75a53429b47128584f033a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:22 +0900 Subject: genirq/msi: Remove unused MSI_FLAG_IDENTITY_MAP No user and we definitely don't want to grow one. 
Signed-off-by: Thomas Gleixner Reviewed-by: Bart Van Assche Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-2-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- include/linux/msi.h | 6 ++---- kernel/irq/msi.c | 8 ++------ 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 8b425c66305a..c33abfa0f5a7 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -264,12 +264,10 @@ enum { * callbacks. */ MSI_FLAG_USE_DEF_CHIP_OPS = (1 << 1), - /* Build identity map between hwirq and irq */ - MSI_FLAG_IDENTITY_MAP = (1 << 2), /* Support multiple PCI MSI interrupts */ - MSI_FLAG_MULTI_PCI_MSI = (1 << 3), + MSI_FLAG_MULTI_PCI_MSI = (1 << 2), /* Support PCI MSIX interrupts */ - MSI_FLAG_PCI_MSIX = (1 << 4), + MSI_FLAG_PCI_MSIX = (1 << 3), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 38e89ce7b071..eb5bf2b50b07 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; msi_alloc_info_t arg; struct msi_desc *desc; - int i, ret, virq = -1; + int i, ret, virq; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) @@ -332,12 +332,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, for_each_msi_entry(desc, dev) { ops->set_desc(&arg, desc); - if (info->flags & MSI_FLAG_IDENTITY_MAP) - virq = (int)ops->get_hwirq(info, &arg); - else - virq = -1; - virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, + virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, dev_to_node(dev), &arg, false); if (virq < 0) { ret = -ENOSPC; -- cgit v1.2.3 From 
9c2555835bb3d34dfac52a0be943dcc4bedd650f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:23 +0900 Subject: genirq: Introduce IRQD_AFFINITY_MANAGED flag Interrupts marked with this flag are excluded from user space interrupt affinity changes. Contrary to the IRQ_NO_BALANCING flag, the kernel internal affinity mechanism is not blocked. This flag will be used for multi-queue device interrupts. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-3-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 7 +++++++ kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 21 ++++++++++++++++++--- kernel/irq/proc.c | 2 +- 4 files changed, 28 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 4d758a7c604a..f6074813688d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -197,6 +197,7 @@ struct irq_data { * IRQD_IRQ_INPROGRESS - In progress state of the interrupt * IRQD_WAKEUP_ARMED - Wakeup mode armed * IRQD_FORWARDED_TO_VCPU - The interrupt is forwarded to a VCPU + * IRQD_AFFINITY_MANAGED - Affinity is auto-managed by the kernel */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -212,6 +213,7 @@ enum { IRQD_IRQ_INPROGRESS = (1 << 18), IRQD_WAKEUP_ARMED = (1 << 19), IRQD_FORWARDED_TO_VCPU = (1 << 20), + IRQD_AFFINITY_MANAGED = (1 << 21), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -305,6 +307,11 @@ static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } +static inline bool irqd_affinity_is_managed(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_AFFINITY_MANAGED; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git
a/kernel/irq/internals.h b/kernel/irq/internals.h index 09be2c903c6d..b15aa3b617a2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -105,6 +105,8 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +extern bool irq_can_set_affinity_usr(unsigned int irq); + extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef0bc02c3a70..30658e9827f0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; -static int __irq_can_set_affinity(struct irq_desc *desc) +static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) - return 0; - return 1; + return false; + return true; } /** @@ -133,6 +133,21 @@ int irq_can_set_affinity(unsigned int irq) return __irq_can_set_affinity(irq_to_desc(irq)); } +/** + * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space + * @irq: Interrupt to check + * + * Like irq_can_set_affinity() above, but additionally checks for the + * AFFINITY_MANAGED flag. 
+ */ +bool irq_can_set_affinity_usr(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return __irq_can_set_affinity(desc) && + !irqd_affinity_is_managed(&desc->irq_data); +} + /** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affitnity changed diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4e1b94726818..40bdcdc1f700 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, cpumask_var_t new_value; int err; - if (!irq_can_set_affinity(irq) || no_irq_affinity) + if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) -- cgit v1.2.3 From 06ee6d571f0e350253a8fc3492316b2be007fae2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:24 +0900 Subject: genirq: Add affinity hint to irq allocation Add an extra argument to the irq(domain) allocation functions, so we can hand down affinity hints to the allocator. That's necessary to implement proper support for multiqueue devices.
Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-4-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- arch/sparc/kernel/irq_64.c | 2 +- arch/x86/kernel/apic/io_apic.c | 5 +++-- include/linux/irq.h | 4 ++-- include/linux/irqdomain.h | 9 ++++++--- kernel/irq/ipi.c | 2 +- kernel/irq/irqdesc.c | 12 ++++++++---- kernel/irq/irqdomain.c | 22 ++++++++++++++-------- kernel/irq/manage.c | 7 ++++--- kernel/irq/msi.c | 3 ++- 9 files changed, 41 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index e22416ce56ea..34a7930b76ef 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -242,7 +242,7 @@ unsigned int irq_alloc(unsigned int dev_handle, unsigned int dev_ino) { int irq; - irq = __irq_alloc_descs(-1, 1, 1, numa_node_id(), NULL); + irq = __irq_alloc_descs(-1, 1, 1, numa_node_id(), NULL, NULL); if (irq <= 0) goto out; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 446702ed99dc..7c4f90dd4c2a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -981,7 +981,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info), - info, legacy); + info, legacy, NULL); } /* @@ -1014,7 +1014,8 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, info->ioapic_pin)) return -ENOMEM; } else { - irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, + NULL); if (irq >= 0) { irq_data = irq_domain_get_irq_data(domain, irq); data = irq_data->chip_data; diff --git a/include/linux/irq.h b/include/linux/irq.h index f6074813688d..39ce46ac5c18 100644 --- 
a/include/linux/irq.h +++ b/include/linux/irq.h @@ -708,11 +708,11 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner); + struct module *owner, const struct cpumask *affinity); /* use macros to avoid needing export.h for THIS_MODULE */ #define irq_alloc_descs(irq, from, cnt, node) \ - __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE) + __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL) #define irq_alloc_desc(node) \ irq_alloc_descs(-1, 0, 1, node) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f1f36e04d885..1aee0fbe900e 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -39,6 +39,7 @@ struct irq_domain; struct of_device_id; struct irq_chip; struct irq_data; +struct cpumask; /* Number of irqs reserved for a legacy isa controller */ #define NUM_ISA_INTERRUPTS 16 @@ -217,7 +218,8 @@ extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); extern void irq_set_default_host(struct irq_domain *host); extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node); + irq_hw_number_t hwirq, int node, + const struct cpumask *affinity); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { @@ -389,7 +391,7 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, - bool realloc); + bool realloc, const struct cpumask *affinity); extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); extern void irq_domain_activate_irq(struct irq_data *irq_data); extern void irq_domain_deactivate_irq(struct irq_data *irq_data); @@ -397,7 +399,8 @@ extern void 
irq_domain_deactivate_irq(struct irq_data *irq_data); static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) { - return __irq_domain_alloc_irqs(domain, -1, nr_irqs, node, arg, false); + return __irq_domain_alloc_irqs(domain, -1, nr_irqs, node, arg, false, + NULL); } extern int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 89b49f6773f0..4fd23510d5f2 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -76,7 +76,7 @@ int irq_reserve_ipi(struct irq_domain *domain, } } - virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL); if (virq <= 0) { pr_warn("Can't reserve IPI, failed to alloc descs\n"); return -ENOMEM; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8731e1c5d1e7..b8df4fcdbb5f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -223,7 +223,7 @@ static void free_desc(unsigned int irq) } static int alloc_descs(unsigned int start, unsigned int cnt, int node, - struct module *owner) + const struct cpumask *affinity, struct module *owner) { struct irq_desc *desc; int i; @@ -333,6 +333,7 @@ static void free_desc(unsigned int irq) } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, + const struct cpumask *affinity, struct module *owner) { u32 i; @@ -453,12 +454,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * @cnt: Number of consecutive irqs to allocate. 
* @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) + * @affinity: Optional pointer to an affinity mask which hints where the + * irq descriptors should be allocated and which default + * affinities to use * * Returns the first irq number or error code */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner) + struct module *owner, const struct cpumask *affinity) { int start, ret; @@ -494,7 +498,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node, owner); + return alloc_descs(start, cnt, node, affinity, owner); err: mutex_unlock(&sparse_irq_lock); @@ -512,7 +516,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs); */ unsigned int irq_alloc_hwirqs(int cnt, int node) { - int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); + int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL); if (irq < 0) return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8798b6c9e945..79459b732dc9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -835,19 +835,23 @@ const struct irq_domain_ops irq_domain_simple_ops = { EXPORT_SYMBOL_GPL(irq_domain_simple_ops); int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, - int node) + int node, const struct cpumask *affinity) { unsigned int hint; if (virq >= 0) { - virq = irq_alloc_descs(virq, virq, cnt, node); + virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, + affinity); } else { hint = hwirq % 
nr_irqs; if (hint == 0) hint++; - virq = irq_alloc_descs_from(hint, cnt, node); - if (virq <= 0 && hint > 1) - virq = irq_alloc_descs_from(1, cnt, node); + virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, + affinity); + if (virq <= 0 && hint > 1) { + virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE, + affinity); + } } return virq; @@ -1160,6 +1164,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, * @node: NUMA node id for memory allocation * @arg: domain specific argument * @realloc: IRQ descriptors have already been allocated if true + * @affinity: Optional irq affinity mask for multiqueue devices * * Allocate IRQ numbers and initialized all data structures to support * hierarchy IRQ domains. @@ -1175,7 +1180,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, */ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, - bool realloc) + bool realloc, const struct cpumask *affinity) { int i, ret, virq; @@ -1193,7 +1198,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, if (realloc && irq_base >= 0) { virq = irq_base; } else { - virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); + virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node, + affinity); if (virq < 0) { pr_debug("cannot allocate IRQ(base %d, count %d)\n", irq_base, nr_irqs); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 30658e9827f0..ad0aac6d1248 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -353,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) return 0; /* - * Preserve an userspace affinity setup, but make sure that - * one of the targets is online. + * Preserve the managed affinity setting and an userspace affinity + * setup, but make sure that one of the targets is online. 
*/ - if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { + if (irqd_affinity_is_managed(&desc->irq_data) || + irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) set = desc->irq_common_data.affinity; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index eb5bf2b50b07..58dbbacc6fbb 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -334,7 +334,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ops->set_desc(&arg, desc); virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, - dev_to_node(dev), &arg, false); + dev_to_node(dev), &arg, false, + NULL); if (virq < 0) { ret = -ENOSPC; if (ops->handle_error) -- cgit v1.2.3 From 45ddcecbfa947f1dd8e8019bad9e90d6c9f2665c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:25 +0900 Subject: genirq: Use affinity hint in irqdesc allocation Use the affinity hint in the irqdesc allocator. The hint is used to determine the node for the allocation and to set the affinity of the interrupt. If multiple interrupts are allocated (multi-MSI) then the allocator iterates over the cpumask and for each set cpu it allocates on their node and sets the initial affinity to that cpu. If a single interrupt is allocated (MSI-X) then the allocator uses the first cpu in the mask to compute the allocation node and uses the mask for the initial affinity setting. Interrupts set up this way are marked with the AFFINITY_MANAGED flag to prevent userspace from messing with their affinity settings. 
Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-5-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index b8df4fcdbb5f..a623b44f2d4b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) return 0; } -static void desc_smp_init(struct irq_desc *desc, int node) +static void desc_smp_init(struct irq_desc *desc, int node, + const struct cpumask *affinity) { - cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); + if (!affinity) + affinity = irq_default_affinity; + cpumask_copy(desc->irq_common_data.affinity, affinity); + #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif @@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node) #else static inline int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } -static inline void desc_smp_init(struct irq_desc *desc, int node) { } +static inline void +desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, - struct module *owner) + const struct cpumask *affinity, struct module *owner) { int cpu; @@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->owner = owner; for_each_possible_cpu(cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; - desc_smp_init(desc, node); + desc_smp_init(desc, node, affinity); } int nr_irqs = NR_IRQS; @@ -158,7 +163,9 @@ void irq_unlock_sparse(void) 
mutex_unlock(&sparse_irq_lock); } -static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) +static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, + const struct cpumask *affinity, + struct module *owner) { struct irq_desc *desc; gfp_t gfp = GFP_KERNEL; @@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_rcu_head(&desc->rcu); - desc_set_defaults(irq, desc, node, owner); + desc_set_defaults(irq, desc, node, affinity, owner); + irqd_set(&desc->irq_data, flags); return desc; @@ -225,11 +233,30 @@ static void free_desc(unsigned int irq) static int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct cpumask *affinity, struct module *owner) { + const struct cpumask *mask = NULL; struct irq_desc *desc; - int i; + unsigned int flags; + int i, cpu = -1; + + if (affinity && cpumask_empty(affinity)) + return -EINVAL; + + flags = affinity ? IRQD_AFFINITY_MANAGED : 0; for (i = 0; i < cnt; i++) { - desc = alloc_desc(start + i, node, owner); + if (affinity) { + cpu = cpumask_next(cpu, affinity); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(affinity); + node = cpu_to_node(cpu); + + /* + * For single allocations we use the caller provided + * mask otherwise we use the mask of the target cpu + */ + mask = cnt == 1 ? 
affinity : cpumask_of(cpu); + } + desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; mutex_lock(&sparse_irq_lock); @@ -277,7 +304,7 @@ int __init early_irq_init(void) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { - desc = alloc_desc(i, node, NULL); + desc = alloc_desc(i, node, 0, NULL, NULL); set_bit(i, allocated_irqs); irq_insert_desc(i, desc); } @@ -311,7 +338,7 @@ int __init early_irq_init(void) alloc_masks(&desc[i], GFP_KERNEL, node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - desc_set_defaults(i, &desc[i], node, NULL); + desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); } @@ -328,7 +355,7 @@ static void free_desc(unsigned int irq) unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL); + desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); raw_spin_unlock_irqrestore(&desc->lock, flags); } -- cgit v1.2.3 From 0972fa57f53525ffa6ced12d703750fc2791e3ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:26 +0900 Subject: genirq/msi: Make use of affinity aware allocations Allow the MSI code to provide affinity hints per MSI descriptor. 
Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-6-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- include/linux/msi.h | 2 ++ kernel/irq/msi.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index c33abfa0f5a7..4f0bfe5912b2 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -47,6 +47,7 @@ struct fsl_mc_msi_desc { * @nvec_used: The number of vectors used * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse + * @affinity: Optional pointer to a cpu affinity mask for this descriptor * * @masked: [PCI MSI/X] Mask bits * @is_msix: [PCI MSI/X] True if MSI-X @@ -67,6 +68,7 @@ struct msi_desc { unsigned int nvec_used; struct device *dev; struct msi_msg msg; + const struct cpumask *affinity; union { /* PCI MSI/X specific data */ diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 58dbbacc6fbb..0e2a736b14a7 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -335,7 +335,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, dev_to_node(dev), &arg, false, - NULL); + desc->affinity); if (virq < 0) { ret = -ENOSPC; if (ops->handle_error) -- cgit v1.2.3 From 5e385a6ef31fbbf2acbda770aecc2bc2ff933d17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Jul 2016 17:39:27 +0900 Subject: genirq: Add a helper to spread an affinity mask for MSI/MSI-X vectors This is lifted from the blk-mq code and adopted to use the affinity mask concept just introduced in the irq handling code. 
It tries to keep the algorithm the same as the one currently used by blk-mq, but improvements like assigning vectors on a per-node basis instead of just per sibling are possible with this simple move and refactoring. Signed-off-by: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-7-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 8 +++++++ kernel/irq/Makefile | 1 + kernel/irq/affinity.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 kernel/irq/affinity.c (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 9fcabeb07787..b6683f0ffc9f 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -278,6 +278,8 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); +struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs); + #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) @@ -308,6 +310,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { return 0; } + +static inline struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) +{ + *nr_vecs = 1; + return NULL; +} #endif /* CONFIG_SMP */ /* diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2ee42e95a3ce..1d3ee3169202 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o +obj-$(CONFIG_SMP) += affinity.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c new file mode 100644 index 000000000000..f68959341c0f ---
/dev/null +++ b/kernel/irq/affinity.c @@ -0,0 +1,61 @@ + +#include +#include +#include +#include + +static int get_first_sibling(unsigned int cpu) +{ + unsigned int ret; + + ret = cpumask_first(topology_sibling_cpumask(cpu)); + if (ret < nr_cpu_ids) + return ret; + return cpu; +} + +/* + * Take a map of online CPUs and the number of available interrupt vectors + * and generate an output cpumask suitable for spreading MSI/MSI-X vectors + * so that they are distributed as good as possible around the CPUs. If + * more vectors than CPUs are available we'll map one to each CPU, + * otherwise we map one to the first sibling of each socket. + * + * If there are more vectors than CPUs we will still only have one bit + * set per CPU, but interrupt code will keep on assigning the vectors from + * the start of the bitmap until we run out of vectors. + */ +struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) +{ + struct cpumask *affinity_mask; + unsigned int max_vecs = *nr_vecs; + + if (max_vecs == 1) + return NULL; + + affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!affinity_mask) { + *nr_vecs = 1; + return NULL; + } + + if (max_vecs >= num_online_cpus()) { + cpumask_copy(affinity_mask, cpu_online_mask); + *nr_vecs = num_online_cpus(); + } else { + unsigned int vecs = 0, cpu; + + for_each_online_cpu(cpu) { + if (cpu == get_first_sibling(cpu)) { + cpumask_set_cpu(cpu, affinity_mask); + vecs++; + } + + if (--max_vecs == 0) + break; + } + *nr_vecs = vecs; + } + + return affinity_mask; +} -- cgit v1.2.3 From 4364e1a29be16b2783c0bcbc263f61236af64281 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 15:32:25 +0200 Subject: genirq/msi: Fix broken debug output virq is not required to be the same for all msi descs. Use the base irq number from the desc in the debug printk. 
Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/msi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 0e2a736b14a7..54999350162c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -353,6 +353,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ops->msi_finish(&arg, 0); for_each_msi_entry(desc, dev) { + virq = desc->irq; if (desc->nvec_used == 1) dev_dbg(dev, "irq %d for MSI\n", virq); else -- cgit v1.2.3 From 5130213721d01b6632c255d4295a8102cbb58379 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 5 Jul 2016 16:57:51 +0800 Subject: tick/broadcast-hrtimer: Set name of the ce_broadcast_hrtimer This is to avoid the "null" name when we either ~ # cat /sys/devices/system/clockevents/broadcast/current_device (null) or ~ # cat /proc/timer_list ... Tick Device: mode: 1 Broadcast device Clock Event Device: (null) ... Signed-off-by: Jisheng Zhang Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1467709071-3667-1-git-send-email-jszhang@marvell.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast-hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 53d7184da0be..690b797f522e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) } static struct clock_event_device ce_broadcast_hrtimer = { + .name = "bc_hrtimer", .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | -- cgit v1.2.3 From 2c81a6477081966fe80b8c6daa68459bca896774 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 14 Jun 2016 16:10:41 +0100 Subject: perf/core: Fix pmu::filter_match for SW-led groups The following commit: 66eb579e66ec ("perf: allow for PMU-specific event 
filtering") added the pmu::filter_match() callback. This was intended to avoid HW constraints on events from resulting in extremely pessimistic scheduling. However, pmu::filter_match() is only called for the leader of each event group. When the leader is a SW event, we do not filter the groups, and may fail at pmu::add() time, and when this happens we'll give up on scheduling any event groups later in the list until they are rotated ahead of the failing group. This can result in extremely sub-optimal event scheduling behaviour, e.g. if running the following on a big.LITTLE platform: $ taskset -c 0 ./perf stat \ -e 'a57{context-switches,armv8_cortex_a57/config=0x11/}' \ -e 'a53{context-switches,armv8_cortex_a53/config=0x11/}' \ ls context-switches (0.00%) armv8_cortex_a57/config=0x11/ (0.00%) 24 context-switches (37.36%) 57589154 armv8_cortex_a53/config=0x11/ (37.36%) Here the 'a53' event group was always eligible to be scheduled, but the 'a57' group never eligible to be scheduled, as the task was always affine to a Cortex-A53 CPU. The SW (group leader) event in the 'a57' group was eligible, but the HW event failed at pmu::add() time, resulting in ctx_flexible_sched_in giving up on scheduling further groups with HW events. One way of avoiding this is to check pmu::filter_match() on siblings as well as the group leader. If any of these fail their pmu::filter_match() call, we must skip the entire group before attempting to add any events. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Fixes: 66eb579e66ec ("perf: allow for PMU-specific event filtering") Link: http://lkml.kernel.org/r/1465917041-15339-1-git-send-email-mark.rutland@arm.com [ Small readability edits. 
] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 85cd41878a74..43d43a2d5811 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1678,12 +1678,33 @@ static bool is_orphaned_event(struct perf_event *event) return event->state == PERF_EVENT_STATE_DEAD; } -static inline int pmu_filter_match(struct perf_event *event) +static inline int __pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; return pmu->filter_match ? pmu->filter_match(event) : 1; } +/* + * Check whether we should attempt to schedule an event group based on + * PMU-specific filtering. An event group can consist of HW and SW events, + * potentially with a SW leader, so we must check all the filters, to + * determine whether a group is schedulable: + */ +static inline int pmu_filter_match(struct perf_event *event) +{ + struct perf_event *child; + + if (!__pmu_filter_match(event)) + return 0; + + list_for_each_entry(child, &event->sibling_list, group_entry) { + if (!__pmu_filter_match(child)) + return 0; + } + + return 1; +} + static inline int event_filter_match(struct perf_event *event) { -- cgit v1.2.3 From 885885f6b88d22f81e67ee6a61561e480b27d27a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 17 Jun 2016 17:19:40 +0000 Subject: locking/static_keys: Fix non static symbol Sparse warning Fix the following sparse warnings: kernel/jump_label.c:473:23: warning: symbol 'jump_label_module_nb' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1466183980-8903-1-git-send-email-weiyj_lk@163.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..ac4ab953b49c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -422,7 +422,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, return notifier_from_errno(ret); } -struct notifier_block jump_label_module_nb = { +static struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, .priority = 1, /* higher than tracepoints */ }; -- cgit v1.2.3 From e675447bda51c1ea72d1ac9132ce3bed974f1da3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:15 +0000 Subject: timers: Make 'pinned' a timer property We want to move the timer migration logic from a 'push' to a 'pull' model. Under the current 'push' model pinned timers are handled via a runtime API variant: mod_timer_pinned(). The 'pull' model requires us to store the pinned attribute of a timer in the timer_list structure itself, as a new TIMER_PINNED bit in timer->flags. This flag must be set at initialization time and the timer APIs recognize the flag. This patch: - Implements the new flag and associated new-style initialization methods - makes mod_timer() recognize new-style pinned timers, - and adds some migration helper facility to allow step by step conversion of old-style to new-style pinned timers. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.049338558@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 25 ++++++++++++++++++++++--- kernel/time/timer.c | 10 +++++----- 2 files changed, 27 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/timer.h b/include/linux/timer.h index 20ac746f3eb3..046d6cf26498 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -62,7 +62,8 @@ struct timer_list { #define TIMER_MIGRATING 0x00080000 #define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) #define TIMER_DEFERRABLE 0x00100000 -#define TIMER_IRQSAFE 0x00200000 +#define TIMER_PINNED 0x00200000 +#define TIMER_IRQSAFE 0x00400000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ @@ -78,9 +79,15 @@ struct timer_list { #define TIMER_INITIALIZER(_function, _expires, _data) \ __TIMER_INITIALIZER((_function), (_expires), (_data), 0) +#define TIMER_PINNED_INITIALIZER(_function, _expires, _data) \ + __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_PINNED) + #define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) \ __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE) +#define TIMER_PINNED_DEFERRED_INITIALIZER(_function, _expires, _data) \ + __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE | TIMER_PINNED) + #define DEFINE_TIMER(_name, _function, _expires, _data) \ struct timer_list _name = \ TIMER_INITIALIZER(_function, _expires, _data) @@ -124,8 +131,12 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define init_timer(timer) \ __init_timer((timer), 0) +#define init_timer_pinned(timer) \ + __init_timer((timer), TIMER_PINNED) #define init_timer_deferrable(timer) \ __init_timer((timer), TIMER_DEFERRABLE) +#define init_timer_pinned_deferrable(timer) \ + __init_timer((timer), TIMER_DEFERRABLE | TIMER_PINNED) #define 
init_timer_on_stack(timer) \ __init_timer_on_stack((timer), 0) @@ -145,12 +156,20 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_timer(timer, fn, data) \ __setup_timer((timer), (fn), (data), 0) +#define setup_pinned_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), TIMER_PINNED) #define setup_deferrable_timer(timer, fn, data) \ __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE) +#define setup_pinned_deferrable_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) #define setup_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), 0) +#define setup_pinned_timer_on_stack(timer, fn, data) \ + __setup_timer_on_stack((timer), (fn), (data), TIMER_PINNED) #define setup_deferrable_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE) +#define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ + __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) /** * timer_pending - is a timer pending? 
@@ -175,8 +194,8 @@ extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); extern void set_timer_slack(struct timer_list *time, int slack_hz); -#define TIMER_NOT_PINNED 0 -#define TIMER_PINNED 1 +#define MOD_TIMER_NOT_PINNED 0 +#define MOD_TIMER_PINNED 1 /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3a95f9728778..693f6d14058e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -782,7 +782,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = get_target_base(base, pinned); + new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED); if (base != new_base) { /* @@ -825,7 +825,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); + return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer_pending); @@ -900,7 +900,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; - return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); + return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); @@ -928,7 +928,7 @@ int mod_timer_pinned(struct timer_list *timer, unsigned long expires) if (timer->expires == expires && timer_pending(timer)) return 1; - return __mod_timer(timer, expires, false, TIMER_PINNED); + return __mod_timer(timer, expires, false, MOD_TIMER_PINNED); } EXPORT_SYMBOL(mod_timer_pinned); @@ -1512,7 +1512,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); + __mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED); schedule(); 
del_singleshot_timer_sync(&timer); -- cgit v1.2.3 From 177ec0a0a531695210b277d734b2f92ee5796303 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:24 +0000 Subject: timers: Remove the deprecated mod_timer_pinned() API We switched all users to initialize the timers as pinned and call mod_timer(). Remove the now unused timer API function. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.706205231@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 3 --- kernel/time/timer.c | 39 +++++---------------------------------- 2 files changed, 5 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/timer.h b/include/linux/timer.h index 046d6cf26498..a8f6c70eb414 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -190,12 +190,9 @@ extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); -extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); extern void set_timer_slack(struct timer_list *time, int slack_hz); -#define MOD_TIMER_NOT_PINNED 0 -#define MOD_TIMER_PINNED 1 /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 693f6d14058e..ba49c1cf80f5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -764,8 +764,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) +__mod_timer(struct 
timer_list *timer, unsigned long expires, bool pending_only) { struct tvec_base *base, *new_base; unsigned long flags; @@ -782,7 +781,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED); + new_base = get_target_base(base, timer->flags & TIMER_PINNED); if (base != new_base) { /* @@ -825,7 +824,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED); + return __mod_timer(timer, expires, true); } EXPORT_SYMBOL(mod_timer_pending); @@ -900,38 +899,10 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; - return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED); + return __mod_timer(timer, expires, false); } EXPORT_SYMBOL(mod_timer); -/** - * mod_timer_pinned - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer_pinned() is a way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * and to ensure that the timer is scheduled on the current CPU. - * - * Note that this does not prevent the timer from being migrated - * when the current CPU goes offline. If this is a problem for - * you, use CPU-hotplug notifiers to handle it correctly, for - * example, cancelling the timer when the corresponding CPU goes - * offline. 
- * - * mod_timer_pinned(timer, expires) is equivalent to: - * - * del_timer(timer); timer->expires = expires; add_timer(timer); - */ -int mod_timer_pinned(struct timer_list *timer, unsigned long expires) -{ - if (timer->expires == expires && timer_pending(timer)) - return 1; - - return __mod_timer(timer, expires, false, MOD_TIMER_PINNED); -} -EXPORT_SYMBOL(mod_timer_pinned); - /** * add_timer - start a timer * @timer: the timer to be added @@ -1512,7 +1483,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED); + __mod_timer(&timer, expire, false); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.2.3 From 2b1ecc3d1a6b10f8fbac7f83d80db30b5a2c2791 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:25 +0000 Subject: signals: Use hrtimer for sigtimedwait() We've converted most timeout related syscalls to hrtimers, but sigtimedwait() did not get this treatment. Convert it so we get a reasonable accuracy and remove the user space exposure to the timer wheel properties. 
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Al Viro Cc: Arjan van de Ven Cc: Chris Mason Cc: Cyril Hrubis Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.787164909@linutronix.de Signed-off-by: Ingo Molnar --- kernel/signal.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 96e9bc40667f..af21afc00d08 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) * @ts: upper bound on process time suspension */ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, - const struct timespec *ts) + const struct timespec *ts) { + ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX }; struct task_struct *tsk = current; - long timeout = MAX_SCHEDULE_TIMEOUT; sigset_t mask = *which; - int sig; + int sig, ret = 0; if (ts) { if (!timespec_valid(ts)) return -EINVAL; - timeout = timespec_to_jiffies(ts); - /* - * We can be close to the next tick, add another one - * to ensure we will wait at least the time asked for. 
- */ - if (ts->tv_sec || ts->tv_nsec) - timeout++; + timeout = timespec_to_ktime(*ts); + to = &timeout; } /* @@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); sig = dequeue_signal(tsk, &mask, info); - if (!sig && timeout) { + if (!sig && timeout.tv64) { /* * None ready, temporarily unblock those we're interested * while we are sleeping in so that we'll be awakened when @@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); - timeout = freezable_schedule_timeout_interruptible(timeout); - + __set_current_state(TASK_INTERRUPTIBLE); + ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, + HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); @@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, if (sig) return sig; - return timeout ? -EINTR : -EAGAIN; + return ret ? -EINTR : -EAGAIN; } /** -- cgit v1.2.3 From 494af3ed7848de08640d98ee5aff57a45c137c3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:28 +0000 Subject: timers: Give a few structs and members proper names Some of the names in the internal implementation of the timer code are no longer correct and others are simply too long to type. Clean it up before we switch the wheel implementation over to the new scheme. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.948752516@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 118 ++++++++++++++++++++++++++-------------------------- 1 file changed, 59 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ba49c1cf80f5..f259a3ef4577 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -77,10 +77,10 @@ struct tvec_root { struct hlist_head vec[TVR_SIZE]; }; -struct tvec_base { +struct timer_base { spinlock_t lock; struct timer_list *running_timer; - unsigned long timer_jiffies; + unsigned long clk; unsigned long next_timer; unsigned long active_timers; unsigned long all_timers; @@ -95,7 +95,7 @@ struct tvec_base { } ____cacheline_aligned; -static DEFINE_PER_CPU(struct tvec_base, tvec_bases); +static DEFINE_PER_CPU(struct timer_base, timer_bases); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) unsigned int sysctl_timer_migration = 1; @@ -106,15 +106,15 @@ void timers_update_migration(bool update_nohz) unsigned int cpu; /* Avoid the loop, if nothing to update */ - if (this_cpu_read(tvec_bases.migration_enabled) == on) + if (this_cpu_read(timer_bases.migration_enabled) == on) return; for_each_possible_cpu(cpu) { - per_cpu(tvec_bases.migration_enabled, cpu) = on; + per_cpu(timer_bases.migration_enabled, cpu) = on; per_cpu(hrtimer_bases.migration_enabled, cpu) = on; if (!update_nohz) continue; - per_cpu(tvec_bases.nohz_active, cpu) = true; + per_cpu(timer_bases.nohz_active, cpu) = true; per_cpu(hrtimer_bases.nohz_active, cpu) = true; } } @@ -134,18 +134,18 @@ int timer_migration_handler(struct ctl_table *table, int write, return ret; } -static inline struct tvec_base *get_target_base(struct tvec_base *base, +static inline struct timer_base *get_target_base(struct timer_base *base, int pinned) { if (pinned || !base->migration_enabled) - return this_cpu_ptr(&tvec_bases); - return 
per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); + return this_cpu_ptr(&timer_bases); + return per_cpu_ptr(&timer_bases, get_nohz_timer_target()); } #else -static inline struct tvec_base *get_target_base(struct tvec_base *base, +static inline struct timer_base *get_target_base(struct timer_base *base, int pinned) { - return this_cpu_ptr(&tvec_bases); + return this_cpu_ptr(&timer_bases); } #endif @@ -371,10 +371,10 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) EXPORT_SYMBOL_GPL(set_timer_slack); static void -__internal_add_timer(struct tvec_base *base, struct timer_list *timer) +__internal_add_timer(struct timer_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; - unsigned long idx = expires - base->timer_jiffies; + unsigned long idx = expires - base->clk; struct hlist_head *vec; if (idx < TVR_SIZE) { @@ -394,7 +394,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) * Can happen if you add a timer with expires == jiffies, * or you set a timer to go off in the past */ - vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + vec = base->tv1.vec + (base->clk & TVR_MASK); } else { int i; /* If the timeout is larger than MAX_TVAL (on 64-bit @@ -403,7 +403,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) */ if (idx > MAX_TVAL) { idx = MAX_TVAL; - expires = idx + base->timer_jiffies; + expires = idx + base->clk; } i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; vec = base->tv5.vec + i; @@ -412,11 +412,11 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) hlist_add_head(&timer->entry, vec); } -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { /* Advance base->jiffies, if the base is empty */ if (!base->all_timers++) - base->timer_jiffies = jiffies; + base->clk = jiffies; __internal_add_timer(base, timer); /* @@ -707,7 +707,7 @@ 
static inline void detach_timer(struct timer_list *timer, bool clear_pending) } static inline void -detach_expired_timer(struct timer_list *timer, struct tvec_base *base) +detach_expired_timer(struct timer_list *timer, struct timer_base *base) { detach_timer(timer, true); if (!(timer->flags & TIMER_DEFERRABLE)) @@ -715,7 +715,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) base->all_timers--; } -static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, +static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { if (!timer_pending(timer)) @@ -725,16 +725,16 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, if (!(timer->flags & TIMER_DEFERRABLE)) { base->active_timers--; if (timer->expires == base->next_timer) - base->next_timer = base->timer_jiffies; + base->next_timer = base->clk; } /* If this was the last timer, advance base->jiffies */ if (!--base->all_timers) - base->timer_jiffies = jiffies; + base->clk = jiffies; return 1; } /* - * We are using hashed locking: holding per_cpu(tvec_bases).lock + * We are using hashed locking: holding per_cpu(timer_bases).lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. 
* @@ -744,16 +744,16 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, * When the timer's base is locked and removed from the list, the * TIMER_MIGRATING flag is set, FIXME */ -static struct tvec_base *lock_timer_base(struct timer_list *timer, +static struct timer_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { for (;;) { u32 tf = timer->flags; - struct tvec_base *base; + struct timer_base *base; if (!(tf & TIMER_MIGRATING)) { - base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); + base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK); spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; @@ -766,7 +766,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { - struct tvec_base *base, *new_base; + struct timer_base *base, *new_base; unsigned long flags; int ret = 0; @@ -933,8 +933,8 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); - struct tvec_base *base; + struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu); + struct timer_base *base; unsigned long flags; timer_stats_timer_set_start_info(timer); @@ -975,7 +975,7 @@ EXPORT_SYMBOL_GPL(add_timer_on); */ int del_timer(struct timer_list *timer) { - struct tvec_base *base; + struct timer_base *base; unsigned long flags; int ret = 0; @@ -1001,7 +1001,7 @@ EXPORT_SYMBOL(del_timer); */ int try_to_del_timer_sync(struct timer_list *timer) { - struct tvec_base *base; + struct timer_base *base; unsigned long flags; int ret = -1; @@ -1085,7 +1085,7 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static int cascade(struct tvec_base *base, struct tvec *tv, int index) +static int cascade(struct timer_base *base, struct tvec *tv, int index) { /* cascade all the 
timers from tv up one level */ struct timer_list *timer; @@ -1149,7 +1149,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), } } -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) +#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) /** * __run_timers - run all expired timers (if any) on this CPU. @@ -1158,23 +1158,23 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), * This function cascades all vectors and executes all expired timer * vectors. */ -static inline void __run_timers(struct tvec_base *base) +static inline void __run_timers(struct timer_base *base) { struct timer_list *timer; spin_lock_irq(&base->lock); - while (time_after_eq(jiffies, base->timer_jiffies)) { + while (time_after_eq(jiffies, base->clk)) { struct hlist_head work_list; struct hlist_head *head = &work_list; int index; if (!base->all_timers) { - base->timer_jiffies = jiffies; + base->clk = jiffies; break; } - index = base->timer_jiffies & TVR_MASK; + index = base->clk & TVR_MASK; /* * Cascade timers: @@ -1184,7 +1184,7 @@ static inline void __run_timers(struct tvec_base *base) (!cascade(base, &base->tv3, INDEX(1))) && !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; + ++base->clk; hlist_move_list(base->tv1.vec + index, head); while (!hlist_empty(head)) { void (*fn)(unsigned long); @@ -1222,16 +1222,16 @@ static inline void __run_timers(struct tvec_base *base) * is used on S/390 to stop all activity when a CPU is idle. * This function needs to be called with interrupts disabled. 
*/ -static unsigned long __next_timer_interrupt(struct tvec_base *base) +static unsigned long __next_timer_interrupt(struct timer_base *base) { - unsigned long timer_jiffies = base->timer_jiffies; - unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; + unsigned long clk = base->clk; + unsigned long expires = clk + NEXT_TIMER_MAX_DELTA; int index, slot, array, found = 0; struct timer_list *nte; struct tvec *varray[4]; /* Look for timer events in tv1. */ - index = slot = timer_jiffies & TVR_MASK; + index = slot = clk & TVR_MASK; do { hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { if (nte->flags & TIMER_DEFERRABLE) @@ -1250,8 +1250,8 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base) cascade: /* Calculate the next cascade event */ if (index) - timer_jiffies += TVR_SIZE - index; - timer_jiffies >>= TVR_BITS; + clk += TVR_SIZE - index; + clk >>= TVR_BITS; /* Check tv2-tv5. */ varray[0] = &base->tv2; @@ -1262,7 +1262,7 @@ cascade: for (array = 0; array < 4; array++) { struct tvec *varp = varray[array]; - index = slot = timer_jiffies & TVN_MASK; + index = slot = clk & TVN_MASK; do { hlist_for_each_entry(nte, varp->vec + slot, entry) { if (nte->flags & TIMER_DEFERRABLE) @@ -1286,8 +1286,8 @@ cascade: } while (slot != index); if (index) - timer_jiffies += TVN_SIZE - index; - timer_jiffies >>= TVN_BITS; + clk += TVN_SIZE - index; + clk >>= TVN_BITS; } return expires; } @@ -1335,7 +1335,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { - struct tvec_base *base = this_cpu_ptr(&tvec_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases); u64 expires = KTIME_MAX; unsigned long nextevt; @@ -1348,7 +1348,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) spin_lock(&base->lock); if (base->active_timers) { - if (time_before_eq(base->next_timer, base->timer_jiffies)) + if (time_before_eq(base->next_timer, base->clk)) base->next_timer = 
__next_timer_interrupt(base); nextevt = base->next_timer; if (time_before_eq(nextevt, basej)) @@ -1387,9 +1387,9 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = this_cpu_ptr(&tvec_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases); - if (time_after_eq(jiffies, base->timer_jiffies)) + if (time_after_eq(jiffies, base->clk)) __run_timers(base); } @@ -1534,7 +1534,7 @@ signed long __sched schedule_timeout_idle(signed long timeout) EXPORT_SYMBOL(schedule_timeout_idle); #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) +static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { struct timer_list *timer; int cpu = new_base->cpu; @@ -1550,13 +1550,13 @@ static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *he static void migrate_timers(int cpu) { - struct tvec_base *old_base; - struct tvec_base *new_base; + struct timer_base *old_base; + struct timer_base *new_base; int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu_ptr(&tvec_bases, cpu); - new_base = get_cpu_ptr(&tvec_bases); + old_base = per_cpu_ptr(&timer_bases, cpu); + new_base = get_cpu_ptr(&timer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
@@ -1580,7 +1580,7 @@ static void migrate_timers(int cpu) spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); - put_cpu_ptr(&tvec_bases); + put_cpu_ptr(&timer_bases); } static int timer_cpu_notify(struct notifier_block *self, @@ -1608,13 +1608,13 @@ static inline void timer_register_cpu_notifier(void) { } static void __init init_timer_cpu(int cpu) { - struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); + struct timer_base *base = per_cpu_ptr(&timer_bases, cpu); base->cpu = cpu; spin_lock_init(&base->lock); - base->timer_jiffies = jiffies; - base->next_timer = base->timer_jiffies; + base->clk = jiffies; + base->next_timer = base->clk; } static void __init init_timer_cpus(void) -- cgit v1.2.3 From 500462a9de657f86edaa102f8ab6bff7f7e43fc2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:30 +0000 Subject: timers: Switch to a non-cascading wheel The current timer wheel has some drawbacks: 1) Cascading: Cascading can be an unbound operation and is completely pointless in most cases because the vast majority of the timer wheel timers are canceled or rearmed before expiration. (They are used as timeout safeguards, not as real timers to measure time.) 2) No fast lookup of the next expiring timer: In NOHZ scenarios the first timer soft interrupt after a long NOHZ period must fast forward the base time to the current value of jiffies. As we have no way to find the next expiring timer fast, the code loops linearly and increments the base time one by one and checks for expired timers in each step. This causes unbound overhead spikes exactly in the moment when we should wake up as fast as possible. After a thorough analysis of real world data gathered on laptops, workstations, webservers and other machines (thanks Chris!) I came to the conclusion that the current 'classic' timer wheel implementation can be modified to address the above issues. The vast majority of timer wheel timers is canceled or rearmed before expiry. 
Most of them are timeouts for networking and other I/O tasks. The nature of timeouts is to catch the exception from normal operation (TCP ack timed out, disk does not respond, etc.). For these kinds of timeouts the accuracy of the timeout is not really a concern. Timeouts are very often approximate worst-case values and in case the timeout fires, we already waited for a long time and performance is down the drain already. The few timers which actually expire can be split into two categories: 1) Short expiry times which expect halfway accurate expiry 2) Long term expiry times are inaccurate today already due to the batching which is done for NOHZ automatically and also via the set_timer_slack() API. So for long term expiry timers we can avoid the cascading property and just leave them in the less granular outer wheels until expiry or cancelation. Timers which are armed with a timeout larger than the wheel capacity are no longer cascaded. We expire them with the longest possible timeout (6+ days). We have not observed such timeouts in our data collection, but at least we handle them, applying the rule of the least surprise. To avoid extending the wheel levels for HZ=1000 so we can accommodate the longest observed timeouts (5 days in the network conntrack code) we reduce the first level granularity on HZ=1000 to 4ms, which effectively is the same as the HZ=250 behaviour. From our data analysis there is nothing which relies on that 1ms granularity and as a side effect we get better batching and timer locality for the networking code as well. Contrary to the classic wheel the granularity of the next wheel is not the capacity of the first wheel. The granularities of the wheels are in the currently chosen setting 8 times the granularity of the previous wheel.
So for HZ=250 we end up with the following granularity levels: Level Offset Granularity Range 0 0 4 ms 0 ms - 252 ms 1 64 32 ms 256 ms - 2044 ms (256ms - ~2s) 2 128 256 ms 2048 ms - 16380 ms (~2s - ~16s) 3 192 2048 ms (~2s) 16384 ms - 131068 ms (~16s - ~2m) 4 256 16384 ms (~16s) 131072 ms - 1048572 ms (~2m - ~17m) 5 320 131072 ms (~2m) 1048576 ms - 8388604 ms (~17m - ~2h) 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) That's a worst case inaccuracy of 12.5% for the timers which are queued at the beginning of a level. So the new wheel concept addresses the old issues: 1) Cascading is avoided completely 2) By keeping the timers in the bucket until expiry/cancelation we can track the buckets which have timers enqueued in a bucket bitmap and therefore can look up the next expiring timer very fast and O(1). A further benefit of the concept is that the slack calculation which is done on every timer start is no longer necessary because the granularity levels provide natural batching already. Our extensive testing with various loads did not show any performance degradation vs. the current wheel implementation. This patch does not address the 'fast lookup' issue as we wanted to make sure that there is no regression introduced by the wheel redesign. The optimizations are in follow up patches. This patch contains fixes from Anna-Maria Gleixner and Richard Cochran. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.108621834@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 2 + kernel/time/timer.c | 829 ++++++++++++++++++++++++++++---------------------- 2 files changed, 469 insertions(+), 362 deletions(-) (limited to 'kernel') diff --git a/include/linux/timer.h b/include/linux/timer.h index 989f33d16ebf..5869ab9848fe 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -64,6 +64,8 @@ struct timer_list { #define TIMER_DEFERRABLE 0x00080000 #define TIMER_PINNED 0x00100000 #define TIMER_IRQSAFE 0x00200000 +#define TIMER_ARRAYSHIFT 22 +#define TIMER_ARRAYMASK 0xFFC00000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f259a3ef4577..86e95b72665d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -59,43 +59,151 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); /* - * per-CPU timer vector definitions: + * The timer wheel has LVL_DEPTH array levels. Each level provides an array of + * LVL_SIZE buckets. Each level is driven by its own clock and therefor each + * level has a different granularity. + * + * The level granularity is: LVL_CLK_DIV ^ lvl + * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) + * + * The array level of a newly armed timer depends on the relative expiry + * time. The farther the expiry time is away the higher the array level and + * therefor the granularity becomes. + * + * Contrary to the original timer wheel implementation, which aims for 'exact' + * expiry of the timers, this implementation removes the need for recascading + * the timers into the lower array levels. 
The previous 'classic' timer wheel + * implementation of the kernel already violated the 'exact' expiry by adding + * slack to the expiry time to provide batched expiration. The granularity + * levels provide implicit batching. + * + * This is an optimization of the original timer wheel implementation for the + * majority of the timer wheel use cases: timeouts. The vast majority of + * timeout timers (networking, disk I/O ...) are canceled before expiry. If + * the timeout expires it indicates that normal operation is disturbed, so it + * does not matter much whether the timeout comes with a slight delay. + * + * The only exception to this are networking timers with a small expiry + * time. They rely on the granularity. Those fit into the first wheel level, + * which has HZ granularity. + * + * We don't have cascading anymore. timers with a expiry time above the + * capacity of the last wheel level are force expired at the maximum timeout + * value of the last wheel level. From data sampling we know that the maximum + * value observed is 5 days (network connection tracking), so this should not + * be an issue. + * + * The currently chosen array constants values are a good compromise between + * array size and granularity. 
+ * + * This results in the following granularity and range levels: + * + * HZ 1000 steps + * Level Offset Granularity Range + * 0 0 1 ms 0 ms - 63 ms + * 1 64 8 ms 64 ms - 511 ms + * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) + * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) + * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) + * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) + * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) + * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) + * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) + * + * HZ 300 + * Level Offset Granularity Range + * 0 0 3 ms 0 ms - 210 ms + * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) + * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) + * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) + * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) + * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) + * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) + * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) + * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) + * + * HZ 250 + * Level Offset Granularity Range + * 0 0 4 ms 0 ms - 255 ms + * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) + * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) + * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) + * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) + * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) + * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) + * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) + * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) + * + * HZ 100 + * Level Offset Granularity Range + * 0 0 10 ms 0 ms - 630 ms + * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) + * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) + * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) + * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - 
~43m) + * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) + * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) + * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) */ -#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) -#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) -#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) - -struct tvec { - struct hlist_head vec[TVN_SIZE]; -}; -struct tvec_root { - struct hlist_head vec[TVR_SIZE]; -}; +/* Clock divisor for the next level */ +#define LVL_CLK_SHIFT 3 +#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) +#define LVL_CLK_MASK (LVL_CLK_DIV - 1) +#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) +#define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) + +/* + * The time start value for each level to select the bucket at enqueue + * time. + */ +#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) + +/* Size of each clock level */ +#define LVL_BITS 6 +#define LVL_SIZE (1UL << LVL_BITS) +#define LVL_MASK (LVL_SIZE - 1) +#define LVL_OFFS(n) ((n) * LVL_SIZE) + +/* Level depth */ +#if HZ > 100 +# define LVL_DEPTH 9 +# else +# define LVL_DEPTH 8 +#endif + +/* The cutoff (max. capacity of the wheel) */ +#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) +#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) + +/* + * The resulting wheel size. If NOHZ is configured we allocate two + * wheels so we have a separate storage for the deferrable timers. 
+ */ +#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) + +#ifdef CONFIG_NO_HZ_COMMON +# define NR_BASES 2 +# define BASE_STD 0 +# define BASE_DEF 1 +#else +# define NR_BASES 1 +# define BASE_STD 0 +# define BASE_DEF 0 +#endif struct timer_base { - spinlock_t lock; - struct timer_list *running_timer; - unsigned long clk; - unsigned long next_timer; - unsigned long active_timers; - unsigned long all_timers; - int cpu; - bool migration_enabled; - bool nohz_active; - struct tvec_root tv1; - struct tvec tv2; - struct tvec tv3; - struct tvec tv4; - struct tvec tv5; + spinlock_t lock; + struct timer_list *running_timer; + unsigned long clk; + unsigned int cpu; + bool migration_enabled; + bool nohz_active; + DECLARE_BITMAP(pending_map, WHEEL_SIZE); + struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; - -static DEFINE_PER_CPU(struct timer_base, timer_bases); +static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) unsigned int sysctl_timer_migration = 1; @@ -106,15 +214,17 @@ void timers_update_migration(bool update_nohz) unsigned int cpu; /* Avoid the loop, if nothing to update */ - if (this_cpu_read(timer_bases.migration_enabled) == on) + if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on) return; for_each_possible_cpu(cpu) { - per_cpu(timer_bases.migration_enabled, cpu) = on; + per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; + per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on; per_cpu(hrtimer_bases.migration_enabled, cpu) = on; if (!update_nohz) continue; - per_cpu(timer_bases.nohz_active, cpu) = true; + per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; + per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true; per_cpu(hrtimer_bases.nohz_active, cpu) = true; } } @@ -133,20 +243,6 @@ int timer_migration_handler(struct ctl_table *table, int write, mutex_unlock(&mutex); return ret; } - -static inline struct timer_base *get_target_base(struct timer_base *base, - int 
pinned) -{ - if (pinned || !base->migration_enabled) - return this_cpu_ptr(&timer_bases); - return per_cpu_ptr(&timer_bases, get_nohz_timer_target()); -} -#else -static inline struct timer_base *get_target_base(struct timer_base *base, - int pinned) -{ - return this_cpu_ptr(&timer_bases); -} #endif static unsigned long round_jiffies_common(unsigned long j, int cpu, @@ -370,78 +466,91 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); +static inline unsigned int timer_get_idx(struct timer_list *timer) +{ + return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; +} + +static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) +{ + timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | + idx << TIMER_ARRAYSHIFT; +} + +/* + * Helper function to calculate the array index for a given expiry + * time. + */ +static inline unsigned calc_index(unsigned expires, unsigned lvl) +{ + expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + return LVL_OFFS(lvl) + (expires & LVL_MASK); +} + static void __internal_add_timer(struct timer_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; - unsigned long idx = expires - base->clk; + unsigned long delta = expires - base->clk; struct hlist_head *vec; - - if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - vec = base->tv1.vec + i; - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - vec = base->tv2.vec + i; - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = base->tv3.vec + i; - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = base->tv4.vec + i; - } else if ((signed long) idx < 0) { - /* - * Can happen if you add a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - vec = base->tv1.vec + (base->clk & TVR_MASK); + unsigned int idx; 
+ + if (delta < LVL_START(1)) { + idx = calc_index(expires, 0); + } else if (delta < LVL_START(2)) { + idx = calc_index(expires, 1); + } else if (delta < LVL_START(3)) { + idx = calc_index(expires, 2); + } else if (delta < LVL_START(4)) { + idx = calc_index(expires, 3); + } else if (delta < LVL_START(5)) { + idx = calc_index(expires, 4); + } else if (delta < LVL_START(6)) { + idx = calc_index(expires, 5); + } else if (delta < LVL_START(7)) { + idx = calc_index(expires, 6); + } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { + idx = calc_index(expires, 7); + } else if ((long) delta < 0) { + idx = base->clk & LVL_MASK; } else { - int i; - /* If the timeout is larger than MAX_TVAL (on 64-bit - * architectures or with CONFIG_BASE_SMALL=1) then we - * use the maximum timeout. + /* + * Force expire obscene large timeouts to expire at the + * capacity limit of the wheel. */ - if (idx > MAX_TVAL) { - idx = MAX_TVAL; - expires = idx + base->clk; - } - i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = base->tv5.vec + i; - } + if (expires >= WHEEL_TIMEOUT_CUTOFF) + expires = WHEEL_TIMEOUT_MAX; + idx = calc_index(expires, LVL_DEPTH - 1); + } + /* + * Enqueue the timer into the array bucket, mark it pending in + * the bitmap and store the index in the timer flags. + */ + vec = base->vectors + idx; hlist_add_head(&timer->entry, vec); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { - /* Advance base->jiffies, if the base is empty */ - if (!base->all_timers++) - base->clk = jiffies; - __internal_add_timer(base, timer); - /* - * Update base->active_timers and base->next_timer - */ - if (!(timer->flags & TIMER_DEFERRABLE)) { - if (!base->active_timers++ || - time_before(timer->expires, base->next_timer)) - base->next_timer = timer->expires; - } /* * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. 
- * We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to stop its tick can not - * evaluate the timer wheel. + * to be triggered to reevaluate the timer wheel. We are + * protected against the other CPU fiddling with the timer by + * holding the timer base lock. This also makes sure that a + * CPU on the way to stop its tick can not evaluate the timer + * wheel. * * Spare the IPI for deferrable timers on idle targets though. * The next busy ticks will take care of it. Except full dynticks * require special care against races with idle_cpu(), lets deal * with that later. */ - if (base->nohz_active) { + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) { if (!(timer->flags & TIMER_DEFERRABLE) || tick_nohz_full_cpu(base->cpu)) wake_up_nohz_cpu(base->cpu); @@ -706,54 +815,87 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending) entry->next = LIST_POISON2; } -static inline void -detach_expired_timer(struct timer_list *timer, struct timer_base *base) -{ - detach_timer(timer, true); - if (!(timer->flags & TIMER_DEFERRABLE)) - base->active_timers--; - base->all_timers--; -} - static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { + unsigned idx = timer_get_idx(timer); + if (!timer_pending(timer)) return 0; + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) + __clear_bit(idx, base->pending_map); + detach_timer(timer, clear_pending); - if (!(timer->flags & TIMER_DEFERRABLE)) { - base->active_timers--; - if (timer->expires == base->next_timer) - base->next_timer = base->clk; - } - /* If this was the last timer, advance base->jiffies */ - if (!--base->all_timers) - base->clk = jiffies; return 1; } +static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) +{ + struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); + + /* + * If the timer is deferrable and nohz 
is active then we need to use + * the deferrable base. + */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && + (tflags & TIMER_DEFERRABLE)) + base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); + return base; +} + +static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * If the timer is deferrable and nohz is active then we need to use + * the deferrable base. + */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && + (tflags & TIMER_DEFERRABLE)) + base = this_cpu_ptr(&timer_bases[BASE_DEF]); + return base; +} + +static inline struct timer_base *get_timer_base(u32 tflags) +{ + return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); +} + +static inline struct timer_base *get_target_base(struct timer_base *base, + unsigned tflags) +{ +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) + if ((tflags & TIMER_PINNED) || !base->migration_enabled) + return get_timer_this_cpu_base(tflags); + return get_timer_cpu_base(tflags, get_nohz_timer_target()); +#else + return get_timer_this_cpu_base(tflags); +#endif +} + /* - * We are using hashed locking: holding per_cpu(timer_bases).lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. + * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means + * that all timers which are tied to this base are locked, and the base itself + * is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could - * be found on ->tvX lists. + * be found in the base->vectors array. * - * When the timer's base is locked and removed from the list, the - * TIMER_MIGRATING flag is set, FIXME + * When a timer is migrating then the TIMER_MIGRATING flag is set and we need + * to wait until the migration is done. 
*/ static struct timer_base *lock_timer_base(struct timer_list *timer, - unsigned long *flags) + unsigned long *flags) __acquires(timer->base->lock) { for (;;) { - u32 tf = timer->flags; struct timer_base *base; + u32 tf = timer->flags; if (!(tf & TIMER_MIGRATING)) { - base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK); + base = get_timer_base(tf); spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; @@ -770,6 +912,27 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) unsigned long flags; int ret = 0; + /* + * TODO: Calculate the array bucket of the timer right here w/o + * holding the base lock. This allows to check not only + * timer->expires == expires below, but also whether the timer + * ends up in the same bucket. If we really need to requeue + * the timer then we check whether base->clk have + * advanced between here and locking the timer base. If + * jiffies advanced we have to recalc the array bucket with the + * lock held. + */ + + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer_pending(timer)) { + if (timer->expires == expires) + return 1; + } + timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -781,15 +944,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) debug_activate(timer, expires); - new_base = get_target_base(base, timer->flags & TIMER_PINNED); + new_base = get_target_base(base, timer->flags); if (base != new_base) { /* - * We are trying to schedule the timer on the local CPU. + * We are trying to schedule the timer on the new base. * However we can't change timer's base while it is running, * otherwise del_timer_sync() can't detect that the timer's - * handler yet has not finished. This also guarantees that - * the timer is serialized wrt itself. + * handler yet has not finished. 
This also guarantees that the + * timer is serialized wrt itself. */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ @@ -828,45 +991,6 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires) } EXPORT_SYMBOL(mod_timer_pending); -/* - * Decide where to put the timer while taking the slack into account - * - * Algorithm: - * 1) calculate the maximum (absolute) time - * 2) calculate the highest bit where the expires and new max are different - * 3) use this bit to make a mask - * 4) use the bitmask to round down the maximum time, so that all last - * bits are zeros - */ -static inline -unsigned long apply_slack(struct timer_list *timer, unsigned long expires) -{ - unsigned long expires_limit, mask; - int bit; - - if (timer->slack >= 0) { - expires_limit = expires + timer->slack; - } else { - long delta = expires - jiffies; - - if (delta < 256) - return expires; - - expires_limit = expires + delta / 256; - } - mask = expires ^ expires_limit; - if (mask == 0) - return expires; - - bit = __fls(mask); - - mask = (1UL << bit) - 1; - - expires_limit = expires_limit & ~(mask); - - return expires_limit; -} - /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -889,16 +1013,6 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) */ int mod_timer(struct timer_list *timer, unsigned long expires) { - expires = apply_slack(timer, expires); - - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: - */ - if (timer_pending(timer) && timer->expires == expires) - return 1; - return __mod_timer(timer, expires, false); } EXPORT_SYMBOL(mod_timer); @@ -933,13 +1047,14 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu); - struct timer_base *base; + struct timer_base *new_base, *base; unsigned 
long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); + new_base = get_timer_cpu_base(timer->flags, cpu); + /* * If @timer was on a different CPU, it should be migrated with the * old base locked to prevent other operations proceeding with the @@ -1085,27 +1200,6 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static int cascade(struct timer_base *base, struct tvec *tv, int index) -{ - /* cascade all the timers from tv up one level */ - struct timer_list *timer; - struct hlist_node *tmp; - struct hlist_head tv_list; - - hlist_move_list(tv->vec + index, &tv_list); - - /* - * We are removing _all_ timers from the list, so we - * don't have to detach them individually. - */ - hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { - /* No accounting, while moving them */ - __internal_add_timer(base, timer); - } - - return index; -} - static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), unsigned long data) { @@ -1149,68 +1243,80 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), } } -#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) +static void expire_timers(struct timer_base *base, struct hlist_head *head) +{ + while (!hlist_empty(head)) { + struct timer_list *timer; + void (*fn)(unsigned long); + unsigned long data; + + timer = hlist_entry(head->first, struct timer_list, entry); + timer_stats_account_timer(timer); + + base->running_timer = timer; + detach_timer(timer, true); + + fn = timer->function; + data = timer->data; + + if (timer->flags & TIMER_IRQSAFE) { + spin_unlock(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock(&base->lock); + } else { + spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock_irq(&base->lock); + } + } +} + +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + unsigned long clk = base->clk; + struct 
hlist_head *vec; + int i, levels = 0; + unsigned int idx; + + for (i = 0; i < LVL_DEPTH; i++) { + idx = (clk & LVL_MASK) + i * LVL_SIZE; + + if (__test_and_clear_bit(idx, base->pending_map)) { + vec = base->vectors + idx; + hlist_move_list(vec, heads++); + levels++; + } + /* Is it time to look at the next level? */ + if (clk & LVL_CLK_MASK) + break; + /* Shift clock for the next level granularity */ + clk >>= LVL_CLK_SHIFT; + } + return levels; +} /** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. - * - * This function cascades all vectors and executes all expired timer - * vectors. */ static inline void __run_timers(struct timer_base *base) { - struct timer_list *timer; + struct hlist_head heads[LVL_DEPTH]; + int levels; + + if (!time_after_eq(jiffies, base->clk)) + return; spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->clk)) { - struct hlist_head work_list; - struct hlist_head *head = &work_list; - int index; - if (!base->all_timers) { - base->clk = jiffies; - break; - } - - index = base->clk & TVR_MASK; + levels = collect_expired_timers(base, heads); + base->clk++; - /* - * Cascade timers: - */ - if (!index && - (!cascade(base, &base->tv2, INDEX(0))) && - (!cascade(base, &base->tv3, INDEX(1))) && - !cascade(base, &base->tv4, INDEX(2))) - cascade(base, &base->tv5, INDEX(3)); - ++base->clk; - hlist_move_list(base->tv1.vec + index, head); - while (!hlist_empty(head)) { - void (*fn)(unsigned long); - unsigned long data; - bool irqsafe; - - timer = hlist_entry(head->first, struct timer_list, entry); - fn = timer->function; - data = timer->data; - irqsafe = timer->flags & TIMER_IRQSAFE; - - timer_stats_account_timer(timer); - - base->running_timer = timer; - detach_expired_timer(timer, base); - - if (irqsafe) { - spin_unlock(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock(&base->lock); - } else { - spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); - 
spin_lock_irq(&base->lock); - } - } + while (levels--) + expire_timers(base, heads + levels); } base->running_timer = NULL; spin_unlock_irq(&base->lock); @@ -1218,78 +1324,87 @@ static inline void __run_timers(struct timer_base *base) #ifdef CONFIG_NO_HZ_COMMON /* - * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a CPU is idle. - * This function needs to be called with interrupts disabled. + * Find the next pending bucket of a level. Search from @offset + @clk upwards + * and if nothing there, search from start of the level (@offset) up to + * @offset + clk. + */ +static int next_pending_bucket(struct timer_base *base, unsigned offset, + unsigned clk) +{ + unsigned pos, start = offset + clk; + unsigned end = offset + LVL_SIZE; + + pos = find_next_bit(base->pending_map, end, start); + if (pos < end) + return pos - start; + + pos = find_next_bit(base->pending_map, start, offset); + return pos < start ? pos + LVL_SIZE - start : -1; +} + +/* + * Search the first expiring timer in the various clock levels. */ static unsigned long __next_timer_interrupt(struct timer_base *base) { - unsigned long clk = base->clk; - unsigned long expires = clk + NEXT_TIMER_MAX_DELTA; - int index, slot, array, found = 0; - struct timer_list *nte; - struct tvec *varray[4]; - - /* Look for timer events in tv1. */ - index = slot = clk & TVR_MASK; - do { - hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (nte->flags & TIMER_DEFERRABLE) - continue; - - found = 1; - expires = nte->expires; - /* Look at the cascade bucket(s)? 
*/ - if (!index || slot < index) - goto cascade; - return expires; + unsigned long clk, next, adj; + unsigned lvl, offset = 0; + + spin_lock(&base->lock); + next = base->clk + NEXT_TIMER_MAX_DELTA; + clk = base->clk; + for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { + int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + + if (pos >= 0) { + unsigned long tmp = clk + (unsigned long) pos; + + tmp <<= LVL_SHIFT(lvl); + if (time_before(tmp, next)) + next = tmp; } - slot = (slot + 1) & TVR_MASK; - } while (slot != index); - -cascade: - /* Calculate the next cascade event */ - if (index) - clk += TVR_SIZE - index; - clk >>= TVR_BITS; - - /* Check tv2-tv5. */ - varray[0] = &base->tv2; - varray[1] = &base->tv3; - varray[2] = &base->tv4; - varray[3] = &base->tv5; - - for (array = 0; array < 4; array++) { - struct tvec *varp = varray[array]; - - index = slot = clk & TVN_MASK; - do { - hlist_for_each_entry(nte, varp->vec + slot, entry) { - if (nte->flags & TIMER_DEFERRABLE) - continue; - - found = 1; - if (time_before(nte->expires, expires)) - expires = nte->expires; - } - /* - * Do we still search for the first timer or are - * we looking up the cascade buckets ? - */ - if (found) { - /* Look at the cascade bucket(s)? */ - if (!index || slot < index) - break; - return expires; - } - slot = (slot + 1) & TVN_MASK; - } while (slot != index); - - if (index) - clk += TVN_SIZE - index; - clk >>= TVN_BITS; + /* + * Clock for the next level. If the current level clock lower + * bits are zero, we look at the next level as is. If not we + * need to advance it by one because that's going to be the + * next expiring bucket in that level. base->clk is the next + * expiring jiffie. So in case of: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 0 + * + * we have to look at all levels @index 0. With + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 2 + * + * LVL0 has the next expiring bucket @index 2. The upper + * levels have the next expiring bucket @index 1. 
+ * + * In case that the propagation wraps the next level the same + * rules apply: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 F 2 + * + * So after looking at LVL0 we get: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 + * 0 0 0 1 0 + * + * So no propagation from LVL1 to LVL2 because that happened + * with the add already, but then we need to propagate further + * from LVL2 to LVL3. + * + * So the simple check whether the lower bits of the current + * level are 0 or not is sufficient for all cases. + */ + adj = clk & LVL_CLK_MASK ? 1 : 0; + clk >>= LVL_CLK_SHIFT; + clk += adj; } - return expires; + spin_unlock(&base->lock); + return next; } /* @@ -1335,7 +1450,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { - struct timer_base *base = this_cpu_ptr(&timer_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); u64 expires = KTIME_MAX; unsigned long nextevt; @@ -1346,17 +1461,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (cpu_is_offline(smp_processor_id())) return expires; - spin_lock(&base->lock); - if (base->active_timers) { - if (time_before_eq(base->next_timer, base->clk)) - base->next_timer = __next_timer_interrupt(base); - nextevt = base->next_timer; - if (time_before_eq(nextevt, basej)) - expires = basem; - else - expires = basem + (nextevt - basej) * TICK_NSEC; - } - spin_unlock(&base->lock); + nextevt = __next_timer_interrupt(base); + if (time_before_eq(nextevt, basej)) + expires = basem; + else + expires = basem + (nextevt - basej) * TICK_NSEC; return cmp_next_hrtimer_event(basem, expires); } @@ -1387,10 +1496,11 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct timer_base *base = this_cpu_ptr(&timer_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); - if (time_after_eq(jiffies, base->clk)) - __run_timers(base); + __run_timers(base); + if 
(IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) + __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); } /* @@ -1541,7 +1651,6 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h while (!hlist_empty(head)) { timer = hlist_entry(head->first, struct timer_list, entry); - /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); @@ -1552,35 +1661,29 @@ static void migrate_timers(int cpu) { struct timer_base *old_base; struct timer_base *new_base; - int i; + int b, i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu_ptr(&timer_bases, cpu); - new_base = get_cpu_ptr(&timer_bases); - /* - * The caller is globally serialized and nobody else - * takes two locks at once, deadlock is not possible. - */ - spin_lock_irq(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - - BUG_ON(old_base->running_timer); - - for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); - for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); - } - old_base->active_timers = 0; - old_base->all_timers = 0; + for (b = 0; b < NR_BASES; b++) { + old_base = per_cpu_ptr(&timer_bases[b], cpu); + new_base = get_cpu_ptr(&timer_bases[b]); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. 
+ */ + spin_lock_irq(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + BUG_ON(old_base->running_timer); + + for (i = 0; i < WHEEL_SIZE; i++) + migrate_timer_list(new_base, old_base->vectors + i); - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); - put_cpu_ptr(&timer_bases); + spin_unlock(&old_base->lock); + spin_unlock_irq(&new_base->lock); + put_cpu_ptr(&timer_bases); + } } static int timer_cpu_notify(struct notifier_block *self, @@ -1608,13 +1711,15 @@ static inline void timer_register_cpu_notifier(void) { } static void __init init_timer_cpu(int cpu) { - struct timer_base *base = per_cpu_ptr(&timer_bases, cpu); - - base->cpu = cpu; - spin_lock_init(&base->lock); + struct timer_base *base; + int i; - base->clk = jiffies; - base->next_timer = base->clk; + for (i = 0; i < NR_BASES; i++) { + base = per_cpu_ptr(&timer_bases[i], cpu); + base->cpu = cpu; + spin_lock_init(&base->lock); + base->clk = jiffies; + } } static void __init init_timer_cpus(void) -- cgit v1.2.3 From 53bf837b78d155b8e1110b3c25b4d0d6391b8ff3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:31 +0000 Subject: timers: Remove set_timer_slack() leftovers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We now have implicit batching in the timer wheel. The slack API is no longer used, so remove it. Signed-off-by: Thomas Gleixner Cc: Alan Stern Cc: Andrew F. Davis Cc: Arjan van de Ven Cc: Chris Mason Cc: David S. Miller Cc: David Woodhouse Cc: Dmitry Eremin-Solenikov Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Greg Kroah-Hartman Cc: Jaehoon Chung Cc: Jens Axboe Cc: John Stultz Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Mathias Nyman Cc: Pali Rohár Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sebastian Reichel Cc: Ulf Hansson Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: linux-usb@vger.kernel.org Cc: netdev@vger.kernel.org Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.189813118@linutronix.de Signed-off-by: Ingo Molnar --- block/genhd.c | 5 ----- drivers/mmc/host/jz4740_mmc.c | 2 -- drivers/power/bq27xxx_battery.c | 5 +---- drivers/usb/host/ohci-hcd.c | 1 - drivers/usb/host/xhci.c | 2 -- include/linux/timer.h | 4 ---- kernel/time/timer.c | 19 ------------------- lib/random32.c | 1 - 8 files changed, 1 insertion(+), 38 deletions(-) (limited to 'kernel') diff --git a/block/genhd.c b/block/genhd.c index 9f42526b4d62..f06d7f3b075b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1523,12 +1523,7 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now) if (--ev->block) goto out_unlock; - /* - * Not exactly a latency critical operation, set poll timer - * slack to 25% and kick event check. - */ intv = disk_events_poll_jiffies(disk); - set_timer_slack(&ev->dwork.timer, intv / 4); if (check_now) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, 0); diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index 03ddf0ecf402..684087db170b 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -1068,8 +1068,6 @@ static int jz4740_mmc_probe(struct platform_device* pdev) jz4740_mmc_clock_disable(host); setup_timer(&host->timeout_timer, jz4740_mmc_timeout, (unsigned long)host); - /* It is not important when it times out, it just needs to timeout. 
*/ - set_timer_slack(&host->timeout_timer, HZ); host->use_dma = true; if (host->use_dma && jz4740_mmc_acquire_dma_channels(host) != 0) diff --git a/drivers/power/bq27xxx_battery.c b/drivers/power/bq27xxx_battery.c index 45f6ebf88df6..e90b3f307e0f 100644 --- a/drivers/power/bq27xxx_battery.c +++ b/drivers/power/bq27xxx_battery.c @@ -735,11 +735,8 @@ static void bq27xxx_battery_poll(struct work_struct *work) bq27xxx_battery_update(di); - if (poll_interval > 0) { - /* The timer does not have to be accurate. */ - set_timer_slack(&di->work.timer, poll_interval * HZ / 4); + if (poll_interval > 0) schedule_delayed_work(&di->work, poll_interval * HZ); - } } /* diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index 0449235d4f22..1700908b84ef 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -500,7 +500,6 @@ static int ohci_init (struct ohci_hcd *ohci) setup_timer(&ohci->io_watchdog, io_watchdog_func, (unsigned long) ohci); - set_timer_slack(&ohci->io_watchdog, msecs_to_jiffies(20)); ohci->hcca = dma_alloc_coherent (hcd->self.controller, sizeof(*ohci->hcca), &ohci->hcca_dma, GFP_KERNEL); diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index f2f9518c53ab..a986fe763d87 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -490,8 +490,6 @@ static void compliance_mode_recovery_timer_init(struct xhci_hcd *xhci) xhci->comp_mode_recovery_timer.expires = jiffies + msecs_to_jiffies(COMP_MODE_RCVRY_MSECS); - set_timer_slack(&xhci->comp_mode_recovery_timer, - msecs_to_jiffies(COMP_MODE_RCVRY_MSECS)); add_timer(&xhci->comp_mode_recovery_timer); xhci_dbg_trace(xhci, trace_xhci_dbg_quirks, "Compliance mode recovery timer initialized"); diff --git a/include/linux/timer.h b/include/linux/timer.h index 5869ab9848fe..4419506b564e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -19,7 +19,6 @@ struct timer_list { void (*function)(unsigned long); unsigned long data; u32 flags; - int slack; #ifdef 
CONFIG_TIMER_STATS int start_pid; @@ -73,7 +72,6 @@ struct timer_list { .expires = (_expires), \ .data = (_data), \ .flags = (_flags), \ - .slack = -1, \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ } @@ -193,8 +191,6 @@ extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); -extern void set_timer_slack(struct timer_list *time, int slack_hz); - /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 86e95b72665d..a83e23d0bc25 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -447,24 +447,6 @@ unsigned long round_jiffies_up_relative(unsigned long j) } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); -/** - * set_timer_slack - set the allowed slack for a timer - * @timer: the timer to be modified - * @slack_hz: the amount of time (in jiffies) allowed for rounding - * - * Set the amount of time, in jiffies, that a certain timer has - * in terms of slack. By setting this value, the timer subsystem - * will schedule the actual timer somewhere between - * the time mod_timer() asks for, and that time plus the slack. - * - * By setting the slack to -1, a percentage of the delay is used - * instead. 
- */ -void set_timer_slack(struct timer_list *timer, int slack_hz) -{ - timer->slack = slack_hz; -} -EXPORT_SYMBOL_GPL(set_timer_slack); static inline unsigned int timer_get_idx(struct timer_list *timer) { @@ -775,7 +757,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, { timer->entry.pprev = NULL; timer->flags = flags | raw_smp_processor_id(); - timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; timer->start_pid = -1; diff --git a/lib/random32.c b/lib/random32.c index 510d1ce7d4d2..69ed593aab07 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -233,7 +233,6 @@ static void __prandom_timer(unsigned long dontcare) static void __init __prandom_start_seed_timer(void) { - set_timer_slack(&seed_timer, HZ); seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC); add_timer(&seed_timer); } -- cgit v1.2.3 From 73420fea80c6c376d91a69defe64013baa0d7e95 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:33 +0000 Subject: timers: Move __run_timers() function Move __run_timers() below __next_timer_interrupt() and next_pending_bucket() in preparation for __run_timers() NOHZ optimization. No functional change. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.271872665@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a83e23d0bc25..c16c48de01c5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1277,32 +1277,6 @@ static int collect_expired_timers(struct timer_base *base, return levels; } -/** - * __run_timers - run all expired timers (if any) on this CPU. - * @base: the timer vector to be processed. - */ -static inline void __run_timers(struct timer_base *base) -{ - struct hlist_head heads[LVL_DEPTH]; - int levels; - - if (!time_after_eq(jiffies, base->clk)) - return; - - spin_lock_irq(&base->lock); - - while (time_after_eq(jiffies, base->clk)) { - - levels = collect_expired_timers(base, heads); - base->clk++; - - while (levels--) - expire_timers(base, heads + levels); - } - base->running_timer = NULL; - spin_unlock_irq(&base->lock); -} - #ifdef CONFIG_NO_HZ_COMMON /* * Find the next pending bucket of a level. Search from @offset + @clk upwards @@ -1472,6 +1446,32 @@ void update_process_times(int user_tick) run_posix_cpu_timers(p); } +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + */ +static inline void __run_timers(struct timer_base *base) +{ + struct hlist_head heads[LVL_DEPTH]; + int levels; + + if (!time_after_eq(jiffies, base->clk)) + return; + + spin_lock_irq(&base->lock); + + while (time_after_eq(jiffies, base->clk)) { + + levels = collect_expired_timers(base, heads); + base->clk++; + + while (levels--) + expire_timers(base, heads + levels); + } + base->running_timer = NULL; + spin_unlock_irq(&base->lock); +} + /* * This function runs timers and the timer-tq in bottom half context. 
*/ -- cgit v1.2.3 From 236968383cf5cd48835ff0d8a265e299e220d140 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:34 +0000 Subject: timers: Optimize collect_expired_timers() for NOHZ After a NOHZ idle sleep the timer wheel must be forwarded to current jiffies. There might be expired timers so the current code loops and checks the expired buckets for timers. This can take quite some time for long NOHZ idle periods. The pending bitmask in the timer base allows us to do a quick search for the next expiring timer and therefore a fast forward of the base time which prevents pointless long lasting loops. For a 3 seconds idle sleep this reduces the catchup time from ~1ms to 5us. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.351296290@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index c16c48de01c5..658051c97a3c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1252,8 +1252,8 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) } } -static int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) +static int __collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) { unsigned long clk = base->clk; struct hlist_head *vec; @@ -1279,9 +1279,9 @@ static int collect_expired_timers(struct timer_base *base, #ifdef CONFIG_NO_HZ_COMMON /* - * Find the next pending bucket of a level. 
Search from @offset + @clk upwards - * and if nothing there, search from start of the level (@offset) up to - * @offset + clk. + * Find the next pending bucket of a level. Search from level start (@offset) + * + @clk upwards and if nothing there, search from start of the level + * (@offset) up to @offset + clk. */ static int next_pending_bucket(struct timer_base *base, unsigned offset, unsigned clk) @@ -1298,14 +1298,14 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset, } /* - * Search the first expiring timer in the various clock levels. + * Search the first expiring timer in the various clock levels. Caller must + * hold base->lock. */ static unsigned long __next_timer_interrupt(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; - spin_lock(&base->lock); next = base->clk + NEXT_TIMER_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { @@ -1358,7 +1358,6 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk >>= LVL_CLK_SHIFT; clk += adj; } - spin_unlock(&base->lock); return next; } @@ -1416,7 +1415,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (cpu_is_offline(smp_processor_id())) return expires; + spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); + spin_unlock(&base->lock); + if (time_before_eq(nextevt, basej)) expires = basem; else @@ -1424,6 +1426,37 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) return cmp_next_hrtimer_event(basem, expires); } + +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + /* + * NOHZ optimization. After a long idle sleep we need to forward the + * base to current jiffies. Avoid a loop by searching the bitfield for + * the next expiring timer. 
+ */ + if ((long)(jiffies - base->clk) > 2) { + unsigned long next = __next_timer_interrupt(base); + + /* + * If the next timer is ahead of time forward to current + * jiffies, otherwise forward to the next expiry time. + */ + if (time_after(next, jiffies)) { + /* The call site will increment clock! */ + base->clk = jiffies - 1; + return 0; + } + base->clk = next; + } + return __collect_expired_timers(base, heads); +} +#else +static inline int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + return __collect_expired_timers(base, heads); +} #endif /* -- cgit v1.2.3 From ff00673292bd42a3688b33de47252a6a3c3f424c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:35 +0000 Subject: timers/nohz: Remove pointless tick_nohz_kick_tick() function This was a failed attempt to optimize the timer expiry in idle, which was disabled and never revisited. Remove the cruft. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.431073782@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 536ada80f6dd..69abc7bfe80f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1092,35 +1092,6 @@ static void tick_nohz_switch_to_nohz(void) tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } -/* - * When NOHZ is enabled and the tick is stopped, we need to kick the - * tick timer from irq_enter() so that the jiffies update is kept - * alive during long running softirqs. That's ugly as hell, but - * correctness is key even if we need to fix the offending softirq in - * the first place. 
- * - * Note, this is different to tick_nohz_restart. We just kick the - * timer and do not touch the other magic bits which need to be done - * when idle is left. - */ -static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) -{ -#if 0 - /* Switch back to 2.6.27 behaviour */ - ktime_t delta; - - /* - * Do not touch the tick device, when the next expiry is either - * already reached or less/equal than the tick period. - */ - delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); - if (delta.tv64 <= tick_period.tv64) - return; - - tick_nohz_restart(ts, now); -#endif -} - static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); @@ -1131,10 +1102,8 @@ static inline void tick_nohz_irq_enter(void) now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) { + if (ts->tick_stopped) tick_nohz_update_jiffies(now); - tick_nohz_kick_tick(ts, now); - } } #else -- cgit v1.2.3 From a683f390b93f4d1292f849fc48d28e322046120f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:36 +0000 Subject: timers: Forward the wheel clock whenever possible The wheel clock is stale when a CPU goes into a long idle sleep. This has the side effect that timers which are queued end up in the outer wheel levels. That results in coarser granularity. To solve this, we keep track of the idle state and forward the wheel clock whenever possible. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.512039360@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 1 + kernel/time/tick-sched.c | 12 +++++ kernel/time/timer.c | 128 ++++++++++++++++++++++++++++++++++++-------- 3 files changed, 120 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 966a5a6fdd0a..f738251000fe 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { } DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); +void timer_clear_idle(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69abc7bfe80f..5d81f9aa30d2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { tick.tv64 = 0; + + /* + * Tell the timer code that the base is not idle, i.e. undo + * the effect of get_next_timer_interrupt(): + */ + timer_clear_idle(); /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. 
@@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) tick_do_update_jiffies64(now); cpu_load_update_nohz_stop(); + /* + * Clear the timer idle flag, so we avoid IPIs on remote queueing and + * the clock forward checks in the enqueue path: + */ + timer_clear_idle(); + calc_load_exit_idle(); touch_softlockup_watchdog_sched(); /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 658051c97a3c..9339d71ee998 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -196,9 +196,11 @@ struct timer_base { spinlock_t lock; struct timer_list *running_timer; unsigned long clk; + unsigned long next_expiry; unsigned int cpu; bool migration_enabled; bool nohz_active; + bool is_idle; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -519,24 +521,37 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer { __internal_add_timer(base, timer); + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + return; + /* - * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. We are - * protected against the other CPU fiddling with the timer by - * holding the timer base lock. This also makes sure that a - * CPU on the way to stop its tick can not evaluate the timer - * wheel. - * - * Spare the IPI for deferrable timers on idle targets though. - * The next busy ticks will take care of it. Except full dynticks - * require special care against races with idle_cpu(), lets deal - * with that later. + * TODO: This wants some optimizing similar to the code below, but we + * will do that when we switch from push to pull for deferrable timers. 
*/ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) { - if (!(timer->flags & TIMER_DEFERRABLE) || - tick_nohz_full_cpu(base->cpu)) + if (timer->flags & TIMER_DEFERRABLE) { + if (tick_nohz_full_cpu(base->cpu)) wake_up_nohz_cpu(base->cpu); + return; } + + /* + * We might have to IPI the remote CPU if the base is idle and the + * timer is not deferrable. If the other CPU is on the way to idle + * then it can't set base->is_idle as we hold the base lock: + */ + if (!base->is_idle) + return; + + /* Check whether this is the new first expiring timer: */ + if (time_after_eq(timer->expires, base->next_expiry)) + return; + + /* + * Set the next expiry time and kick the CPU so it can reevaluate the + * wheel: + */ + base->next_expiry = timer->expires; + wake_up_nohz_cpu(base->cpu); } #ifdef CONFIG_TIMER_STATS @@ -844,10 +859,11 @@ static inline struct timer_base *get_timer_base(u32 tflags) return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } -static inline struct timer_base *get_target_base(struct timer_base *base, - unsigned tflags) +#ifdef CONFIG_NO_HZ_COMMON +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) { -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP if ((tflags & TIMER_PINNED) || !base->migration_enabled) return get_timer_this_cpu_base(tflags); return get_timer_cpu_base(tflags, get_nohz_timer_target()); @@ -856,6 +872,43 @@ static inline struct timer_base *get_target_base(struct timer_base *base, #endif } +static inline void forward_timer_base(struct timer_base *base) +{ + /* + * We only forward the base when it's idle and we have a delta between + * base clock and jiffies. + */ + if (!base->is_idle || (long) (jiffies - base->clk) < 2) + return; + + /* + * If the next expiry value is > jiffies, then we fast forward to + * jiffies otherwise we forward to the next expiry value. 
+ */ + if (time_after(base->next_expiry, jiffies)) + base->clk = jiffies; + else + base->clk = base->next_expiry; +} +#else +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) +{ + return get_timer_this_cpu_base(tflags); +} + +static inline void forward_timer_base(struct timer_base *base) { } +#endif + +static inline struct timer_base * +get_target_base(struct timer_base *base, unsigned tflags) +{ + struct timer_base *target = __get_target_base(base, tflags); + + forward_timer_base(target); + return target; +} + /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means * that all timers which are tied to this base are locked, and the base itself @@ -1417,16 +1470,49 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); - spin_unlock(&base->lock); + base->next_expiry = nextevt; + /* + * We have a fresh next event. Check whether we can forward the base: + */ + if (time_after(nextevt, jiffies)) + base->clk = jiffies; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; - if (time_before_eq(nextevt, basej)) + if (time_before_eq(nextevt, basej)) { expires = basem; - else + base->is_idle = false; + } else { expires = basem + (nextevt - basej) * TICK_NSEC; + /* + * If we expect to sleep more than a tick, mark the base idle: + */ + if ((expires - basem) > TICK_NSEC) + base->is_idle = true; + } + spin_unlock(&base->lock); return cmp_next_hrtimer_event(basem, expires); } +/** + * timer_clear_idle - Clear the idle state of the timer base + * + * Called with interrupts disabled + */ +void timer_clear_idle(void) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * We do this unlocked. 
The worst outcome is a remote enqueue sending + * a pointless IPI, but taking the lock would just make the window for + * sending the IPI a few instructions smaller for the cost of taking + * the lock in the exit from idle path. + */ + base->is_idle = false; +} + static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { @@ -1440,7 +1526,7 @@ static int collect_expired_timers(struct timer_base *base, /* * If the next timer is ahead of time forward to current - * jiffies, otherwise forward to the next expiry time. + * jiffies, otherwise forward to the next expiry time: */ if (time_after(next, jiffies)) { /* The call site will increment clock! */ -- cgit v1.2.3 From 4e85876a9d2a977b4a07389da8c07edf76d10825 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:37 +0000 Subject: timers: Only wake softirq if necessary With the wheel forwarding in place and with the HZ=1000 4ms folding we can avoid running the softirq at all. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.607650550@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 9339d71ee998..8d830f1f6a6a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1608,7 +1608,18 @@ static void run_timer_softirq(struct softirq_action *h) */ void run_local_timers(void) { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + hrtimer_run_queues(); + /* Raise the softirq only if required. */ + if (time_before(jiffies, base->clk)) { + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + return; + /* CPU is awake, so check the deferrable base. 
*/ + base++; + if (time_before(jiffies, base->clk)) + return; + } raise_softirq(TIMER_SOFTIRQ); } -- cgit v1.2.3 From ffdf047728f8f93df896b58049c7513856027141 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:39 +0000 Subject: timers: Split out index calculation For further optimizations we need to separate index calculation from queueing. No functional change. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.691159619@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8d830f1f6a6a..8d7c23e55c85 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -471,12 +471,9 @@ static inline unsigned calc_index(unsigned expires, unsigned lvl) return LVL_OFFS(lvl) + (expires & LVL_MASK); } -static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer) +static int calc_wheel_index(unsigned long expires, unsigned long clk) { - unsigned long expires = timer->expires; - unsigned long delta = expires - base->clk; - struct hlist_head *vec; + unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { @@ -496,7 +493,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer) } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { idx = calc_index(expires, 7); } else if ((long) delta < 0) { - idx = base->clk & LVL_MASK; + idx = clk & LVL_MASK; } else { /* * Force expire obscene large timeouts to expire at the @@ -507,20 +504,33 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer) idx =
calc_index(expires, LVL_DEPTH - 1); } - /* - * Enqueue the timer into the array bucket, mark it pending in - * the bitmap and store the index in the timer flags. - */ - vec = base->vectors + idx; - hlist_add_head(&timer->entry, vec); + return idx; +} + +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap and store the index in the timer flags. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx) +{ + hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); } -static void internal_add_timer(struct timer_base *base, struct timer_list *timer) +static void +__internal_add_timer(struct timer_base *base, struct timer_list *timer) { - __internal_add_timer(base, timer); + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk); + enqueue_timer(base, timer, idx); +} +static void +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) +{ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) return; @@ -551,7 +561,14 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer * wheel: */ base->next_expiry = timer->expires; - wake_up_nohz_cpu(base->cpu); + wake_up_nohz_cpu(base->cpu); +} + +static void +internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ + __internal_add_timer(base, timer); + trigger_dyntick_cpu(base, timer); } #ifdef CONFIG_TIMER_STATS -- cgit v1.2.3 From f00c0afdfa625165a609513bc74164d56752ec3e Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:40 +0000 Subject: timers: Implement optimization for same expiry time in mod_timer() The existing optimization for same expiry time in mod_timer() checks whether the timer expiry time is the same as the new requested expiry time. 
In the old timer wheel implementation this does not take the slack batching into account, neither does the new implementation evaluate whether the new expiry time will requeue the timer to the same bucket. To optimize that, we can calculate the resulting bucket and check if the new expiry time is different from the current expiry time. This calculation happens outside the base lock held region. If the resulting bucket is the same we can avoid taking the base lock and requeueing the timer. If the timer needs to be requeued then we have to check under the base lock whether the base time has changed between the lockless calculation and taking the lock. If it has changed we need to recalculate under the lock. This optimization takes effect for timers which are enqueued into the less granular wheel levels (1 and above). With a simple test case the functionality has been verified: Before After Match: 5.5% 86.6% Requeue: 94.5% 13.4% Recalc: <0.01% In the non optimized case the timer is requeued in 94.5% of the cases. With the index optimization in place the requeue rate drops to 13.4%. The case where the lockless index calculation has to be redone is less than 0.01%. With a real world test case (networking) we observed the following changes: Before After Match: 97.8% 99.7% Requeue: 2.2% 0.3% Recalc: <0.001% That means two percent fewer lock/requeue/unlock operations done in one of the hot path use cases of timers. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.778527749@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 51 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8d7c23e55c85..8f29abeb8c4d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -960,28 +960,36 @@ static inline int __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { struct timer_base *base, *new_base; - unsigned long flags; + unsigned int idx = UINT_MAX; + unsigned long clk = 0, flags; int ret = 0; /* - * TODO: Calculate the array bucket of the timer right here w/o - * holding the base lock. This allows to check not only - * timer->expires == expires below, but also whether the timer - * ends up in the same bucket. If we really need to requeue - * the timer then we check whether base->clk have - * advanced between here and locking the timer base. If - * jiffies advanced we have to recalc the array bucket with the - * lock held. - */ - - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: + * This is a common optimization triggered by the networking code - if + * the timer is re-modified to have the same timeout or ends up in the + * same array bucket then just return: */ if (timer_pending(timer)) { if (timer->expires == expires) return 1; + /* + * Take the current timer_jiffies of base, but without holding + * the lock! + */ + base = get_timer_base(timer->flags); + clk = base->clk; + + idx = calc_wheel_index(expires, clk); + + /* + * Retrieve and compare the array index of the pending + * timer. If it matches set the expiry to the new value so a + * subsequent call will exit in the expires check above. 
+ */ + if (idx == timer_get_idx(timer)) { + timer->expires = expires; + return 1; + } } timer_stats_timer_set_start_info(timer); @@ -1018,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } timer->expires = expires; - internal_add_timer(base, timer); + /* + * If 'idx' was calculated above and the base time did not advance + * between calculating 'idx' and taking the lock, only enqueue_timer() + * and trigger_dyntick_cpu() is required. Otherwise we need to + * (re)calculate the wheel index via internal_add_timer(). + */ + if (idx != UINT_MAX && clk == base->clk) { + enqueue_timer(base, timer, idx); + trigger_dyntick_cpu(base, timer); + } else { + internal_add_timer(base, timer); + } out_unlock: spin_unlock_irqrestore(&base->lock, flags); -- cgit v1.2.3 From 9acacc2ac525ef1397af63b15cef7bb77a823c06 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Mon, 20 Jun 2016 17:37:18 +0800 Subject: sched/cpuacct: Merge cpuacct_usage_index and cpuacct_stat_index enums These two types have similar function, no need to separate them. Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/436748885270d64363c7dc67167507d486c2057a.1466415271.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 41f85c4d0938..74241eb5f3ff 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,15 +25,13 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; -enum cpuacct_usage_index { - CPUACCT_USAGE_USER, /* ... user mode */ - CPUACCT_USAGE_SYSTEM, /* ... 
kernel mode */ - - CPUACCT_USAGE_NRUSAGE, +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", }; struct cpuacct_usage { - u64 usages[CPUACCT_USAGE_NRUSAGE]; + u64 usages[CPUACCT_STAT_NSTATS]; }; /* track cpu usage of a group of tasks and its child groups */ @@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) } static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; /* - * We allow index == CPUACCT_USAGE_NRUSAGE here to read + * We allow index == CPUACCT_STAT_NSTATS here to read * the sum of suages. */ - BUG_ON(index > CPUACCT_USAGE_NRUSAGE); + BUG_ON(index > CPUACCT_STAT_NSTATS); #ifndef CONFIG_64BIT /* @@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - if (index == CPUACCT_USAGE_NRUSAGE) { + if (index == CPUACCT_STAT_NSTATS) { int i = 0; data = 0; - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) data += cpuusage->usages[i]; } else { data = cpuusage->usages[index]; @@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) cpuusage->usages[i] = val; #ifndef CONFIG_64BIT @@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* return total cpu usage (in nanoseconds) of a group */ static u64 __cpuusage_read(struct cgroup_subsys_state *css, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; @@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css, static u64 cpuusage_user_read(struct 
cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_USER); + return __cpuusage_read(css, CPUACCT_STAT_USER); } static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); + return __cpuusage_read(css, CPUACCT_STAT_SYSTEM); } static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); + return __cpuusage_read(css, CPUACCT_STAT_NSTATS); } static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, @@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, } static int __cpuacct_percpu_seq_show(struct seq_file *m, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; @@ -229,24 +227,19 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m, static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER); } static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM); } static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); } -static const char * const cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); @@ -316,11 +309,11 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int index = CPUACCT_USAGE_SYSTEM; + int index = CPUACCT_STAT_SYSTEM; struct pt_regs *regs = 
task_pt_regs(tsk); if (regs && user_mode(regs)) - index = CPUACCT_USAGE_USER; + index = CPUACCT_STAT_USER; rcu_read_lock(); -- cgit v1.2.3 From 8e546bfafb3121ed25c73a0c02311ec58459344a Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Mon, 20 Jun 2016 17:37:19 +0800 Subject: sched/cpuacct: Use loop to consolidate code in cpuacct_stats_show() In cpuacct_stats_show() we currently have copies of similar code, for each cpustat(system/user) variant. Use a loop instead to consolidate the code. This will also work better if we extend the CPUACCT_STAT_NSTATS type. Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b0597d4224655e9f333f1a6224ed9654c7d7d36a.1466415271.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 74241eb5f3ff..677cd1ab33b7 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -243,27 +243,26 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); + s64 val[CPUACCT_STAT_NSTATS]; int cpu; - s64 val = 0; + int stat; + memset(val, 0, sizeof(val)); for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; - val = 0; - for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + val[CPUACCT_STAT_USER] +=
cpustat[CPUTIME_USER]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { + seq_printf(sf, "%s %lld\n", + cpuacct_stat_desc[stat], + cputime64_to_clock_t(val[stat])); + } return 0; } -- cgit v1.2.3 From 277a13e4f0d661678a7084bf97ed96a99c7dac21 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Mon, 20 Jun 2016 17:37:20 +0800 Subject: sched/cpuacct: Introduce cpuacct.usage_all to show all CPU stats together In current code, we can get cpuacct data from several files, but each file has various limitations. For example: - We can get CPU usage in user and kernel mode via cpuacct.stat, but we can't get detailed data about each CPU. - We can get each CPU's kernel mode usage in cpuacct.usage_percpu_sys, but we can't get user mode usage data at the same time. This patch introduces cpuacct.usage_all, to show all detailed CPU accounting data together: # cat cpuacct.usage_all cpu user system 0 3809760299 5807968992 1 3250329855 454612211 .. 
Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/7744460969edd7caaf0e903592ee52353ed9bdd6.1466415271.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 677cd1ab33b7..bc0b309c3f19 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -240,6 +240,42 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); } +static int cpuacct_all_seq_show(struct seq_file *m, void *V) +{ + struct cpuacct *ca = css_ca(seq_css(m)); + int index; + int cpu; + + seq_puts(m, "cpu"); + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %s", cpuacct_stat_desc[index]); + seq_puts(m, "\n"); + + for_each_possible_cpu(cpu) { + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + + seq_printf(m, "%d", cpu); + + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit + * platforms. 
+ */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); +#endif + + seq_printf(m, " %llu", cpuusage->usages[index]); + +#ifndef CONFIG_64BIT + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#endif + } + seq_puts(m, "\n"); + } + return 0; +} + static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); @@ -293,6 +329,10 @@ static struct cftype files[] = { .name = "usage_percpu_sys", .seq_show = cpuacct_percpu_sys_seq_show, }, + { + .name = "usage_all", + .seq_show = cpuacct_all_seq_show, + }, { .name = "stat", .seq_show = cpuacct_stats_show, -- cgit v1.2.3 From 748c7201e622d1c24abb4f85072d2e74d12f295f Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 3 Jun 2016 17:10:18 -0300 Subject: sched/core: Panic on scheduling while atomic bugs if kernel.panic_on_warn is set Currently, a schedule while atomic error prints the stack trace to the kernel log and the system continue running. Although it is possible to collect the kernel log messages and analyze it, often more information are needed. Furthermore, keep the system running is not always the best choice. For example, when the preempt count underflows the system will not stop to complain about scheduling while atomic, so the kernel log can wrap around overwriting the first stack trace, tuning the analysis even more challenging. This patch uses the kernel.panic_on_warn sysctl to help out on these more complex situations. When kernel.panic_on_warn is set to 1, the kernel will panic() in the schedule while atomic detection. The default value of the sysctl is 0, maintaining the current behavior. Signed-off-by: Daniel Bristot de Oliveira Reviewed-by: Luis Claudio R. Goncalves Cc: Christian Borntraeger Cc: Linus Torvalds Cc: Luis Claudio R. 
Goncalves Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e8f7b80f353aa22c63bd8557208163989af8493d.1464983675.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28da50a5bc76..4e9617a7e7d9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3168,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_cont("\n"); } #endif + if (panic_on_warn) + panic("scheduling while atomic\n"); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } -- cgit v1.2.3 From 44530d588e142a96cf0cd345a7cb8911c4f88720 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 10 Jul 2016 20:58:36 +0200 Subject: Revert "perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles on x86" This reverts commit 2c95afc1e83d93fac3be6923465e1753c2c53b0a. Stephane reported the following regression: > Since Andi added: > > commit 2c95afc1e83d93fac3be6923465e1753c2c53b0a > Author: Andi Kleen > Date: Thu Jun 9 06:14:38 2016 -0700 > > perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles on x86 > > $ perf stat -e ref-cycles ls > .... > > fails systematically because the ref-cycles is now used by the > watchdog and given this is a system-wide pinned event, it monopolizes > the fixed counter 2 which is the only counter able to measure this event. Since the next merge window is near, fix the regression for now by reverting the commit. 
Reported-by: Stephane Eranian Cc: Andi Kleen Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Vince Weaver Cc: Alexander Shishkin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 8 -------- include/linux/nmi.h | 1 - kernel/watchdog.c | 7 ------- 3 files changed, 16 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 016f4263fad4..7788ce643bf4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,16 +18,8 @@ #include #include #include -#include #ifdef CONFIG_HARDLOCKUP_DETECTOR -int hw_nmi_get_event(void) -{ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return PERF_COUNT_HW_REF_CPU_CYCLES; - return PERF_COUNT_HW_CPU_CYCLES; -} - u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 79858af27209..4630eeae18e0 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -66,7 +66,6 @@ static inline bool trigger_allbutself_cpu_backtrace(void) #ifdef CONFIG_LOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh); -int hw_nmi_get_event(void); extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8dd30fcd91be..9acb29f280ec 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -315,12 +315,6 @@ static int is_softlockup(unsigned long touch_ts) #ifdef CONFIG_HARDLOCKUP_DETECTOR -/* Can be overriden by architecture */ -__weak int hw_nmi_get_event(void) -{ - return PERF_COUNT_HW_CPU_CYCLES; -} - static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -610,7 +604,6 @@ static int watchdog_nmi_enable(unsigned int cpu) wd_attr = &wd_hw_attr; wd_attr->sample_period = 
hw_nmi_get_sample_period(watchdog_thresh); - wd_attr->config = hw_nmi_get_event(); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); -- cgit v1.2.3 From 2c13ce8f6b2f6fd9ba2f9261b1939fc0f62d1307 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Jul 2016 01:39:11 +0300 Subject: posix_cpu_timer: Exit early when process has been reaped Variable "now" seems to be genuinely used uninitialized if branch if (CPUCLOCK_PERTHREAD(timer->it_clock)) { is not taken and branch if (unlikely(sighand == NULL)) { is taken. In this case the process has been reaped and the timer is marked as disarmed anyway. So none of the postprocessing of the sample is required. Return right away. Signed-off-by: Alexey Dobriyan Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160707223911.GA26483@p183.telecom.by Signed-off-by: Thomas Gleixner --- kernel/time/posix-cpu-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1cafba860b08..39008d78927a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -777,6 +777,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) timer->it.cpu.expires = 0; sample_to_timespec(timer->it_clock, timer->it.cpu.expires, &itp->it_value); + return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); unlock_task_sighand(p, &flags); -- cgit v1.2.3 From a1b7b1a57b9919a0abb6c93fca04ac9cf840c992 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Sun, 3 Jul 2016 03:24:08 +0300 Subject: irqdomain: Fix irq_domain_alloc_irqs_recursive() error handling If an irq_domain is auto-recursive and irq_domain_alloc_irqs_recursive() for its parent has returned an error, then do return and avoid calling irq_domain_free_irqs_recursive() uselessly, because: - if domain->ops->alloc() had failed for an auto-recursive
irq_domain, then irq_domain_free_irqs_recursive() had already been called; - if domain->ops->alloc() had failed for a not auto-recursive irq_domain, then there is nothing to free at all. Signed-off-by: Alexander Popov Acked-by: Marc Zyngier Link: http://lkml.kernel.org/r/1467505448-2850-1-git-send-email-alex.popov@linux.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index a82853739694..4752b43662e0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1192,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, if (recursive) ret = irq_domain_alloc_irqs_recursive(parent, irq_base, nr_irqs, arg); - if (ret >= 0) - ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); + if (ret < 0) + return ret; + + ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); if (ret < 0 && recursive) irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); -- cgit v1.2.3 From a7c734140aa36413944eef0f8c660e0e2256357d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Jul 2016 21:59:23 +0200 Subject: cpu/hotplug: Keep enough storage space if SMP=n to avoid array out of bounds scribble Xiaolong Ye reported lock debug warnings triggered by the following commit: 8de4a0066106 ("perf/x86: Convert the core to the hotplug state machine") The bug is the following: the cpuhp_bp_states[] array is cut short when CONFIG_SMP=n, but the dynamically registered callbacks are stored nevertheless and happily scribble outside of the array bounds... We need to store them in case that the state is unregistered so we can invoke the teardown function. That's independent of CONFIG_SMP. Make sure the array is large enough. 
Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Cc: Adam Borowski Cc: Alexander Shishkin Cc: Anna-Maria Gleixner Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Stephane Eranian Cc: Vince Weaver Cc: lkp@01.org Cc: stable@vger.kernel.org Cc: tipbuild@zytor.com Fixes: cff7d378d3fd "cpu/hotplug: Convert to a state machine for the control processor" Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1607122144560.4083@nanos Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d948e44c471e..7b61887f7ccd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1201,6 +1201,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = takedown_cpu, .cant_stop = true, }, +#else + [CPUHP_BRINGUP_CPU] = { }, #endif }; -- cgit v1.2.3 From d60585c5766e9620d5d83e2b25dc042c7bdada2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Jul 2016 18:33:56 +0200 Subject: sched/core: Correct off by one bug in load migration calculation The move of calc_load_migrate() from CPU_DEAD to CPU_DYING did not take into account that the function is now called from a thread running on the outgoing CPU. As a result a cpu unplug leaks a load of 1 into the global load accounting mechanism. Fix it by adjusting for the currently running thread which calls calc_load_migrate().
Reported-by: Anton Blanchard Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Michael Ellerman Cc: Vaidyanathan Srinivasan Cc: rt@linutronix.de Cc: shreyas@linux.vnet.ibm.com Fixes: e9cd8fa4fcfd: ("sched/migration: Move calc_load_migrate() into CPU_DYING") Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1607121744350.4083@nanos Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++-- kernel/sched/loadavg.c | 8 ++++---- kernel/sched/sched.h | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51d7105f529a..97ee9ac7e97c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5394,13 +5394,15 @@ void idle_task_exit(void) /* * Since this CPU is going 'away' for a while, fold any nr_active delta * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. + * nr_active count is stable. We need to take the teardown thread which + * is calling this into account, so we hand in adjust = 1 to the load + * calculation. * * Also see the comment "Global load-average calculations". 
*/ static void calc_load_migrate(struct rq *rq) { - long delta = calc_load_fold_active(rq); + long delta = calc_load_fold_active(rq, 1); if (delta) atomic_long_add(delta, &calc_load_tasks); } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index b0b93fd33af9..a2d6eb71f06b 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) loads[2] = (avenrun[2] + offset) << shift; } -long calc_load_fold_active(struct rq *this_rq) +long calc_load_fold_active(struct rq *this_rq, long adjust) { long nr_active, delta = 0; - nr_active = this_rq->nr_running; + nr_active = this_rq->nr_running - adjust; nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { @@ -188,7 +188,7 @@ void calc_load_enter_idle(void) * We're going into NOHZ mode, if there's any pending delta, fold it * into the pending idle delta. */ - delta = calc_load_fold_active(this_rq); + delta = calc_load_fold_active(this_rq, 0); if (delta) { int idx = calc_load_write_idx(); @@ -389,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq) if (time_before(jiffies, this_rq->calc_load_update)) return; - delta = calc_load_fold_active(this_rq); + delta = calc_load_fold_active(this_rq, 0); if (delta) atomic_long_add(delta, &calc_load_tasks); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7cbeb92a1cb9..898c0d2f18fe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -28,7 +28,7 @@ extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); -extern long calc_load_fold_active(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust); #ifdef CONFIG_SMP extern void cpu_load_update_active(struct rq *this_rq); -- cgit v1.2.3 From 57430218317e5b280a80582a139b26029c25de6c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 13 Jul 2016 16:50:01 +0200 
Subject: sched/cputime: Count actually elapsed irq & softirq time Currently, if there was any irq or softirq time during 'ticks' jiffies, the entire period will be accounted as irq or softirq time. This is inaccurate if only a subset of the time was actually spent handling irqs, and could conceivably mis-count all of the ticks during a period as irq time, when there was some irq and some softirq time. This can actually happen when irqtime_account_process_tick is called from account_idle_ticks, which can pass a larger number of ticks down all at once. Fix this by changing irqtime_account_hi_update(), irqtime_account_si_update(), and steal_account_process_ticks() to work with cputime_t time units, and return the amount of time spent in each mode. Rename steal_account_process_ticks() to steal_account_process_time(), to reflect that time is now accounted in cputime_t, instead of ticks. Additionally, have irqtime_account_process_tick() take into account how much time was spent in each of steal, irq, and softirq time. The latter could help improve the accuracy of cputime accounting when returning from idle on a NO_HZ_IDLE CPU. Properly accounting how much time was spent in hardirq and softirq time will also allow the NO_HZ_FULL code to re-use these same functions for hardirq and softirq accounting. Signed-off-by: Rik van Riel [ Make nsecs_to_cputime64() actually return cputime64_t. 
] Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1468421405-20056-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/asm-generic/cputime_nsecs.h | 2 + kernel/sched/cputime.c | 124 ++++++++++++++++++++++-------------- 2 files changed, 79 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index 0f1c6f315cdc..a84e28e0c634 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -50,6 +50,8 @@ typedef u64 __nocast cputime64_t; (__force u64)(__ct) #define nsecs_to_cputime(__nsecs) \ (__force cputime_t)(__nsecs) +#define nsecs_to_cputime64(__nsecs) \ + (__force cputime64_t)(__nsecs) /* diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3d60e5d76fdb..db82ae12cf01 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -79,40 +79,50 @@ void irqtime_account_irq(struct task_struct *curr) } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static int irqtime_account_hi_update(void) +static cputime_t irqtime_account_hi_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t irq_cputime; local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; + irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - + cpustat[CPUTIME_IRQ]; + irq_cputime = min(irq_cputime, maxtime); + cpustat[CPUTIME_IRQ] += irq_cputime; local_irq_restore(flags); - return ret; + return irq_cputime; } -static int irqtime_account_si_update(void) +static cputime_t irqtime_account_si_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t softirq_cputime; 
local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; + softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - + cpustat[CPUTIME_SOFTIRQ]; + softirq_cputime = min(softirq_cputime, maxtime); + cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; local_irq_restore(flags); - return ret; + return softirq_cputime; } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ #define sched_clock_irqtime (0) +static cputime_t irqtime_account_hi_update(cputime_t dummy) +{ + return 0; +} + +static cputime_t irqtime_account_si_update(cputime_t dummy) +{ + return 0; +} + #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ static inline void task_group_account_field(struct task_struct *p, int index, @@ -257,31 +267,44 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } -static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies) +static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) { #ifdef CONFIG_PARAVIRT if (static_key_false(&paravirt_steal_enabled)) { + cputime_t steal_cputime; u64 steal; - unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; - /* - * steal is in nsecs but our caller is expecting steal - * time in jiffies. Lets cast the result to jiffies - * granularity and account the rest on the next rounds. - */ - steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies); - this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); + steal_cputime = min(nsecs_to_cputime(steal), maxtime); + account_steal_time(steal_cputime); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); - account_steal_time(jiffies_to_cputime(steal_jiffies)); - return steal_jiffies; + return steal_cputime; } #endif return 0; } +/* + * Account how much elapsed time was spent in steal, irq, or softirq time.
+ */ +static inline cputime_t account_other_time(cputime_t max) +{ + cputime_t accounted; + + accounted = steal_account_process_time(max); + + if (accounted < max) + accounted += irqtime_account_hi_update(max - accounted); + + if (accounted < max) + accounted += irqtime_account_si_update(max - accounted); + + return accounted; +} + /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live * tasks (sum on group iteration) belonging to @tsk's group. @@ -342,21 +365,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { - cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); - u64 cputime = (__force u64) cputime_one_jiffy; - u64 *cpustat = kcpustat_this_cpu->cpustat; + u64 cputime = (__force u64) cputime_one_jiffy * ticks; + cputime_t scaled, other; - if (steal_account_process_tick(ULONG_MAX)) + /* + * When returning from idle, many ticks can get accounted at + * once, including some ticks of steal, irq, and softirq time. + * Subtract those ticks from the amount of time accounted to + * idle, or potentially user or system time. Due to rounding, + * other time can exceed ticks occasionally. + */ + other = account_other_time(cputime); + if (other >= cputime) return; + cputime -= other; + scaled = cputime_to_scaled(cputime); - cputime *= ticks; - scaled *= ticks; - - if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += cputime; - } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += cputime; - } else if (this_cpu_ksoftirqd() == p) { + if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. 
@@ -466,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime_t cputime, scaled, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -477,16 +502,21 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - if (steal_account_process_tick(ULONG_MAX)) + cputime = cputime_one_jiffy; + steal = steal_account_process_time(cputime); + + if (steal >= cputime) return; + cputime -= steal; + scaled = cputime_to_scaled(cputime); + if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime, scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); + account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); else - account_idle_time(cputime_one_jiffy); + account_idle_time(cputime); } /* @@ -681,14 +711,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - unsigned long delta_jiffies, steal_jiffies; + cputime_t delta, steal; - delta_jiffies = now - tsk->vtime_snap; - steal_jiffies = steal_account_process_tick(delta_jiffies); + delta = jiffies_to_cputime(now - tsk->vtime_snap); + steal = steal_account_process_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; - return jiffies_to_cputime(delta_jiffies - steal_jiffies); + return delta - steal; } static void __vtime_account_system(struct task_struct *tsk) -- cgit v1.2.3 From b58c35840521bb02b150e1d0d34ca9197f8b7145 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 13 Jul 2016 16:50:02 +0200 Subject: sched/cputime: Replace VTIME_GEN irq time code with IRQ_TIME_ACCOUNTING code The CONFIG_VIRT_CPU_ACCOUNTING_GEN irq time 
tracking code does not appear to currently work right. On CPUs without nohz_full=, only tick based irq time sampling is done, which breaks down when dealing with a nohz_idle CPU. On firewalls and similar systems, no ticks may happen on a CPU for a while, and the irq time spent may never get accounted properly. This can cause issues with capacity planning and power saving, which use the CPU statistics as inputs in decision making. Remove the VTIME_GEN vtime irq time code, and replace it with the IRQ_TIME_ACCOUNTING code, when selected as a config option by the user. Signed-off-by: Rik van Riel Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1468421405-20056-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/vtime.h | 32 ++++++++++++++------------------ init/Kconfig | 6 +++--- kernel/sched/cputime.c | 16 +++------------- 3 files changed, 20 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/vtime.h b/include/linux/vtime.h index fa2196990f84..d1977d84ebdf 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -14,6 +14,18 @@ struct task_struct; */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline bool vtime_accounting_cpu_enabled(void) { return true; } + +#ifdef __ARCH_HAS_VTIME_ACCOUNT +extern void vtime_account_irq_enter(struct task_struct *tsk); +#else +extern void vtime_common_account_irq_enter(struct task_struct *tsk); +static inline void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (vtime_accounting_cpu_enabled()) + vtime_common_account_irq_enter(tsk); +} +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -64,17 +76,6 @@ extern void vtime_account_system(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); extern void 
vtime_account_user(struct task_struct *tsk); -#ifdef __ARCH_HAS_VTIME_ACCOUNT -extern void vtime_account_irq_enter(struct task_struct *tsk); -#else -extern void vtime_common_account_irq_enter(struct task_struct *tsk); -static inline void vtime_account_irq_enter(struct task_struct *tsk) -{ - if (vtime_accounting_cpu_enabled()) - vtime_common_account_irq_enter(tsk); -} -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ - #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ static inline void vtime_task_switch(struct task_struct *prev) { } @@ -85,13 +86,8 @@ static inline void vtime_account_irq_enter(struct task_struct *tsk) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void arch_vtime_task_switch(struct task_struct *tsk); -extern void vtime_gen_account_irq_exit(struct task_struct *tsk); - -static inline void vtime_account_irq_exit(struct task_struct *tsk) -{ - if (vtime_accounting_cpu_enabled()) - vtime_gen_account_irq_exit(tsk); -} +static inline void vtime_account_irq_enter(struct task_struct *tsk) { } +static inline void vtime_account_irq_exit(struct task_struct *tsk) { } extern void vtime_user_enter(struct task_struct *tsk); diff --git a/init/Kconfig b/init/Kconfig index c02d89777713..787dd76acf29 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -375,9 +375,11 @@ config VIRT_CPU_ACCOUNTING_GEN If unsure, say N. +endchoice + config IRQ_TIME_ACCOUNTING bool "Fine granularity task level IRQ time accounting" - depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL + depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE help Select this option to enable fine granularity task irq time accounting. This is done by reading a timestamp on each @@ -386,8 +388,6 @@ config IRQ_TIME_ACCOUNTING If in doubt, say N here. 
-endchoice - config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index db82ae12cf01..ca7e33cb0967 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -711,14 +711,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - cputime_t delta, steal; + cputime_t delta, other; delta = jiffies_to_cputime(now - tsk->vtime_snap); - steal = steal_account_process_time(delta); + other = account_other_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; - return delta - steal; + return delta - other; } static void __vtime_account_system(struct task_struct *tsk) @@ -738,16 +738,6 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_end(&tsk->vtime_seqcount); } -void vtime_gen_account_irq_exit(struct task_struct *tsk) -{ - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) - __vtime_account_system(tsk); - if (context_tracking_in_user()) - tsk->vtime_snap_whence = VTIME_USER; - write_seqcount_end(&tsk->vtime_seqcount); -} - void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; -- cgit v1.2.3 From 0cfdf9a198b0d4f5ad6c87d894db7830b796b2cc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Jul 2016 16:50:03 +0200 Subject: sched/cputime: Clean up the old vtime gen irqtime accounting completely Vtime generic irqtime accounting has been removed but there are a few remnants to clean up: * The vtime_accounting_cpu_enabled() check in irq entry was only used by CONFIG_VIRT_CPU_ACCOUNTING_GEN. We can safely remove it. * Without the vtime_accounting_cpu_enabled(), we no longer need to have a vtime_common_account_irq_enter() indirect function. * Move vtime_account_irq_enter() implementation under CONFIG_VIRT_CPU_ACCOUNTING_NATIVE which is the last user. 
* The vtime_account_user() call was only used on irq entry for CONFIG_VIRT_CPU_ACCOUNTING_GEN. We can remove that too. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1468421405-20056-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/vtime.h | 11 ----------- kernel/sched/cputime.c | 33 ++++++++++----------------------- 2 files changed, 10 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/vtime.h b/include/linux/vtime.h index d1977d84ebdf..65aef5e9d04e 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -14,18 +14,7 @@ struct task_struct; */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline bool vtime_accounting_cpu_enabled(void) { return true; } - -#ifdef __ARCH_HAS_VTIME_ACCOUNT extern void vtime_account_irq_enter(struct task_struct *tsk); -#else -extern void vtime_common_account_irq_enter(struct task_struct *tsk); -static inline void vtime_account_irq_enter(struct task_struct *tsk) -{ - if (vtime_accounting_cpu_enabled()) - vtime_common_account_irq_enter(tsk); -} -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ - #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ca7e33cb0967..16a873c203b1 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -431,6 +431,10 @@ void vtime_common_task_switch(struct task_struct *prev) } #endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement @@ -440,33 +444,16 @@ void vtime_common_task_switch(struct task_struct *prev) * vtime_account(). 
*/ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_common_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq_enter(struct task_struct *tsk) { - if (!in_interrupt()) { - /* - * If we interrupted user, context_tracking_in_user() - * is 1 because the context tracking don't hook - * on irq entry/exit. This way we know if - * we need to flush user time on kernel entry. - */ - if (context_tracking_in_user()) { - vtime_account_user(tsk); - return; - } - - if (is_idle_task(tsk)) { - vtime_account_idle(tsk); - return; - } - } - vtime_account_system(tsk); + if (!in_interrupt() && is_idle_task(tsk)) + vtime_account_idle(tsk); + else + vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; -- cgit v1.2.3 From 553bf6bbfd8a540c70aee28eb50e24caff456a03 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 13 Jul 2016 16:50:05 +0200 Subject: sched/cputime: Drop local_irq_save/restore from irqtime_account_irq() Paolo pointed out that irqs are already blocked when irqtime_account_irq() is called. That means there is no reason to call local_irq_save/restore() again. 
Suggested-by: Paolo Bonzini Signed-off-by: Rik van Riel Signed-off-by: Frederic Weisbecker Reviewed-by: Paolo Bonzini Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1468421405-20056-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 16a873c203b1..ea0f6f31a244 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); */ void irqtime_account_irq(struct task_struct *curr) { - unsigned long flags; s64 delta; int cpu; if (!sched_clock_irqtime) return; - local_irq_save(flags); - cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -75,7 +72,6 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); -- cgit v1.2.3 From d02038f972538b93011d78c068f44514fbde0a8c Mon Sep 17 00:00:00 2001 From: Florian Meier Date: Thu, 14 Jul 2016 12:07:26 -0700 Subject: gcov: add support for gcc version >= 6 Link: http://lkml.kernel.org/r/20160701130914.GA23225@styxhp Signed-off-by: Florian Meier Reviewed-by: Peter Oberparleiter Tested-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/gcc_4_7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index e25e92fb44fa..6a5c239c7669 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,7 @@ #include #include "gcov.h" -#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 +#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 #elif __GNUC__ == 4 && 
__GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 -- cgit v1.2.3 From 775be506266a860f141f6b848c92c316c602a94f Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 17 Jun 2016 16:56:14 +0100 Subject: clockevents: Make clockevents_subsys static The clockevents_subsys struct is used for sysfs support and is not declared or used outside the file it is defined in. Fix the following warning by making it static: kernel/time/clockevents.c:648:17: warning: symbol 'clockevents_subsys' was not declared. Should it be static? Signed-off-by: Ben Dooks Cc: linux-kernel@lists.codethink.co.uk Link: http://lkml.kernel.org/r/1466178974-7105-1-git-send-email-ben.dooks@codethink.co.uk Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a9b76a40319e..2c5bc77c0bb0 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu) #endif #ifdef CONFIG_SYSFS -struct bus_type clockevents_subsys = { +static struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; -- cgit v1.2.3 From eb0dc47ab6810c432e8193beccd9905ba0db8b22 Mon Sep 17 00:00:00 2001 From: Vincent Stehle Date: Mon, 18 Jul 2016 22:56:26 +0200 Subject: genirq: Fix missing irq allocation affinity hint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new affinity hint argument of __irq_domain_alloc_irqs() is missing in irq_reserve_ipi(). Add it. 
This fixes the following compilation error: kernel/irq/ipi.c: In function ‘irq_reserve_ipi’: kernel/irq/ipi.c:85:9: error: too few arguments to function ‘__irq_domain_alloc_irqs’ virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, ^ Fixes: 06ee6d571f0e ("genirq: Add affinity hint to irq allocation") Signed-off-by: Vincent Stehlé Cc: linux-pci@vger.kernel.org Cc: Christoph Hellwig Signed-off-by: Thomas Gleixner --- kernel/irq/ipi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 4fd23510d5f2..1a9abc1c8ea0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -83,7 +83,7 @@ int irq_reserve_ipi(struct irq_domain *domain, } virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, - (void *) dest, true); + (void *) dest, true, NULL); if (virq <= 0) { pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); -- cgit v1.2.3 From 1f3b0f8243cb934307f59bd4d8e43b868e61d4d9 Mon Sep 17 00:00:00 2001 From: Gaurav Jindal Date: Thu, 14 Jul 2016 12:04:20 +0000 Subject: tick/nohz: Optimize nohz idle enter tick_nohz_start_idle is called before checking whether the idle tick can be stopped. If the tick cannot be stopped, calling tick_nohz_start_idle() is pointless and just wasting CPU cycles. Only invoke tick_nohz_start_idle() when can_stop_idle_tick() returns true. A short one minute observation of the effect on ARM64 shows a reduction of calls by 1.5% thus optimizing the idle entry sequence. 
[tglx: Massaged changelog ] Co-developed-by: Sanjeev Yadav Signed-off-by: Gaurav Jindal Link: http://lkml.kernel.org/r/20160714120416.GB21099@gaurav.jindal@spreadtrum.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2ec7c00228f3..204fdc86863d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -908,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ktime_t now, expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(ts); - if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; + now = tick_nohz_start_idle(ts); ts->idle_calls++; expires = tick_nohz_stop_sched_tick(ts, now, cpu); -- cgit v1.2.3