From dbda92d16f8655044e082930e4e9d244b87fde77 Mon Sep 17 00:00:00 2001 From: "Bu, Yitian" Date: Mon, 18 Feb 2013 12:53:37 +0000 Subject: printk: Fix rq->lock vs logbuf_lock unlock lock inversion commit 07354eb1a74d1 ("locking printk: Annotate logbuf_lock as raw") reintroduced a lock inversion problem which was fixed in commit 0b5e1c5255 ("printk: Release console_sem after logbuf_lock"). This happened probably when fixing up patch rejects. Restore the ordering and unlock logbuf_lock before releasing console_sem. Signed-off-by: ybu Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/E807E903FE6CBE4D95E420FBFCC273B827413C@nasanexd01h.na.qualcomm.com Signed-off-by: Thomas Gleixner --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 267ce780abe8..e698e80d8428 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1358,9 +1358,9 @@ static int console_trylock_for_printk(unsigned int cpu) } } logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); if (wake) up(&console_sem); - raw_spin_unlock(&logbuf_lock); return retval; } -- cgit v1.2.3 From 1b1d2fb4444231f25ddabc598aa2b5a9c0833fba Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:08 +0000 Subject: lockdep: remove task argument from debug_check_no_locks_held The only existing caller to debug_check_no_locks_held calls it with 'current' as the task, and the freezer needs to call debug_check_no_locks_held but doesn't already have a current task pointer, so remove the argument. It is already assuming that the current task is relevant by dumping the current stack trace as part of the warning. This was originally part of 6aa9707099c (lockdep: check that no locks held at freeze time) which was reverted in dbf520a9d7d4. Original-author: Mandeep Singh Baines Acked-by: Pavel Machek Acked-by: Tejun Heo Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- include/linux/debug_locks.h | 4 ++-- kernel/exit.c | 2 +- kernel/lockdep.c | 17 ++++++++--------- 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 21ca773f77bf..822c1354f3a6 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -51,7 +51,7 @@ struct task_struct; extern void debug_show_all_locks(void); extern void debug_show_held_locks(struct task_struct *task); extern void debug_check_no_locks_freed(const void *from, unsigned long len); -extern void debug_check_no_locks_held(struct task_struct *task); +extern void debug_check_no_locks_held(void); #else static inline void debug_show_all_locks(void) { @@ -67,7 +67,7 @@ debug_check_no_locks_freed(const void *from, unsigned long len) } static inline void -debug_check_no_locks_held(struct task_struct *task) +debug_check_no_locks_held(void) { } #endif diff --git a/kernel/exit.c b/kernel/exit.c index af2eb3cbd499..e59756275000 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(tsk); + debug_check_no_locks_held(); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1f3186b37fd5..e16c45b9ee77 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4090,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(struct task_struct *curr) +static void print_held_locks_bug(void) { if (!debug_locks_off()) return; @@ -4099,22 +4099,21 @@ static void print_held_locks_bug(struct task_struct *curr) printk("\n"); printk("=====================================\n"); - printk("[ BUG: lock held at task exit time! ]\n"); + printk("[ BUG: %s/%d still has locks held! ]\n", + current->comm, task_pid_nr(current)); print_kernel_ident(); printk("-------------------------------------\n"); - printk("%s/%d is exiting with locks still held!\n", - curr->comm, task_pid_nr(curr)); - lockdep_print_held_locks(curr); - + lockdep_print_held_locks(current); printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(struct task_struct *task) +void debug_check_no_locks_held(void) { - if (unlikely(task->lockdep_depth > 0)) - print_held_locks_bug(task); + if (unlikely(current->lockdep_depth > 0)) + print_held_locks_bug(); } +EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { -- cgit v1.2.3 From 18ad0c6297df1d671ecea83b608cd9e432642a05 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:10 +0000 Subject: freezer: shorten freezer sleep time using exponential backoff All tasks can easily be frozen in under 10 ms, switch to using an initial 1 ms sleep followed by exponential backoff until 8 ms. Also convert the printed time to ms instead of centiseconds. Acked-by: Pavel Machek Acked-by: Tejun Heo Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 98088e0e71e8..fc0df8486449 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,9 +30,10 @@ static int try_to_freeze_tasks(bool user_only) unsigned int todo; bool wq_busy = false; struct timeval start, end; - u64 elapsed_csecs64; - unsigned int elapsed_csecs; + u64 elapsed_msecs64; + unsigned int elapsed_msecs; bool wakeup = false; + int sleep_usecs = USEC_PER_MSEC; do_gettimeofday(&start); @@ -68,22 +69,25 @@ static int try_to_freeze_tasks(bool user_only) /* * We need to retry, but first give the freezing tasks some - * time to enter the refrigerator. + * time to enter the refrigerator. Start with an initial + * 1 ms sleep followed by exponential backoff until 8 ms. */ - msleep(10); + usleep_range(sleep_usecs / 2, sleep_usecs); + if (sleep_usecs < 8 * USEC_PER_MSEC) + sleep_usecs *= 2; } do_gettimeofday(&end); - elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_csecs64, NSEC_PER_SEC / 100); - elapsed_csecs = elapsed_csecs64; + elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); + do_div(elapsed_msecs64, NSEC_PER_MSEC); + elapsed_msecs = elapsed_msecs64; if (todo) { printk("\n"); - printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " + printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", wakeup ? "aborted" : "failed", - elapsed_csecs / 100, elapsed_csecs % 100, + elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); if (!wakeup) { @@ -96,8 +100,8 @@ static int try_to_freeze_tasks(bool user_only) read_unlock(&tasklist_lock); } } else { - printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, - elapsed_csecs % 100); + printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, + elapsed_msecs % 1000); } return todo ? -EBUSY : 0; -- cgit v1.2.3 From 613f5d13b569859171f0896fbc73ee0bfa811fda Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:11 +0000 Subject: freezer: skip waking up tasks with PF_FREEZER_SKIP set Android goes through suspend/resume very often (every few seconds when on a busy wifi network with the screen off), and a significant portion of the energy used to go in and out of suspend is spent in the freezer. If a task has called freezer_do_not_count(), don't bother waking it up. If it happens to wake up later it will call freezer_count() and immediately enter the refrigerator. Combined with patches to convert freezable helpers to use freezer_do_not_count() and convert common sites where idle userspace tasks are blocked to use the freezable helpers, this reduces the time and energy required to suspend and resume. Acked-by: Tejun Heo Acked-by: Pavel Machek Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index c38893b0efba..8b2afc1c9df0 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -110,6 +110,18 @@ bool freeze_task(struct task_struct *p) { unsigned long flags; + /* + * This check can race with freezer_do_not_count, but worst case that + * will result in an extra wakeup being sent to the task. It does not + * race with freezer_count(), the barriers in freezer_count() and + * freezer_should_skip() ensure that either freezer_count() sees + * freezing == true in try_to_freeze() and freezes, or + * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task + * normally. + */ + if (freezer_should_skip(p)) + return false; + spin_lock_irqsave(&freezer_lock, flags); if (!freezing(p) || frozen(p)) { spin_unlock_irqrestore(&freezer_lock, flags); -- cgit v1.2.3 From 56467c7697f5aef6974501fbe2c3e63674583549 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:18 +0000 Subject: futex: use freezable blocking call Avoid waking up every thread sleeping in a futex_wait call during suspend and resume by calling a freezable blocking call. Previous patches modified the freezer to avoid sending wakeups to threads that are blocked in freezable blocking calls. This call was selected to be converted to a freezable call because it doesn't hold any locks or release any resources when interrupted that might be needed by another freezing task or a kernel driver during suspend, and is a common site where idle userspace tasks are blocked. Acked-by: Tejun Heo Acked-by: Thomas Gleixner Acked-by: Darren Hart Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b26dcfc02c94..d710fae8abbe 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -61,6 +61,7 @@ #include #include #include +#include #include @@ -1807,7 +1808,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) - schedule(); + freezable_schedule(); } __set_current_state(TASK_RUNNING); } -- cgit v1.2.3 From b0f8c44f30e58c3aaaaaf864d5c3d3cc2e8a4c2d Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:19 +0000 Subject: nanosleep: use freezable blocking call Avoid waking up every thread sleeping in a nanosleep call during suspend and resume by calling a freezable blocking call. Previous patches modified the freezer to avoid sending wakeups to threads that are blocked in freezable blocking calls. This call was selected to be converted to a freezable call because it doesn't hold any locks or release any resources when interrupted that might be needed by another freezing task or a kernel driver during suspend, and is a common site where idle userspace tasks are blocked. Acked-by: Tejun Heo Acked-by: Thomas Gleixner Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/hrtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index fd4b13b131f8..3ee4d06c6fc2 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -47,6 +47,7 @@ #include #include #include +#include #include @@ -1545,7 +1546,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod t->task = NULL; if (likely(t->task)) - schedule(); + freezable_schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; -- cgit v1.2.3 From a2d5f1f5d941593e61071dc78e9de228eda5475f Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:20 +0000 Subject: sigtimedwait: use freezable blocking call Avoid waking up every thread sleeping in a sigtimedwait call during suspend and resume by calling a freezable blocking call. Previous patches modified the freezer to avoid sending wakeups to threads that are blocked in freezable blocking calls. This call was selected to be converted to a freezable call because it doesn't hold any locks or release any resources when interrupted that might be needed by another freezing task or a kernel driver during suspend, and is a common site where idle userspace tasks are blocked. Acked-by: Tejun Heo Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 113411bfe8b1..50e41075ac77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2848,7 +2848,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); - timeout = schedule_timeout_interruptible(timeout); + timeout = freezable_schedule_timeout_interruptible(timeout); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); -- cgit v1.2.3 From cee22a15052faa817e3ec8985a28154d3fabc7aa Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 8 Apr 2013 16:45:40 +0530 Subject: workqueues: Introduce new flag WQ_POWER_EFFICIENT for power oriented workqueues Workqueues can be performance or power-oriented. Currently, most workqueues are bound to the CPU they were created on. This gives good performance (due to cache effects) at the cost of potentially waking up otherwise idle cores (Idle from scheduler's perspective. Which may or may not be physically idle) just to process some work. To save power, we can allow the work to be rescheduled on a core that is already awake. Workqueues created with the WQ_UNBOUND flag will allow some power savings. However, we don't change the default behaviour of the system. To enable power-saving behaviour, a new config option CONFIG_WQ_POWER_EFFICIENT needs to be turned on. This option can also be overridden by the workqueue.power_efficient boot parameter. tj: Updated config description and comments. Renamed CONFIG_WQ_POWER_EFFICIENT to CONFIG_WQ_POWER_EFFICIENT_DEFAULT. Signed-off-by: Viresh Kumar Reviewed-by: Amit Kucheria Signed-off-by: Tejun Heo --- Documentation/kernel-parameters.txt | 15 +++++++++++++++ include/linux/workqueue.h | 27 +++++++++++++++++++++++++++ kernel/power/Kconfig | 20 ++++++++++++++++++++ kernel/workqueue.c | 13 +++++++++++++ 4 files changed, 75 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index c3bfacb92910..37dfd720de79 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3320,6 +3320,21 @@ bytes respectively. Such letter suffixes can also be entirely omitted. that this also can be controlled per-workqueue for workqueues visible under /sys/bus/workqueue/. + workqueue.power_efficient + Per-cpu workqueues are generally preferred because + they show better performance thanks to cache + locality; unfortunately, per-cpu workqueues tend to + be more power hungry than unbound workqueues. + + Enabling this makes the per-cpu workqueues which + were observed to contribute significantly to power + consumption unbound, leading to measurably lower + power usage at the cost of small performance + overhead. + + The default value of this parameter is determined by + the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT. + x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 623488fdc1f5..fc0136b604f2 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -303,6 +303,33 @@ enum { WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */ + /* + * Per-cpu workqueues are generally preferred because they tend to + * show better performance thanks to cache locality. Per-cpu + * workqueues exclude the scheduler from choosing the CPU to + * execute the worker threads, which has an unfortunate side effect + * of increasing power consumption. + * + * The scheduler considers a CPU idle if it doesn't have any task + * to execute and tries to keep idle cores idle to conserve power; + * however, for example, a per-cpu work item scheduled from an + * interrupt handler on an idle CPU will force the scheduler to + * excute the work item on that CPU breaking the idleness, which in + * turn may lead to more scheduling choices which are sub-optimal + * in terms of power consumption. + * + * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default + * but become unbound if workqueue.power_efficient kernel param is + * specified. Per-cpu workqueues which are identified to + * contribute significantly to power-consumption are identified and + * marked with this flag and enabling the power_efficient mode + * leads to noticeable power saving at the cost of small + * performance disadvantage. + * + * http://thread.gmane.org/gmane.linux.kernel/1480396 + */ + WQ_POWER_EFFICIENT = 1 << 7, + __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9ea180b..46455961a88f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -263,6 +263,26 @@ config PM_GENERIC_DOMAINS bool depends on PM +config WQ_POWER_EFFICIENT_DEFAULT + bool "Enable workqueue power-efficient mode by default" + depends on PM + default n + help + Per-cpu workqueues are generally preferred because they show + better performance thanks to cache locality; unfortunately, + per-cpu workqueues tend to be more power hungry than unbound + workqueues. + + Enabling workqueue.power_efficient kernel parameter makes the + per-cpu workqueues which were observed to contribute + significantly to power consumption unbound, leading to measurably + lower power usage at the cost of small performance overhead. + + This config option determines whether workqueue.power_efficient + is enabled by default. + + If in doubt, say N. + config PM_GENERIC_DOMAINS_SLEEP def_bool y depends on PM_SLEEP && PM_GENERIC_DOMAINS diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4aa9f5bc6b2d..8068d97ce141 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask; static bool wq_disable_numa; module_param_named(disable_numa, wq_disable_numa, bool, 0444); +/* see the comment above the definition of WQ_POWER_EFFICIENT */ +#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT +static bool wq_power_efficient = true; +#else +static bool wq_power_efficient; +#endif + +module_param_named(power_efficient, wq_power_efficient, bool, 0444); + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -4085,6 +4094,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct workqueue_struct *wq; struct pool_workqueue *pwq; + /* see the comment above the definition of WQ_POWER_EFFICIENT */ + if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) + flags |= WQ_UNBOUND; + /* allocate wq and format name */ if (flags & WQ_UNBOUND) tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); -- cgit v1.2.3 From 0668106ca3865ba945e155097fb042bf66d364d3 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 24 Apr 2013 17:12:54 +0530 Subject: workqueue: Add system wide power_efficient workqueues This patch adds system wide workqueues aligned towards power saving. This is done by allocating them with WQ_UNBOUND flag if 'wq_power_efficient' is set to 'true'. tj: updated comments a bit. Signed-off-by: Viresh Kumar Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 8 ++++++++ kernel/workqueue.c | 13 ++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fc0136b604f2..a9f4119c7e2e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -360,11 +360,19 @@ enum { * * system_freezable_wq is equivalent to system_wq except that it's * freezable. + * + * *_power_efficient_wq are inclined towards saving power and converted + * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, + * they are same as their non-power-efficient counterparts - e.g. + * system_power_efficient_wq is identical to system_wq if + * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; +extern struct workqueue_struct *system_power_efficient_wq; +extern struct workqueue_struct *system_freezable_power_efficient_wq; static inline struct workqueue_struct * __deprecated __system_nrt_wq(void) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8068d97ce141..16ca2d3dd29f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -314,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); +struct workqueue_struct *system_power_efficient_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_power_efficient_wq); +struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void copy_workqueue_attrs(struct workqueue_attrs *to, @@ -4987,8 +4991,15 @@ static int __init init_workqueues(void) WQ_UNBOUND_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); + system_power_efficient_wq = alloc_workqueue("events_power_efficient", + WQ_POWER_EFFICIENT, 0); + system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient", + WQ_FREEZABLE | WQ_POWER_EFFICIENT, + 0); BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || - !system_unbound_wq || !system_freezable_wq); + !system_unbound_wq || !system_freezable_wq || + !system_power_efficient_wq || + !system_freezable_power_efficient_wq); return 0; } early_initcall(init_workqueues); -- cgit v1.2.3 From fa3ca07e96185aa1496b405472399a2a2a336a17 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 11:36:56 -0700 Subject: cgroup: refactor hierarchy_id handling We're planning to converting hierarchy_ida to an idr and use it to look up hierarchy from its id. As we want the mapping to happen atomically with cgroupfs_root registration, this patch refactors hierarchy_id init / exit so that ida operations happen inside cgroup_[root_]mutex. * s/init_root_id()/cgroup_init_root_id()/ and make it return 0 or -errno like a normal function. * Move hierarchy_id initialization from cgroup_root_from_opts() into cgroup_mount() block where the root is confirmed to be used and being registered while holding both mutexes. * Split cgroup_drop_id() into cgroup_exit_root_id() and cgroup_free_root(), so that ID release can happen before dropping the mutexes in cgroup_kill_sb(). The latter expects hierarchy_id to be exited before being invoked. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 56 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a9926275f80..dbc84f7d23b8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1426,13 +1426,13 @@ static void init_cgroup_root(struct cgroupfs_root *root) list_add_tail(&cgrp->allcg_node, &root->allcg_list); } -static bool init_root_id(struct cgroupfs_root *root) +static int cgroup_init_root_id(struct cgroupfs_root *root) { - int ret = 0; + int ret; do { if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) - return false; + return -ENOMEM; spin_lock(&hierarchy_id_lock); /* Try to allocate the next unused ID */ ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, @@ -1448,7 +1448,18 @@ static bool init_root_id(struct cgroupfs_root *root) } spin_unlock(&hierarchy_id_lock); } while (ret); - return true; + return 0; +} + +static void cgroup_exit_root_id(struct cgroupfs_root *root) +{ + if (root->hierarchy_id) { + spin_lock(&hierarchy_id_lock); + ida_remove(&hierarchy_ida, root->hierarchy_id); + spin_unlock(&hierarchy_id_lock); + + root->hierarchy_id = 0; + } } static int cgroup_test_super(struct super_block *sb, void *data) @@ -1482,10 +1493,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) if (!root) return ERR_PTR(-ENOMEM); - if (!init_root_id(root)) { - kfree(root); - return ERR_PTR(-ENOMEM); - } init_cgroup_root(root); root->subsys_mask = opts->subsys_mask; @@ -1500,17 +1507,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static void cgroup_drop_root(struct cgroupfs_root *root) +static void cgroup_free_root(struct cgroupfs_root *root) { - if (!root) - return; + if (root) { + /* hierarhcy ID shoulid already have been released */ + WARN_ON_ONCE(root->hierarchy_id); - BUG_ON(!root->hierarchy_id); - spin_lock(&hierarchy_id_lock); - ida_remove(&hierarchy_ida, root->hierarchy_id); - spin_unlock(&hierarchy_id_lock); - ida_destroy(&root->cgroup_ida); - kfree(root); + ida_destroy(&root->cgroup_ida); + kfree(root); + } } static int cgroup_set_super(struct super_block *sb, void *data) @@ -1597,7 +1602,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); if (IS_ERR(sb)) { ret = PTR_ERR(sb); - cgroup_drop_root(opts.new_root); + cgroup_free_root(opts.new_root); goto drop_modules; } @@ -1641,6 +1646,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; + ret = cgroup_init_root_id(root); + if (ret) + goto unlock_drop; + ret = rebind_subsystems(root, root->subsys_mask); if (ret == -EBUSY) { free_cg_links(&tmp_cg_links); @@ -1684,7 +1693,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * We re-used an existing hierarchy - the new root (if * any) is not needed */ - cgroup_drop_root(opts.new_root); + cgroup_free_root(opts.new_root); if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && root->flags != opts.flags) { @@ -1702,6 +1711,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, return dget(sb->s_root); unlock_drop: + cgroup_exit_root_id(root); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -1754,13 +1764,15 @@ static void cgroup_kill_sb(struct super_block *sb) { root_count--; } + cgroup_exit_root_id(root); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); simple_xattrs_free(&cgrp->xattrs); kill_litter_super(sb); - cgroup_drop_root(root); + cgroup_free_root(root); } static struct file_system_type cgroup_fs_type = { @@ -4642,7 +4654,9 @@ int __init cgroup_init(void) /* Add init_css_set to the hash table */ key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(!init_root_id(&rootnode)); + + /* allocate id for the dummy hierarchy */ + BUG_ON(cgroup_init_root_id(&rootnode)); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); if (!cgroup_kobj) { -- cgit v1.2.3 From 54e7b4eb15fc4354d5ada5469e3db4a220ddb3ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 11:36:57 -0700 Subject: cgroup: drop hierarchy_id_lock Now that hierarchy_id alloc / free are protected by the cgroup mutexes, there's no need for this separate lock. Drop it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dbc84f7d23b8..3ef677d314bc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -189,9 +189,13 @@ struct cgroup_event { static LIST_HEAD(roots); static int root_count; +/* + * Hierarchy ID allocation and mapping. It follows the same exclusion + * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for + * writes, either for reads. + */ static DEFINE_IDA(hierarchy_ida); static int next_hierarchy_id; -static DEFINE_SPINLOCK(hierarchy_id_lock); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -1430,10 +1434,12 @@ static int cgroup_init_root_id(struct cgroupfs_root *root) { int ret; + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_root_mutex); + do { if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) return -ENOMEM; - spin_lock(&hierarchy_id_lock); /* Try to allocate the next unused ID */ ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, &root->hierarchy_id); @@ -1446,18 +1452,17 @@ static int cgroup_init_root_id(struct cgroupfs_root *root) /* Can only get here if the 31-bit IDR is full ... */ BUG_ON(ret); } - spin_unlock(&hierarchy_id_lock); } while (ret); return 0; } static void cgroup_exit_root_id(struct cgroupfs_root *root) { + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_root_mutex); + if (root->hierarchy_id) { - spin_lock(&hierarchy_id_lock); ida_remove(&hierarchy_ida, root->hierarchy_id); - spin_unlock(&hierarchy_id_lock); - root->hierarchy_id = 0; } } @@ -4656,8 +4661,14 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, key); /* allocate id for the dummy hierarchy */ + mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); + BUG_ON(cgroup_init_root_id(&rootnode)); + mutex_unlock(&cgroup_root_mutex); + mutex_unlock(&cgroup_mutex); + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); if (!cgroup_kobj) { err = -ENOMEM; -- cgit v1.2.3 From 1a574231669f8c3065c83974e9557fcbbd94b8a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 11:36:58 -0700 Subject: cgroup: make hierarchy_id use cyclic idr We want to be able to lookup a hierarchy from its id and cyclic allocation is a whole lot simpler with idr. Convert to idr and use idr_alloc_cyclc(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3ef677d314bc..dcb417c6c242 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -194,8 +194,7 @@ static int root_count; * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for * writes, either for reads. */ -static DEFINE_IDA(hierarchy_ida); -static int next_hierarchy_id; +static DEFINE_IDR(cgroup_hierarchy_idr); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -1432,27 +1431,16 @@ static void init_cgroup_root(struct cgroupfs_root *root) static int cgroup_init_root_id(struct cgroupfs_root *root) { - int ret; + int id; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_root_mutex); - do { - if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) - return -ENOMEM; - /* Try to allocate the next unused ID */ - ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, - &root->hierarchy_id); - if (ret == -ENOSPC) - /* Try again starting from 0 */ - ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); - if (!ret) { - next_hierarchy_id = root->hierarchy_id + 1; - } else if (ret != -EAGAIN) { - /* Can only get here if the 31-bit IDR is full ... */ - BUG_ON(ret); - } - } while (ret); + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 2, 0, GFP_KERNEL); + if (id < 0) + return id; + + root->hierarchy_id = id; return 0; } @@ -1462,7 +1450,7 @@ static void cgroup_exit_root_id(struct cgroupfs_root *root) lockdep_assert_held(&cgroup_root_mutex); if (root->hierarchy_id) { - ida_remove(&hierarchy_ida, root->hierarchy_id); + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); root->hierarchy_id = 0; } } -- cgit v1.2.3 From 857a2beb09ab83e9a8185821ae16db7dfbe8b837 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 20:50:08 -0700 Subject: cgroup: implement task_cgroup_path_from_hierarchy() kdbus folks want a sane way to determine the cgroup path that a given task belongs to on a given hierarchy, which is a reasonble thing to expect from cgroup core. Implement task_cgroup_path_from_hierarchy(). v2: Dropped unnecessary NULL check on the return value of task_cgroup_from_root() as suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Acked-by: Li Zefan Cc: Kay Sievers Cc: Lennart Poettering Cc: Daniel Mack --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5047355b9a0f..383c630f36f9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -542,6 +542,8 @@ int cgroup_is_removed(const struct cgroup *cgrp); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); +int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, + char *buf, size_t buflen); int cgroup_task_count(const struct cgroup *cgrp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dcb417c6c242..6b2b1d945df2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1827,6 +1827,38 @@ out: } EXPORT_SYMBOL_GPL(cgroup_path); +/** + * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy + * @task: target task + * @hierarchy_id: the hierarchy to look up @task's cgroup from + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and + * copy its path into @buf. This function grabs cgroup_mutex and shouldn't + * be used inside locks used by cgroup controller callbacks. + */ +int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, + char *buf, size_t buflen) +{ + struct cgroupfs_root *root; + struct cgroup *cgrp = NULL; + int ret = -ENOENT; + + mutex_lock(&cgroup_mutex); + + root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); + if (root) { + cgrp = task_cgroup_from_root(task, root); + ret = cgroup_path(cgrp, buf, buflen); + } + + mutex_unlock(&cgroup_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); + /* * Control Group taskset */ -- cgit v1.2.3 From 5d33b883aed81c6fbcd09c6f7c3619eee850a7e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:43 +0000 Subject: clocksource: Always verify highres capability If a clocksource has a (wrong) high rating, but can't be used as a timebase for oneshot tick mode, it is unconditionally selected even when the system is already in oneshot tick mode. This causes full system failure. Verify the clocksource selection against the oneshot mode. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.635040849@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c9583382141a..dda5c7130d93 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -553,6 +553,26 @@ static u64 clocksource_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET +static struct clocksource *clocksource_find_best(bool oneshot) +{ + struct clocksource *cs; + + if (!finished_booting || list_empty(&clocksource_list)) + return NULL; + + /* + * We pick the clocksource with the highest rating. If oneshot + * mode is active, we pick the highres valid clocksource with + * the best rating. + */ + list_for_each_entry(cs, &clocksource_list, list) { + if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + continue; + return cs; + } + return NULL; +} + /** * clocksource_select - Select the best clocksource available * @@ -563,12 +583,14 @@ static u64 clocksource_max_deferment(struct clocksource *cs) */ static void clocksource_select(void) { + bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; - if (!finished_booting || list_empty(&clocksource_list)) + /* Find the best suitable clocksource */ + best = clocksource_find_best(oneshot); + if (!best) return; - /* First clocksource on the list has the best rating. */ - best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. */ list_for_each_entry(cs, &clocksource_list, list) { if (strcmp(cs->name, override_name) != 0) @@ -578,8 +600,7 @@ static void clocksource_select(void) * capable clocksource if the tick code is in oneshot * mode (highres or nohz) */ - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - tick_oneshot_mode_active()) { + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ printk(KERN_WARNING "Override clocksource %s is not " "HRT compatible. Cannot switch while in " -- cgit v1.2.3 From ba919d1caa2e624eb8c6cae1f2ce0a253e697d45 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:44 +0000 Subject: clocksource: Let timekeeping_notify return success/error timekeeping_notify() can fail due cs->enable() failure. Though the caller does not notice and happily keeps the wrong clocksource as the current one. Let the caller know about failure, so the current clocksource will be shown correctly in sysfs. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.696321912@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 2 +- kernel/time/clocksource.c | 6 +++--- kernel/time/timekeeping.c | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 7279b94c01da..aa6ba44e75d5 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -321,7 +321,7 @@ static inline void __clocksource_updatefreq_khz(struct clocksource *cs, u32 khz) } -extern void timekeeping_notify(struct clocksource *clock); +extern int timekeeping_notify(struct clocksource *clock); extern cycle_t clocksource_mmio_readl_up(struct clocksource *); extern cycle_t clocksource_mmio_readl_down(struct clocksource *); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index dda5c7130d93..1923a340bd91 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -611,10 +611,10 @@ static void clocksource_select(void) best = cs; break; } - if (curr_clocksource != best) { - printk(KERN_INFO "Switching to clocksource %s\n", best->name); + + if (curr_clocksource != best && !timekeeping_notify(best)) { + pr_info("Switched to clocksource %s\n", best->name); curr_clocksource = best; - timekeeping_notify(curr_clocksource); } } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 98cd470bbe49..da6e10c7a378 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -648,14 +648,15 @@ static int change_clocksource(void *data) * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ -void timekeeping_notify(struct clocksource *clock) +int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &timekeeper; if (tk->clock == clock) - return; + return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); + return tk->clock == clock ? 0 : -1; } /** -- cgit v1.2.3 From 09ac369c825d9d593404306d59062d854b321e9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:44 +0000 Subject: clocksource: Add module refcount Add a module refcount, so the current clocksource cannot be removed unconditionally. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.762417789@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 3 +++ kernel/time/timekeeping.c | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index aa6ba44e75d5..32a895b114d8 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -21,6 +21,7 @@ /* clocksource cycle base type */ typedef u64 cycle_t; struct clocksource; +struct module; #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA #include @@ -162,6 +163,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, * @suspend: suspend function for the clocksource, if necessary * @resume: resume function for the clocksource, if necessary * @cycle_last: most recent cycle counter value seen by ::read() + * @owner: module reference, must be set by clocksource in modules */ struct clocksource { /* @@ -195,6 +197,7 @@ struct clocksource { cycle_t cs_last; cycle_t wd_last; #endif + struct module *owner; } ____cacheline_aligned; /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index da6e10c7a378..933efa4071c3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -627,11 +627,20 @@ static int change_clocksource(void *data) write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); - if (!new->enable || new->enable(new) == 0) { - old = tk->clock; - tk_setup_internals(tk, new); - if (old->disable) - old->disable(old); + /* + * If the cs is in module, get a module reference. Succeeds + * for built-in code (owner == NULL) as well. + */ + if (try_module_get(new->owner)) { + if (!new->enable || new->enable(new) == 0) { + old = tk->clock; + tk_setup_internals(tk, new); + if (old->disable) + old->disable(old); + module_put(old->owner); + } else { + module_put(new->owner); + } } timekeeping_update(tk, true, true); -- cgit v1.2.3 From f5a2e34375a5e2b711aea488ac3ae50eeba6d57c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:45 +0000 Subject: clocksource: Allow clocksource select to skip current clocksource Preparatory patch for clocksource unbind support. Split out code from clocksource_select and modify it, so it skips the current clocksource on request and tries to find a fallback clocksource. Convert all existing users. No functional change. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.834965397@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1923a340bd91..9782997cb6cf 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -553,7 +553,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -static struct clocksource *clocksource_find_best(bool oneshot) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) { struct clocksource *cs; @@ -566,6 +566,8 @@ static struct clocksource *clocksource_find_best(bool oneshot) * the best rating. */ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) continue; return cs; @@ -573,26 +575,20 @@ static struct clocksource *clocksource_find_best(bool oneshot) return NULL; } -/** - * clocksource_select - Select the best clocksource available - * - * Private function. Must hold clocksource_mutex when called. - * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. - */ -static void clocksource_select(void) +static void __clocksource_select(bool skipcur) { bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; /* Find the best suitable clocksource */ - best = clocksource_find_best(oneshot); + best = clocksource_find_best(oneshot, skipcur); if (!best) return; /* Check for the override clocksource. */ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (strcmp(cs->name, override_name) != 0) continue; /* @@ -618,6 +614,19 @@ static void clocksource_select(void) } } +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ + return __clocksource_select(false); +} + #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } -- cgit v1.2.3 From 29b5407819f59731c9423238fae03b756822708c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:45 +0000 Subject: clocksource: Split out user string input Split out the user string input for clocksource override. Preparatory patch for unbind. [ jstultz: Fix an off by one error ] Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.895851338@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9782997cb6cf..d7f1a45c2fa5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -174,7 +174,8 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -static char override_name[32]; +#define CS_NAME_LEN 32 +static char override_name[CS_NAME_LEN]; static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG @@ -838,6 +839,23 @@ sysfs_show_current_clocksources(struct device *dev, return count; } +static size_t clocksource_get_uname(const char *buf, char *dst, size_t cnt) +{ + size_t ret = cnt; + + /* strings from sysfs write are not 0 terminated! */ + if (!cnt || cnt >= CS_NAME_LEN) + return -EINVAL; + + /* strip of \n: */ + if (buf[cnt-1] == '\n') + cnt--; + if (cnt > 0) + memcpy(dst, buf, cnt); + dst[cnt] = 0; + return ret; +} + /** * sysfs_override_clocksource - interface for manually overriding clocksource * @dev: unused @@ -852,22 +870,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - size_t ret = count; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(override_name)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; + size_t ret; mutex_lock(&clocksource_mutex); - if (count > 0) - memcpy(override_name, buf, count); - override_name[count] = 0; - clocksource_select(); + ret = clocksource_get_uname(buf, override_name, count); + if (ret >= 0) + clocksource_select(); mutex_unlock(&clocksource_mutex); -- cgit v1.2.3 From 7eaeb34305dee26634f7c98ae62646da5cebe91d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:46 +0000 Subject: clocksource: Provide unbind interface in sysfs With the module refcount held for the current clocksource there is no way to unload the module. Provide a sysfs interface which allows to unbind the clocksource. One could argue that the clocksource override could be (ab)used to do so, but the clocksource override cannot be used from the kernel itself, while an unbind function can be used to programmatically check whether a clocksource can be shutdown or not. The unbind functionality uses the new skip current feature of clocksource_select and verifies that a fallback clocksource has been installed. If the clocksource which should be unbound is the current clocksource and no fallback can be found, unbind returns -EBUSY. This does not support the unbinding of a clocksource which is used as the watchdog clocksource. No point in fostering crappy hardware. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.964218245@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d7f1a45c2fa5..791d1aeb17ac 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -440,6 +440,11 @@ static int clocksource_watchdog_kthread(void *data) return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) +{ + return cs == watchdog; +} + #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -451,6 +456,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int clocksource_watchdog_kthread(void *data) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -628,6 +634,11 @@ static void clocksource_select(void) return __clocksource_select(false); } +static void clocksource_select_fallback(void) +{ + return __clocksource_select(true); +} + #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } @@ -803,6 +814,29 @@ void clocksource_change_rating(struct clocksource *cs, int rating) } EXPORT_SYMBOL(clocksource_change_rating); +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ + /* + * I really can't convince myself to support this on hardware + * designed by lobotomized monkeys. + */ + if (clocksource_is_watchdog(cs)) + return -EBUSY; + + if (cs == curr_clocksource) { + /* Select and try to install a replacement clock source */ + clocksource_select_fallback(); + if (curr_clocksource == cs) + return -EBUSY; + } + clocksource_dequeue_watchdog(cs); + list_del_init(&cs->list); + return 0; +} + /** * clocksource_unregister - remove a registered clocksource * @cs: clocksource to be unregistered @@ -883,6 +917,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev, return ret; } +/** + * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * @dev: unused + * @attr: unused + * @buf: unused + * @count: length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. + */ +static ssize_t sysfs_unbind_clocksource(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct clocksource *cs; + char name[CS_NAME_LEN]; + size_t ret; + + ret = clocksource_get_uname(buf, name, count); + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clocksource_mutex); + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, name)) + continue; + ret = clocksource_unbind(cs); + break; + } + mutex_unlock(&clocksource_mutex); + + return ret ? ret : count; +} + /** * sysfs_show_available_clocksources - sysfs interface for listing clocksource * @dev: unused @@ -925,6 +993,8 @@ sysfs_show_available_clocksources(struct device *dev, static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); +static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); + static DEVICE_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); @@ -948,6 +1018,9 @@ static int __init init_clocksource_sysfs(void) error = device_create_file( &device_clocksource, &dev_attr_current_clocksource); + if (!error) + error = device_create_file(&device_clocksource, + &dev_attr_unbind_clocksource); if (!error) error = device_create_file( &device_clocksource, -- cgit v1.2.3 From a89c7edbe7d7aa80f507915f3dd801211b116b79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:46 +0000 Subject: clocksource: Let clocksource_unregister() return success/error The unregister call can fail, if the clocksource is the current one and there is no replacement clocksource available. It can also fail, if the clocksource is the watchdog clocksource and I'm not going to provide support for this. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.029915527@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 2 +- kernel/time/clocksource.c | 33 ++++++++++++--------------------- 2 files changed, 13 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 32a895b114d8..2f39a4911668 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -282,7 +282,7 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift) extern int clocksource_register(struct clocksource*); -extern void clocksource_unregister(struct clocksource*); +extern int clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 791d1aeb17ac..31b90332f47b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -389,28 +389,17 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_dequeue_watchdog(struct clocksource *cs) { - struct clocksource *tmp; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); - if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - /* cs is a watched clocksource. */ - list_del_init(&cs->wd_list); - } else if (cs == watchdog) { - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - /* Current watchdog is removed. Find an alternative. */ - watchdog = NULL; - list_for_each_entry(tmp, &clocksource_list, list) { - if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) - continue; - if (!watchdog || tmp->rating > watchdog->rating) - watchdog = tmp; + if (cs != watchdog) { + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); } } - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Check if the watchdog timer needs to be stopped. */ - clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } @@ -841,13 +830,15 @@ static int clocksource_unbind(struct clocksource *cs) * clocksource_unregister - remove a registered clocksource * @cs: clocksource to be unregistered */ -void clocksource_unregister(struct clocksource *cs) +int clocksource_unregister(struct clocksource *cs) { + int ret = 0; + mutex_lock(&clocksource_mutex); - clocksource_dequeue_watchdog(cs); - list_del(&cs->list); - clocksource_select(); + if (!list_empty(&cs->list)) + ret = clocksource_unbind(cs); mutex_unlock(&clocksource_mutex); + return ret; } EXPORT_SYMBOL(clocksource_unregister); -- cgit v1.2.3 From 7172a286ced0c1f4f239a0fa09db54ed37d3ead2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:47 +0000 Subject: clockevents: Get rid of the notifier chain 7+ years and still a single user. Kill it. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.098520211@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 1 - kernel/time/clockevents.c | 35 +++-------------------------------- kernel/time/tick-broadcast.c | 5 ++--- kernel/time/tick-common.c | 30 +++++------------------------- kernel/time/tick-internal.h | 7 ++++--- 5 files changed, 14 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 963d71431388..2f498f66c1bb 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -150,7 +150,6 @@ extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new); extern void clockevents_set_mode(struct clock_event_device *dev, enum clock_event_mode mode); -extern int clockevents_register_notifier(struct notifier_block *nb); extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c6d6400ee137..dd70b4842c62 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include "tick-internal.h" @@ -23,10 +22,6 @@ /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); - -/* Notification for clock events */ -static RAW_NOTIFIER_HEAD(clockevents_chain); - /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); @@ -232,30 +227,6 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, return (rc && force) ? clockevents_program_min_delta(dev) : rc; } -/** - * clockevents_register_notifier - register a clock events change listener - */ -int clockevents_register_notifier(struct notifier_block *nb) -{ - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&clockevents_lock, flags); - ret = raw_notifier_chain_register(&clockevents_chain, nb); - raw_spin_unlock_irqrestore(&clockevents_lock, flags); - - return ret; -} - -/* - * Notify about a clock event change. Called with clockevents_lock - * held. - */ -static void clockevents_do_notify(unsigned long reason, void *dev) -{ - raw_notifier_call_chain(&clockevents_chain, reason, dev); -} - /* * Called after a notify add to make devices available which were * released from the notifier call. @@ -269,7 +240,7 @@ static void clockevents_notify_released(void) struct clock_event_device, list); list_del(&dev->list); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); } } @@ -290,7 +261,7 @@ void clockevents_register_device(struct clock_event_device *dev) raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); @@ -433,7 +404,7 @@ void clockevents_notify(unsigned long reason, void *arg) int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); - clockevents_do_notify(reason, arg); + tick_notify(reason, arg); switch (reason) { case CLOCK_EVT_NOTIFY_CPU_DEAD: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 24938d577669..3500caaa0bfd 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -64,7 +64,7 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ -int tick_check_broadcast_device(struct clock_event_device *dev) +void tick_install_broadcast_device(struct clock_event_device *dev) { struct clock_event_device *cur = tick_broadcast_device.evtdev; @@ -72,7 +72,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; + return; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); if (cur) @@ -90,7 +90,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev) */ if (dev->features & CLOCK_EVT_FEAT_ONESHOT) tick_clock_notify(); - return 1; } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5d3fb100bc06..dbf4e18d5101 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -208,11 +208,11 @@ static void tick_setup_device(struct tick_device *td, /* * Check, if the new registered device should be used. */ -static int tick_check_new_device(struct clock_event_device *newdev) +void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; - int cpu, ret = NOTIFY_OK; + int cpu; unsigned long flags; raw_spin_lock_irqsave(&tick_device_lock, flags); @@ -275,18 +275,14 @@ static int tick_check_new_device(struct clock_event_device *newdev) tick_oneshot_notify(); raw_spin_unlock_irqrestore(&tick_device_lock, flags); - return NOTIFY_STOP; + return; out_bc: /* * Can the new device be used as a broadcast device ? */ - if (tick_check_broadcast_device(newdev)) - ret = NOTIFY_STOP; - + tick_install_broadcast_device(newdev); raw_spin_unlock_irqrestore(&tick_device_lock, flags); - - return ret; } /* @@ -360,17 +356,10 @@ static void tick_resume(void) raw_spin_unlock_irqrestore(&tick_device_lock, flags); } -/* - * Notification about clock event devices - */ -static int tick_notify(struct notifier_block *nb, unsigned long reason, - void *dev) +void tick_notify(unsigned long reason, void *dev) { switch (reason) { - case CLOCK_EVT_NOTIFY_ADD: - return tick_check_new_device(dev); - case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_OFF: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: @@ -404,21 +393,12 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, default: break; } - - return NOTIFY_OK; } -static struct notifier_block tick_notifier = { - .notifier_call = tick_notify, -}; - /** * tick_init - initialize the tick control - * - * Register the notifier with the clockevents framework */ void __init tick_init(void) { - clockevents_register_notifier(&tick_notifier); tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f0299eae4602..60742fe6f63d 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -18,6 +18,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_notify(unsigned long reason, void *dev); +extern void tick_check_new_device(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); @@ -90,7 +92,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; } */ #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); @@ -102,9 +104,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); #else /* !BROADCAST */ -static inline int tick_check_broadcast_device(struct clock_event_device *dev) +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { - return 0; } static inline int tick_is_broadcast_device(struct clock_event_device *dev) -- cgit v1.2.3 From 7126cac426137633e470167524e7bcb590fd49b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:48 +0000 Subject: clockevents: Simplify locking Now that the notifier chain is gone there are no other users and it's pointless to nest tick_device_lock inside of clockevents_lock because there is no other use case. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.162888472@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index dbf4e18d5101..170a4bdfa99e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -static DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -206,16 +205,14 @@ static void tick_setup_device(struct tick_device *td, } /* - * Check, if the new registered device should be used. + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled. */ void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; int cpu; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -273,8 +270,6 @@ void tick_check_new_device(struct clock_event_device *newdev) tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); - - raw_spin_unlock_irqrestore(&tick_device_lock, flags); return; out_bc: @@ -282,7 +277,6 @@ out_bc: * Can the new device be used as a broadcast device ? */ tick_install_broadcast_device(newdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } /* @@ -311,9 +305,7 @@ static void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* @@ -325,26 +317,20 @@ static void tick_shutdown(unsigned int *cpup) dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_shutdown(td->evtdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; int broadcast = tick_resume_broadcast(); - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); if (!broadcast) { @@ -353,9 +339,11 @@ static void tick_resume(void) else tick_resume_oneshot(); } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } +/* + * Called with clockevents_lock held and interrupts disabled + */ void tick_notify(unsigned long reason, void *dev) { switch (reason) { -- cgit v1.2.3 From 8c53daf63f56791ed47fc585206ef3049489612f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:48 +0000 Subject: clockevents: Move the tick_notify() switch case to clockevents_notify() No need to call another function and have duplicated cases. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.235746557@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 28 ++++++++++++++++++++++++- kernel/time/tick-common.c | 50 ++++----------------------------------------- kernel/time/tick-internal.h | 5 ++++- 3 files changed, 35 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index dd70b4842c62..0e3a8448e115 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -404,10 +404,36 @@ void clockevents_notify(unsigned long reason, void *arg) int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); - tick_notify(reason, arg); switch (reason) { + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + tick_broadcast_on_off(reason, arg); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(arg); + break; + + case CLOCK_EVT_NOTIFY_SUSPEND: + tick_suspend(); + tick_suspend_broadcast(); + break; + + case CLOCK_EVT_NOTIFY_RESUME: + tick_resume(); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(arg); + tick_shutdown_broadcast(arg); + tick_shutdown(arg); /* * Unregister the clock event devices which were * released from the users in the notify chain. diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 170a4bdfa99e..84c7cfca4d7d 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -284,7 +284,7 @@ out_bc: * * Called with interrupts disabled. */ -static void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(int *cpup) { if (*cpup == tick_do_timer_cpu) { int cpu = cpumask_first(cpu_online_mask); @@ -301,7 +301,7 @@ static void tick_handover_do_timer(int *cpup) * access the hardware device itself. * We just set the mode and remove it from the lists. */ -static void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; @@ -319,14 +319,14 @@ static void tick_shutdown(unsigned int *cpup) } } -static void tick_suspend(void) +void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); clockevents_shutdown(td->evtdev); } -static void tick_resume(void) +void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); int broadcast = tick_resume_broadcast(); @@ -341,48 +341,6 @@ static void tick_resume(void) } } -/* - * Called with clockevents_lock held and interrupts disabled - */ -void tick_notify(unsigned long reason, void *dev) -{ - switch (reason) { - - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, dev); - break; - - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); - break; - - case CLOCK_EVT_NOTIFY_CPU_DYING: - tick_handover_do_timer(dev); - break; - - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(dev); - tick_shutdown_broadcast(dev); - tick_shutdown(dev); - break; - - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - break; - - default: - break; - } -} - /** * tick_init - initialize the tick control */ diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 60742fe6f63d..06bfc8802dfb 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -18,8 +18,11 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); -extern void tick_notify(unsigned long reason, void *dev); extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_handover_do_timer(int *cpup); +extern void tick_shutdown(unsigned int *cpup); +extern void tick_suspend(void); +extern void tick_resume(void); extern void clockevents_shutdown(struct clock_event_device *dev); -- cgit v1.2.3 From ccf33d6880f39a35158fff66db13000ae4943fac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:49 +0000 Subject: clockevents: Add module refcount We want to be able to remove clockevent modules as well. Add a refcount so we don't remove a module with an active clock event device. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.307435149@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 3 +++ kernel/time/clockevents.c | 1 + kernel/time/tick-broadcast.c | 3 +++ kernel/time/tick-common.c | 4 ++++ 4 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 2f498f66c1bb..ae1193bcf074 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -30,6 +30,7 @@ enum clock_event_nofitiers { #include struct clock_event_device; +struct module; /* Clock event mode commands */ enum clock_event_mode { @@ -83,6 +84,7 @@ enum clock_event_mode { * @irq: IRQ number (only for non CPU local devices) * @cpumask: cpumask to indicate for which CPUs this device works * @list: list head for the management code + * @owner: module reference */ struct clock_event_device { void (*event_handler)(struct clock_event_device *); @@ -112,6 +114,7 @@ struct clock_event_device { int irq; const struct cpumask *cpumask; struct list_head list; + struct module *owner; } ____cacheline_aligned; /* diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0e3a8448e115..89e394caa769 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -357,6 +357,7 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { + module_put(old->owner); clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 3500caaa0bfd..0e374cd2e0ef 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -73,6 +74,8 @@ void tick_install_broadcast_device(struct clock_event_device *dev) tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) return; + if (!try_module_get(dev->owner)) + return; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); if (cur) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 84c7cfca4d7d..433a1e11d13b 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -257,6 +258,9 @@ void tick_check_new_device(struct clock_event_device *newdev) goto out_bc; } + if (!try_module_get(newdev->owner)) + return; + /* * Replace the eventually existing device by the new * device. If the current device is the broadcast device, do -- cgit v1.2.3 From 501f867064e95f9a6f540e60705be0937280e7ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:49 +0000 Subject: clockevents: Provide sysfs interface Provide a simple sysfs interface for the clockevent devices. Show the current active clockevent device. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.371634778@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 89e394caa769..0a23f4f29934 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -460,4 +461,89 @@ void clockevents_notify(unsigned long reason, void *arg) raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS +struct bus_type clockevents_subsys = { + .name = "clockevents", + .dev_name = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t sysfs_show_current_tick_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct tick_device *td; + ssize_t count = 0; + + raw_spin_lock_irq(&clockevents_lock); + td = tick_get_tick_dev(dev); + if (td && td->evtdev) + count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + raw_spin_unlock_irq(&clockevents_lock); + return count; +} +static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { + .init_name = "broadcast", + .id = 0, + .bus = &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return dev == &tick_bc_dev ? tick_get_broadcast_device() : + &per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ + int err = device_register(&tick_bc_dev); + + if (!err) + err = device_create_file(&tick_bc_dev, &dev_attr_current_device); + return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif + +static int __init tick_init_sysfs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device *dev = &per_cpu(tick_percpu_dev, cpu); + int err; + + dev->id = cpu; + dev->bus = &clockevents_subsys; + err = device_register(dev); + if (!err) + err = device_create_file(dev, &dev_attr_current_device); + if (err) + return err; + } + return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ + int err = subsys_system_register(&clockevents_subsys, NULL); + + if (!err) + err = tick_init_sysfs(); + return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ + +#endif /* GENERIC_CLOCK_EVENTS */ -- cgit v1.2.3 From 45cb8e01b2ecef1c2afb18333e95793fa1a90281 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:50 +0000 Subject: clockevents: Split out selection logic Split out the clockevent device selection logic. Preparatory patch to allow unbinding active clockevent devices. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.431796247@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 25 ++++++++++++---- kernel/time/tick-common.c | 69 +++++++++++++++++++++++--------------------- 2 files changed, 56 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0e374cd2e0ef..d067c01586f5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -65,19 +65,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ +static bool tick_check_broadcast_device(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + return !curdev || newdev->rating > curdev->rating; +} + +/* + * Conditionally install/replace broadcast device + */ void tick_install_broadcast_device(struct clock_event_device *dev) { struct clock_event_device *cur = tick_broadcast_device.evtdev; - if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || - (tick_broadcast_device.evtdev && - tick_broadcast_device.evtdev->rating >= dev->rating) || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) + if (!tick_check_broadcast_device(cur, dev)) return; + if (!try_module_get(dev->owner)) return; - clockevents_exchange_device(tick_broadcast_device.evtdev, dev); + clockevents_exchange_device(cur, dev); if (cur) cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 433a1e11d13b..c34021650348 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -205,6 +205,37 @@ static void tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +static bool tick_check_percpu(struct clock_event_device *curdev, + struct clock_event_device *newdev, int cpu) +{ + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + return false; + if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return true; + /* Check if irq affinity can be set */ + if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) + return false; + /* Prefer an existing cpu local device */ + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) + return false; + return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + /* Prefer oneshot capable device */ + if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { + if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + if (tick_oneshot_mode_active()) + return false; + } + + /* Use the higher rated one */ + return !curdev || newdev->rating > curdev->rating; +} + /* * Check, if the new registered device should be used. Called with * clockevents_lock held and interrupts disabled. @@ -223,40 +254,12 @@ void tick_check_new_device(struct clock_event_device *newdev) curdev = td->evtdev; /* cpu local device ? */ - if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { - - /* - * If the cpu affinity of the device interrupt can not - * be set, ignore it. - */ - if (!irq_can_set_affinity(newdev->irq)) - goto out_bc; - - /* - * If we have a cpu local device already, do not replace it - * by a non cpu local device - */ - if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) - goto out_bc; - } + if (!tick_check_percpu(curdev, newdev, cpu)) + goto out_bc; - /* - * If we have an active device, then check the rating and the oneshot - * feature. - */ - if (curdev) { - /* - * Prefer one shot capable devices ! - */ - if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && - !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) - goto out_bc; - /* - * Check the rating - */ - if (curdev->rating >= newdev->rating) - goto out_bc; - } + /* Preference decision */ + if (!tick_check_preferred(curdev, newdev)) + goto out_bc; if (!try_module_get(newdev->owner)) return; -- cgit v1.2.3 From 03e13cf5ee60584fe0c831682c67212effb7fca4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:50 +0000 Subject: clockevents: Implement unbind functionality Provide a sysfs interface to allow unbinding of clockevent devices. The device is unbound if it is unused or if there is a replacement device available. Unbinding of broadcast devices is not supported as we don't want to foster that nonsense. If no replacement device is available the unbind returns -EBUSY. Unbind is available from the kernel and through sysfs, which is necessary to drop the module refcount. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.499216659@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 1 + kernel/time/clockevents.c | 125 ++++++++++++++++++++++++++++++++++++++++++++ kernel/time/clocksource.c | 9 ++-- kernel/time/tick-common.c | 24 +++++++++ kernel/time/tick-internal.h | 7 +++ 5 files changed, 162 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index ae1193bcf074..0857922e8ad0 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -141,6 +141,7 @@ static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec, extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt); extern void clockevents_register_device(struct clock_event_device *dev); +extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu); extern void clockevents_config(struct clock_event_device *dev, u32 freq); extern void clockevents_config_and_register(struct clock_event_device *dev, diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0a23f4f29934..38959c866789 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -25,6 +25,13 @@ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); + +struct ce_unbind { + struct clock_event_device *ce; + int res; +}; /** * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds @@ -245,6 +252,90 @@ static void clockevents_notify_released(void) } } +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ + struct clock_event_device *dev, *newdev = NULL; + + list_for_each_entry(dev, &clockevent_devices, list) { + if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) + continue; + + if (!tick_check_replacement(newdev, dev)) + continue; + + if (!try_module_get(dev->owner)) + continue; + + if (newdev) + module_put(newdev->owner); + newdev = dev; + } + if (newdev) { + tick_install_replacement(newdev); + list_del_init(&ced->list); + } + return newdev ? 0 : -EBUSY; +} + +/* + * Called with clockevents_mutex and clockevents_lock held + */ +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) +{ + /* Fast track. Device is unused */ + if (ced->mode == CLOCK_EVT_MODE_UNUSED) { + list_del_init(&ced->list); + return 0; + } + + return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; +} + +/* + * SMP function call to unbind a device + */ +static void __clockevents_unbind(void *arg) +{ + struct ce_unbind *cu = arg; + int res; + + raw_spin_lock(&clockevents_lock); + res = __clockevents_try_unbind(cu->ce, smp_processor_id()); + if (res == -EAGAIN) + res = clockevents_replace(cu->ce); + cu->res = res; + raw_spin_unlock(&clockevents_lock); +} + +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ + struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + + smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); + return cu.res; +} + +/* + * Unbind a clockevents device. + */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ + int ret; + + mutex_lock(&clockevents_mutex); + ret = clockevents_unbind(ced, cpu); + mutex_unlock(&clockevents_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(clockevents_unbind); + /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -487,6 +578,38 @@ static ssize_t sysfs_show_current_tick_dev(struct device *dev, } static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); +/* We don't support the abomination of removable broadcast devices */ +static ssize_t sysfs_unbind_tick_dev(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char name[CS_NAME_LEN]; + size_t ret = sysfs_get_uname(buf, name, count); + struct clock_event_device *ce; + + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clockevents_mutex); + raw_spin_lock_irq(&clockevents_lock); + list_for_each_entry(ce, &clockevent_devices, list) { + if (!strcmp(ce->name, name)) { + ret = __clockevents_try_unbind(ce, dev->id); + break; + } + } + raw_spin_unlock_irq(&clockevents_lock); + /* + * We hold clockevents_mutex, so ce can't go away + */ + if (ret == -EAGAIN) + ret = clockevents_unbind(ce, dev->id); + mutex_unlock(&clockevents_mutex); + return ret ? ret : count; +} +static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { .init_name = "broadcast", @@ -529,6 +652,8 @@ static int __init tick_init_sysfs(void) err = device_register(dev); if (!err) err = device_create_file(dev, &dev_attr_current_device); + if (!err) + err = device_create_file(dev, &dev_attr_unbind_device); if (err) return err; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 31b90332f47b..6d05b00410cc 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,6 +31,8 @@ #include #include +#include "tick-internal.h" + void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, u64 start_tstamp) @@ -174,7 +176,6 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -#define CS_NAME_LEN 32 static char override_name[CS_NAME_LEN]; static int finished_booting; @@ -864,7 +865,7 @@ sysfs_show_current_clocksources(struct device *dev, return count; } -static size_t clocksource_get_uname(const char *buf, char *dst, size_t cnt) +size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) { size_t ret = cnt; @@ -899,7 +900,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, mutex_lock(&clocksource_mutex); - ret = clocksource_get_uname(buf, override_name, count); + ret = sysfs_get_uname(buf, override_name, count); if (ret >= 0) clocksource_select(); @@ -925,7 +926,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, char name[CS_NAME_LEN]; size_t ret; - ret = clocksource_get_uname(buf, name, count); + ret = sysfs_get_uname(buf, name, count); if (ret < 0) return ret; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index c34021650348..5edfb4806032 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -205,6 +205,17 @@ static void tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +void tick_install_replacement(struct clock_event_device *newdev) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + int cpu = smp_processor_id(); + + clockevents_exchange_device(td->evtdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); +} + static bool tick_check_percpu(struct clock_event_device *curdev, struct clock_event_device *newdev, int cpu) { @@ -236,6 +247,19 @@ static bool tick_check_preferred(struct clock_event_device *curdev, return !curdev || newdev->rating > curdev->rating; } +/* + * Check whether the new device is a better fit than curdev. curdev + * can be NULL ! + */ +bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if (tick_check_percpu(curdev, newdev, smp_processor_id())) + return false; + + return tick_check_preferred(curdev, newdev); +} + /* * Check, if the new registered device should be used. Called with * clockevents_lock held and interrupts disabled. diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 06bfc8802dfb..be1690eaecff 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -11,6 +11,8 @@ extern seqlock_t jiffies_lock; #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 +#define CS_NAME_LEN 32 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; extern ktime_t tick_period; @@ -23,9 +25,14 @@ extern void tick_handover_do_timer(int *cpup); extern void tick_shutdown(unsigned int *cpup); extern void tick_suspend(void); extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); +extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); + /* * NO_HZ / high resolution timer shared code */ -- cgit v1.2.3 From bdc7119f1bdd0632d42f435941dc290216a436e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: make cgroup_is_removed() static cgroup_is_removed() no longer has external users and it shouldn't grow any - controllers should deal with cgroup_subsys_state on/offline state instead of cgroup removal state. Make it static. While at it, make it return bool. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1df5f699be61..8d9f3c911fca 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -538,7 +538,6 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_is_removed(const struct cgroup *cgrp); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a19419f4af1a..501974823b33 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -226,7 +226,7 @@ static int css_refcnt(struct cgroup_subsys_state *css) } /* convenient tests for these bits */ -inline int cgroup_is_removed(const struct cgroup *cgrp) +static inline bool cgroup_is_removed(const struct cgroup *cgrp) { return test_bit(CGRP_REMOVED, &cgrp->flags); } -- cgit v1.2.3 From 53fa5261747a90746531e8a1c81eeb78fedc2f71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: add cgroup->serial_nr and implement cgroup_next_sibling() Currently, there's no easy way to find out the next sibling cgroup unless it's known that the current cgroup is accessed from the parent's children list in a single RCU critical section. This in turn forces all iterators to require whole iteration to be enclosed in a single RCU critical section, which sometimes is too restrictive. This patch implements cgroup_next_sibling() which can reliably determine the next sibling regardless of the state of the current cgroup as long as it's accessible. It currently is impossible to determine the next sibling after dropping RCU read lock because the cgroup being iterated could be removed anytime and if RCU read lock is dropped, nothing guarantess its ->sibling.next pointer is accessible. A removed cgroup would continue to point to its next sibling for RCU accesses but stop receiving updates from the sibling. IOW, the next sibling could be removed and then complete its grace period while RCU read lock is dropped, making it unsafe to dereference ->sibling.next after dropping and re-acquiring RCU read lock. This can be solved by adding a way to traverse to the next sibling without dereferencing ->sibling.next. This patch adds a monotonically increasing cgroup serial number, cgroup->serial_nr, which guarantees that all cgroup->children lists are kept in increasing serial_nr order. A new function, cgroup_next_sibling(), is implemented, which, if CGRP_REMOVED is not set on the current cgroup, follows ->sibling.next; otherwise, traverses the parent's ->children list until it sees a sibling with higher ->serial_nr. This allows the function to always return the next sibling regardless of the state of the current cgroup without adding overhead in the fast path. Further patches will update the iterators to use cgroup_next_sibling() so that they allow dropping RCU read lock and blocking while iteration is in progress which in turn will be used to simplify controllers. v2: Typo fix as per Serge. Signed-off-by: Tejun Heo Acked-by: Serge E. Hallyn --- include/linux/cgroup.h | 10 ++++++++ kernel/cgroup.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8d9f3c911fca..ee041a01a67e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -188,6 +188,14 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct dentry *dentry; /* cgroup fs entry, RCU protected */ + /* + * Monotonically increasing unique serial number which defines a + * uniform order among all cgroups. It's guaranteed that all + * ->children lists are in the ascending order of ->serial_nr. + * It's used to allow interrupting and resuming iterations. + */ + u64 serial_nr; + /* * This is a copy of dentry->d_name, and it's needed because * we can't use dentry->d_name in cgroup_path(). @@ -675,6 +683,8 @@ static inline struct cgroup* task_cgroup(struct task_struct *task, return task_subsys_state(task, subsys_id)->cgroup; } +struct cgroup *cgroup_next_sibling(struct cgroup *pos); + /** * cgroup_for_each_child - iterate through children of a cgroup * @pos: the cgroup * to use as the loop cursor diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 501974823b33..b87c7a5a5497 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2975,6 +2975,55 @@ static void cgroup_enable_task_cg_lists(void) write_unlock(&css_set_lock); } +/** + * cgroup_next_sibling - find the next sibling of a given cgroup + * @pos: the current cgroup + * + * This function returns the next sibling of @pos and should be called + * under RCU read lock. The only requirement is that @pos is accessible. + * The next sibling is guaranteed to be returned regardless of @pos's + * state. + */ +struct cgroup *cgroup_next_sibling(struct cgroup *pos) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* + * @pos could already have been removed. Once a cgroup is removed, + * its ->sibling.next is no longer updated when its next sibling + * changes. As CGRP_REMOVED is set on removal which is fully + * serialized, if we see it unasserted, it's guaranteed that the + * next sibling hasn't finished its grace period even if it's + * already removed, and thus safe to dereference from this RCU + * critical section. If ->sibling.next is inaccessible, + * cgroup_is_removed() is guaranteed to be visible as %true here. + */ + if (likely(!cgroup_is_removed(pos))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + if (&next->sibling != &pos->parent->children) + return next; + return NULL; + } + + /* + * Can't dereference the next pointer. Each cgroup is given a + * monotonically increasing unique serial number and always + * appended to the sibling list, so the next one can be found by + * walking the parent's children until we see a cgroup with higher + * serial number than @pos's. + * + * While this path can be slow, it's taken only when either the + * current cgroup is removed or iteration and removal race. + */ + list_for_each_entry_rcu(next, &pos->parent->children, sibling) + if (next->serial_nr > pos->serial_nr) + return next; + return NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_sibling); + /** * cgroup_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) @@ -4137,6 +4186,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { + static atomic64_t serial_nr_cursor = ATOMIC64_INIT(0); struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4217,6 +4267,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; lockdep_assert_held(&dentry->d_inode->i_mutex); + /* + * Assign a monotonically increasing serial number. With the list + * appending below, it guarantees that sibling cgroups are always + * sorted in the ascending serial number order on the parent's + * ->children. + */ + cgrp->serial_nr = atomic64_inc_return(&serial_nr_cursor); + /* allocation complete, commit to creation */ list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); @@ -4304,6 +4362,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * removed. This makes future css_tryget() and child creation * attempts fail thus maintaining the removal conditions verified * above. + * + * Note that CGRP_REMVOED clearing is depended upon by + * cgroup_next_sibling() to resume iteration after dropping RCU + * read lock. See cgroup_next_sibling() for details. */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; -- cgit v1.2.3 From 75501a6d59e989e5c286716e5b3b66ace4660e83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: update iterators to use cgroup_next_sibling() This patch converts cgroup_for_each_child(), cgroup_next_descendant_pre/post() and thus cgroup_for_each_descendant_pre/post() to use cgroup_next_sibling() instead of manually dereferencing ->sibling.next. The only reason the iterators couldn't allow dropping RCU read lock while iteration is in progress was because they couldn't determine the next sibling safely once RCU read lock is dropped. Using cgroup_next_sibling() removes that problem and enables all iterators to allow dropping RCU read lock in the middle. Comments are updated accordingly. This makes the iterators easier to use and will simplify controllers. Note that @cgroup argument is renamed to @cgrp in cgroup_for_each_child() because it conflicts with "struct cgroup" used in the new macro body. Signed-off-by: Tejun Heo Acked-by: Serge E. Hallyn Reviewed-by: Michal Hocko --- include/linux/cgroup.h | 18 ++++++++++++++---- kernel/cgroup.c | 25 +++++++++++++++++++------ 2 files changed, 33 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ee041a01a67e..d0ad3794b947 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -688,9 +688,9 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos); /** * cgroup_for_each_child - iterate through children of a cgroup * @pos: the cgroup * to use as the loop cursor - * @cgroup: cgroup whose children to walk + * @cgrp: cgroup whose children to walk * - * Walk @cgroup's children. Must be called under rcu_read_lock(). A child + * Walk @cgrp's children. Must be called under rcu_read_lock(). A child * cgroup which hasn't finished ->css_online() or already has finished * ->css_offline() may show up during traversal and it's each subsystem's * responsibility to verify that each @pos is alive. @@ -698,9 +698,15 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos); * If a subsystem synchronizes against the parent in its ->css_online() and * before starting iterating, a cgroup which finished ->css_online() is * guaranteed to be visible in the future iterations. + * + * It is allowed to temporarily drop RCU read lock during iteration. The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. */ -#define cgroup_for_each_child(pos, cgroup) \ - list_for_each_entry_rcu(pos, &(cgroup)->children, sibling) +#define cgroup_for_each_child(pos, cgrp) \ + for ((pos) = list_first_or_null_rcu(&(cgrp)->children, \ + struct cgroup, sibling); \ + (pos); (pos) = cgroup_next_sibling((pos))) struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, struct cgroup *cgroup); @@ -759,6 +765,10 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. + * + * It is allowed to temporarily drop RCU read lock during iteration. The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. */ #define cgroup_for_each_descendant_pre(pos, cgroup) \ for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos); \ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b87c7a5a5497..fefc41c1a147 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3031,6 +3031,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_sibling); * * To be used by cgroup_for_each_descendant_pre(). Find the next * descendant to visit for pre-order traversal of @cgroup's descendants. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. */ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, struct cgroup *cgroup) @@ -3050,11 +3055,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != cgroup) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, - sibling); - if (&next->sibling != &pos->parent->children) + next = cgroup_next_sibling(pos); + if (next) return next; - pos = pos->parent; } @@ -3069,6 +3072,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); * Return the rightmost descendant of @pos. If there's no descendant, * @pos is returned. This can be used during pre-order traversal to skip * subtree of @pos. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct rightmost descendant as long as @pos is + * accessible. */ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) { @@ -3108,6 +3116,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) * * To be used by cgroup_for_each_descendant_post(). Find the next * descendant to visit for post-order traversal of @cgroup's descendants. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. */ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, struct cgroup *cgroup) @@ -3123,8 +3136,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); - if (&next->sibling != &pos->parent->children) + next = cgroup_next_sibling(pos); + if (next) return cgroup_leftmost_descendant(next); /* no sibling left, visit parent */ -- cgit v1.2.3 From c7e99fc75de8882bc4104455ace366d9d3599a96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:28:02 +0200 Subject: clockevents: Define CS_NAME_LEN unconditionally Unbreak architectures which do not use clockevents, but require to build some of the core timekeeping infrastructure Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/tick-internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index be1690eaecff..bc906cad709b 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,13 +6,13 @@ extern seqlock_t jiffies_lock; +#define CS_NAME_LEN 32 + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 -#define CS_NAME_LEN 32 - DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; extern ktime_t tick_period; -- cgit v1.2.3 From 1eaff67266b6b6c97bbd33cf2c20577822836413 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:48:46 +0200 Subject: clocksource: Implement clocksource_select_fallback() for CONFIG_ARCH_USES_GETTIMEOFFSET=y commit 7eaeb34305 (clocksource: Provide unbind interface in sysfs) implemented clocksource_select_fallback() which is not defined for CONFIG_ARCH_USES_GETTIMEOFFSET=y. Add an empty inline function for that. Reported-by: Ingo Molnar Reported-by: fengguang.wu@intel.com Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6d05b00410cc..e713ef7d19a7 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -632,6 +632,7 @@ static void clocksource_select_fallback(void) #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } +static inline void clocksource_select_fallback(void) { } #endif -- cgit v1.2.3 From 6cffe00f7d4e24679eae6b7aae4caaf915288256 Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Wed, 15 May 2013 14:38:11 -0700 Subject: alarmtimer: Add functions for timerfd support Add functions needed for hooking up alarmtimer to timerfd: * alarm_restart: Similar to hrtimer_restart, restart an alarmtimer after the expires time has already been updated (as with alarm_forward). * alarm_forward_now: Similar to hrtimer_forward_now, move the expires time forward to an interval from the current time of the associated clock. * alarm_start_relative: Start an alarmtimer with an expires time relative to the current time of the associated clock. * alarm_expires_remaining: Similar to hrtimer_expires_remaining, return the amount of time remaining until alarm expiry. Signed-off-by: Todd Poynor Signed-off-by: John Stultz --- include/linux/alarmtimer.h | 4 ++++ kernel/time/alarmtimer.c | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 9069694e70eb..a899402a5a0e 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -44,10 +44,14 @@ struct alarm { void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); int alarm_start(struct alarm *alarm, ktime_t start); +int alarm_start_relative(struct alarm *alarm, ktime_t start); +void alarm_restart(struct alarm *alarm); int alarm_try_to_cancel(struct alarm *alarm); int alarm_cancel(struct alarm *alarm); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval); +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval); +ktime_t alarm_expires_remaining(const struct alarm *alarm); /* Provide way to access the rtc device being used by alarmtimers */ struct rtc_device *alarmtimer_get_rtcdev(void); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f11d83b12949..3e5cba274475 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -199,6 +199,12 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + return ktime_sub(alarm->node.expires, base->gettime()); +} + #ifdef CONFIG_RTC_CLASS /** * alarmtimer_suspend - Suspend time callback @@ -305,7 +311,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, } /** - * alarm_start - Sets an alarm to fire + * alarm_start - Sets an absolute alarm to fire * @alarm: ptr to alarm to set * @start: time to run the alarm */ @@ -324,6 +330,31 @@ int alarm_start(struct alarm *alarm, ktime_t start) return ret; } +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +int alarm_start_relative(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + start = ktime_add(start, base->gettime()); + return alarm_start(alarm, start); +} + +void alarm_restart(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + hrtimer_restart(&alarm->timer); + alarmtimer_enqueue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); +} + /** * alarm_try_to_cancel - Tries to cancel an alarm timer * @alarm: ptr to alarm to be canceled @@ -394,6 +425,12 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) return overrun; } +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + return alarm_forward(alarm, base->gettime(), interval); +} -- cgit v1.2.3 From 5c83545f24ab3dd67e0ae0e2b795fea750f08c34 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Tue, 21 May 2013 22:32:14 -0700 Subject: power: Add option to log time spent in suspend Below is a patch from android kernel that maintains a histogram of suspend times. Please review and provide feedback. Statistices on the time spent in suspend are kept in /sys/kernel/debug/sleep_time. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Colin Cross Signed-off-by: Todd Poynor [zoran.markovic@linaro.org: Re-formatted suspend time table to better fit expected values. Moved accounting of suspend time into timekeeping core. Removed CONFIG_SUSPEND_TIME flag and made the feature conditional on CONFIG_DEBUG_FS. Changed the file name to sleep_time to better fit terminology in timekeeping core. Changed seq_printf to seq_puts. Tweaked commit message] Signed-off-by: Zoran Markovic Signed-off-by: John Stultz --- kernel/time/Makefile | 1 + kernel/time/timekeeping.c | 2 ++ kernel/time/timekeeping_debug.c | 72 ++++++++++++++++++++++++++++++++++++++ kernel/time/timekeeping_internal.h | 14 ++++++++ 4 files changed, 89 insertions(+) create mode 100644 kernel/time/timekeeping_debug.c create mode 100644 kernel/time/timekeeping_internal.h (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ff7d9d2ab504..d52ac8bf0006 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o +obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 933efa4071c3..838fc0777b68 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,6 +25,7 @@ #include "tick-internal.h" #include "ntp_internal.h" +#include "timekeeping_internal.h" static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -851,6 +852,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_debug_account_sleep_time(delta); } /** diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 000000000000..802433a4f5eb --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,72 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include + +static unsigned int sleep_time_bin[32] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_puts(s, " time (secs) count\n"); + seq_puts(s, "------------------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (sleep_time_bin[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 1 << (bin - 1) : 0, 1 << bin, + sleep_time_bin[bin]); + } + return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ + return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { + .open = tk_debug_sleep_time_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); + if (!d) { + pr_err("Failed to create sleep_time debug file\n"); + return -ENOMEM; + } + + return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(struct timespec *t) +{ + sleep_time_bin[fls(t->tv_sec)]++; +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 000000000000..13323ea08ffa --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,14 @@ +#ifndef _TIMEKEEPING_INTERNAL_H +#define _TIMEKEEPING_INTERNAL_H +/* + * timekeeping debug functions + */ +#include + +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(struct timespec *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#endif /* _TIMEKEEPING_INTERNAL_H */ -- cgit v1.2.3 From cd38ca854de15b26eb91009137cbe157d8a8e773 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 3 Jun 2013 18:20:29 +0000 Subject: PM / Hibernate: print physical addresses consistently with other parts of kernel Print physical address info in a style consistent with the %pR style used elsewhere in the kernel. Commit 69f1d475cc did this for a similar printk in this file, but I must have missed this one. Signed-off-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0de28576807d..7872a35eafe7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -642,8 +642,9 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); Report: - printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", - start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); + printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); } /* -- cgit v1.2.3 From f12dc020149fad7087e119e54cffea668272bf7d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jun 2013 19:13:02 -0700 Subject: cgroup: mark "tasks" cgroup file as insane Some resources controlled by cgroup aren't per-task and cgroup core allowing threads of a single thread_group to be in different cgroups forced memcg do explicitly find the group leader and use it. This is gonna be nasty when transitioning to unified hierarchy and in general we don't want and won't support granularity finer than processes. Mark "tasks" with CFTYPE_INSANE. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: cgroups@vger.kernel.org Cc: Vivek Goyal --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fefc41c1a147..1e0f445b5b88 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4037,6 +4037,7 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, static struct cftype files[] = { { .name = "tasks", + .flags = CFTYPE_INSANE, /* use "procs" instead */ .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, .release = cgroup_pidlist_release, -- cgit v1.2.3 From cc5943a7816ba6c00639837a62131386619548dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jun 2013 19:13:55 -0700 Subject: cgroup: mark "notify_on_release" and "release_agent" cgroup files insane The empty cgroup notification mechanism currently implemented in cgroup is tragically outdated. Forking and execing userland process stopped being a viable notification mechanism more than a decade ago. We're gonna have a saner mechanism. Let's make it clear that this abomination is going away. Mark "notify_on_release" and "release_agent" with CFTYPE_INSANE. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1e0f445b5b88..b3bb8a393642 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4052,6 +4052,7 @@ static struct cftype files[] = { }, { .name = "notify_on_release", + .flags = CFTYPE_INSANE, .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, @@ -4073,7 +4074,7 @@ static struct cftype files[] = { }, { .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, -- cgit v1.2.3 From d5c56ced775f6bdc32b689b01c9c4f9b66e18610 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jun 2013 19:14:34 -0700 Subject: cgroup: clean up the cftype array for the base cgroup files * Rename it from files[] (really?) to cgroup_base_files[]. * Drop CGROUP_FILE_GENERIC_PREFIX which was defined as "cgroup." and used inconsistently. Just use "cgroup." directly. * Collect insane files at the end. Note that only the insane ones are missing "cgroup." prefix. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b3bb8a393642..bc53d5014b28 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4029,35 +4029,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, return 0; } -/* - * for the common functions, 'private' gives the type of file - */ -/* for hysterical raisins, we can't put this on the older files */ -#define CGROUP_FILE_GENERIC_PREFIX "cgroup." -static struct cftype files[] = { +static struct cftype cgroup_base_files[] = { { - .name = "tasks", - .flags = CFTYPE_INSANE, /* use "procs" instead */ - .open = cgroup_tasks_open, - .write_u64 = cgroup_tasks_write, - .release = cgroup_pidlist_release, - .mode = S_IRUGO | S_IWUSR, - }, - { - .name = CGROUP_FILE_GENERIC_PREFIX "procs", + .name = "cgroup.procs", .open = cgroup_procs_open, .write_u64 = cgroup_procs_write, .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { - .name = "notify_on_release", - .flags = CFTYPE_INSANE, - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = CGROUP_FILE_GENERIC_PREFIX "event_control", + .name = "cgroup.event_control", .write_string = cgroup_write_event_control, .mode = S_IWUGO, }, @@ -4072,6 +4053,26 @@ static struct cftype files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .read_seq_string = cgroup_sane_behavior_show, }, + + /* + * Historical crazy stuff. These don't have "cgroup." prefix and + * don't exist if sane_behavior. If you're depending on these, be + * prepared to be burned. + */ + { + .name = "tasks", + .flags = CFTYPE_INSANE, /* use "procs" instead */ + .open = cgroup_tasks_open, + .write_u64 = cgroup_tasks_write, + .release = cgroup_pidlist_release, + .mode = S_IRUGO | S_IWUSR, + }, + { + .name = "notify_on_release", + .flags = CFTYPE_INSANE, + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, @@ -4095,7 +4096,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, struct cgroup_subsys *ss; if (base_files) { - err = cgroup_addrm_files(cgrp, NULL, files, true); + err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); if (err < 0) return err; } -- cgit v1.2.3 From 06d6b3cbdf94bc37732df83e7c25774370411a56 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:11 +0800 Subject: cpuset: remove redundant check in cpuset_cpus_allowed_fallback() task_cs() will never return NULL. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b3f791bbe5..f0c884a0e574 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2253,8 +2253,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs = task_cs(tsk); - if (cs) - do_set_cpus_allowed(tsk, cs->cpus_allowed); + do_set_cpus_allowed(tsk, cs->cpus_allowed); rcu_read_unlock(); /* -- cgit v1.2.3 From 40df2deb50570b288b7067b111af0aa9ca640e6f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:23 +0800 Subject: cpuset: cleanup guarantee_online_{cpus|mems}() - We never pass a NULL @cs to these functions. - The top cpuset always has some online cpus/mems. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f0c884a0e574..d753837cca33 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -304,53 +304,38 @@ static struct file_system_type cpuset_fs_type = { /* * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. If we get - * all the way to the top and still haven't found any online cpus, - * return cpu_online_mask. Or if passed a NULL cs from an exit'ing - * task, return cpu_online_mask. + * until we find one that does have some online cpus. The top + * cpuset always has some cpus online. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * * Call with callback_mutex held. */ - static void guarantee_online_cpus(const struct cpuset *cs, struct cpumask *pmask) { - while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) + while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = parent_cs(cs); - if (cs) - cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); - else - cpumask_copy(pmask, cpu_online_mask); - BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); + cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); } /* * Return in *pmask the portion of a cpusets's mems_allowed that * are online, with memory. If none are online with memory, walk * up the cpuset hierarchy until we find one that does have some - * online mems. If we get all the way to the top and still haven't - * found any online mems, return node_states[N_MEMORY]. + * online mems. The top cpuset always has some mems online. * * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * * Call with callback_mutex held. */ - static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) { - while (cs && !nodes_intersects(cs->mems_allowed, - node_states[N_MEMORY])) + while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) cs = parent_cs(cs); - if (cs) - nodes_and(*pmask, cs->mems_allowed, - node_states[N_MEMORY]); - else - *pmask = node_states[N_MEMORY]; - BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); + nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); } /* -- cgit v1.2.3 From 67bd2c59850de20d0ecdc8084cbbfe34e53b6804 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:35 +0800 Subject: cpuset: remove unnecessary variable in cpuset_attach() We can just use oldcs->mems_allowed. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d753837cca33..dbef832e5e2d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1407,8 +1407,7 @@ static cpumask_var_t cpus_attach; static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { - /* static bufs protected by cpuset_mutex */ - static nodemask_t cpuset_attach_nodemask_from; + /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct mm_struct *mm; struct task_struct *task; @@ -1442,13 +1441,12 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * Change mm, possibly for multiple threads in a threadgroup. This is * expensive and may sleep. */ - cpuset_attach_nodemask_from = oldcs->mems_allowed; cpuset_attach_nodemask_to = cs->mems_allowed; mm = get_task_mm(leader); if (mm) { mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, + cpuset_migrate_mm(mm, &oldcs->mems_allowed, &cpuset_attach_nodemask_to); mmput(mm); } -- cgit v1.2.3 From 249cc86db7492dc8de1d2eddebc6bcc4ab2a8e9e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:48 +0800 Subject: cpuset: remove cpuset_test_cpumask() The test is done in set_cpus_allowed_ptr(), so it's redundant. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index dbef832e5e2d..51f8e1d5a2a9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -783,23 +783,6 @@ void rebuild_sched_domains(void) mutex_unlock(&cpuset_mutex); } -/** - * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's - * @tsk: task to test - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner - * - * Call with cpuset_mutex held. May take callback_mutex during call. - * Called for each task in a cgroup by cgroup_scan_tasks(). - * Return nonzero if this tasks's cpus_allowed mask should be changed (in other - * words, if its mask is not equal to its cpuset's mask). - */ -static int cpuset_test_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - return !cpumask_equal(&tsk->cpus_allowed, - (cgroup_cs(scan->cg))->cpus_allowed); -} - /** * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's * @tsk: task to test @@ -835,7 +818,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) struct cgroup_scanner scan; scan.cg = cs->css.cgroup; - scan.test_task = cpuset_test_cpumask; + scan.test_task = NULL; scan.process_task = cpuset_change_cpumask; scan.heap = heap; cgroup_scan_tasks(&scan); -- cgit v1.2.3 From a73456f37b9dbc917398387d0cba926b4455b70f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:59 +0800 Subject: cpuset: re-structure update_cpumask() a bit Check if cpus_allowed is to be changed before calling validate_change(). This won't change any behavior, but later it will allow us to do this: # mkdir /cpuset/child # echo $$ > /cpuset/child/tasks /* empty cpuset */ # echo > /cpuset/child/cpuset.cpus /* do nothing, won't fail */ Without this patch, the last operation will fail. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 51f8e1d5a2a9..535dce685eec 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -856,14 +856,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) return -EINVAL; } - retval = validate_change(cs, trialcs); - if (retval < 0) - return retval; /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; + retval = validate_change(cs, trialcs); + if (retval < 0) + return retval; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); if (retval) return retval; -- cgit v1.2.3 From e44193d39e8d4d1de5d996fcd37ed75e5c704f10 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 9 Jun 2013 17:14:22 +0800 Subject: cpuset: let hotplug propagation work wait for task attaching Instead of triggering propagation work in cpuset_attach(), we make hotplug propagation work wait until there's no task attaching in progress. IMO this is more robust. We won't see empty masks in cpuset_attach(). Also it's a preparation for removing propagation work. Without asynchronous propagation we can't call move_tasks_in_empty_cpuset() in cpuset_attach(), because otherwise we'll deadlock on cgroup_mutex. tj: typo fixes. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 535dce685eec..e902473f76bf 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -59,6 +59,7 @@ #include #include #include +#include /* * Tracks how many cpusets are currently defined in system. @@ -275,6 +276,8 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); +static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); + /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we @@ -1436,14 +1439,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) } cs->attach_in_progress--; - - /* - * We may have raced with CPU/memory hotunplug. Trigger hotplug - * propagation if @cs doesn't have any CPU or memory. It will move - * the newly added tasks to the nearest parent which can execute. - */ - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - schedule_cpuset_propagate_hotplug(cs); + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); } @@ -1555,10 +1552,6 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, * resources, wait for the previously scheduled operations before * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. - * - * Flushing cpuset_hotplug_work is enough to synchronize against - * hotplug hanlding; however, cpuset_attach() may schedule - * propagation work directly. Flush the workqueue too. */ flush_work(&cpuset_hotplug_work); flush_workqueue(cpuset_propagate_hotplug_wq); @@ -2005,8 +1998,20 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work) struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); bool is_empty; +retry: + wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); + mutex_lock(&cpuset_mutex); + /* + * We have raced with task attaching. We wait until attaching + * is finished, so we won't attach a task to an empty cpuset. + */ + if (cs->attach_in_progress) { + mutex_unlock(&cpuset_mutex); + goto retry; + } + cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); -- cgit v1.2.3 From 388afd8549dc8be0920e00ae9404341593b6bd7c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 9 Jun 2013 17:14:47 +0800 Subject: cpuset: remove async hotplug propagation work As we can drop rcu read lock while iterating cgroup hierarchy, we don't have to do propagation asynchronously via workqueue. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 69 +++++++++++++-------------------------------------------- 1 file changed, 16 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e902473f76bf..608fe1308b22 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -101,8 +101,6 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - - struct work_struct hotplug_work; }; /* Retrieve the cpuset for a cgroup */ @@ -268,12 +266,7 @@ static DEFINE_MUTEX(callback_mutex); /* * CPU / memory hotplug is handled asynchronously. */ -static struct workqueue_struct *cpuset_propagate_hotplug_wq; - static void cpuset_hotplug_workfn(struct work_struct *work); -static void cpuset_propagate_hotplug_workfn(struct work_struct *work); -static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); - static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); @@ -1554,7 +1547,6 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, * after execution capability is restored. */ flush_work(&cpuset_hotplug_work); - flush_workqueue(cpuset_propagate_hotplug_wq); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) @@ -1821,7 +1813,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); - INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); cs->relax_domain_level = -1; return &cs->css; @@ -1984,18 +1975,17 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } /** - * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset + * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest * * Compare @cs's cpu and mem masks against top_cpuset and if some have gone * offline, update @cs accordingly. If @cs ends up with no CPU or memory, * all its tasks are moved to the nearest ancestor with both resources. */ -static void cpuset_propagate_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug_update_tasks(struct cpuset *cs) { static cpumask_t off_cpus; static nodemask_t off_mems, tmp_mems; - struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); bool is_empty; retry: @@ -2044,34 +2034,6 @@ retry: */ if (is_empty) remove_tasks_in_empty_cpuset(cs); - - /* the following may free @cs, should be the last operation */ - css_put(&cs->css); -} - -/** - * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset - * @cs: cpuset of interest - * - * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and - * memory masks according to top_cpuset. - */ -static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) -{ - /* - * Pin @cs. The refcnt will be released when the work item - * finishes executing. - */ - if (!css_tryget(&cs->css)) - return; - - /* - * Queue @cs->hotplug_work. If already pending, lose the css ref. - * cpuset_propagate_hotplug_wq is ordered and propagation will - * happen in the order this function is called. - */ - if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) - css_put(&cs->css); } /** @@ -2084,8 +2046,8 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) * actively using CPU hotplug but making no active use of cpusets. * * Non-root cpusets are only affected by offlining. If any CPUs or memory - * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all - * descendants. + * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on + * all descendants. * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. @@ -2128,21 +2090,26 @@ static void cpuset_hotplug_workfn(struct work_struct *work) update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); } + mutex_unlock(&cpuset_mutex); + /* if cpus or mems went down, we need to propagate to descendants */ if (cpus_offlined || mems_offlined) { struct cpuset *cs; struct cgroup *pos_cgrp; rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) - schedule_cpuset_propagate_hotplug(cs); - rcu_read_unlock(); - } + cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { + if (!css_tryget(&cs->css)) + continue; + rcu_read_unlock(); - mutex_unlock(&cpuset_mutex); + cpuset_hotplug_update_tasks(cs); - /* wait for propagations to finish */ - flush_workqueue(cpuset_propagate_hotplug_wq); + rcu_read_lock(); + css_put(&cs->css); + } + rcu_read_unlock(); + } /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated) @@ -2193,10 +2160,6 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); - - cpuset_propagate_hotplug_wq = - alloc_ordered_workqueue("cpuset_hotplug", 0); - BUG_ON(!cpuset_propagate_hotplug_wq); } /** -- cgit v1.2.3 From 5e1cda5b8ae93f5f02e8c5a30390ac9b4d2c20e6 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 29 May 2013 03:10:53 +0100 Subject: irqdomain: Relax failure path on setting up mappings Commit 98aa468e, "irqdomain: Support for static IRQ mapping and association" introduced an API for directly associating blocks of hwirqs to linux irqs. However, if any irq in that block failed to map (say if the mapping functions returns an error because the irq is already mapped) then the whole thing will fail and roll back. This is probably too aggressive since there are valid reasons why a mapping may fail. ie. Firmware may have a particular IRQ marked as unusable. This patch drops the error path out of irq_domain_associate(). If a mapping fails, then it is simply skipped. There is no reason to fail the entire allocation. v2: Still output an information message on failed mappings and make sure attempted mapping gets cleared out of the irq_data structure. Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner --- kernel/irq/irqdomain.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 20b677dd0b27..61d6d3c80fee 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -464,23 +464,15 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, /* * If map() returns -EPERM, this interrupt is protected * by the firmware or some other service and shall not - * be mapped. - * - * Since on some platforms we blindly try to map everything - * we end up with a log full of backtraces. - * - * So instead, we silently fail on -EPERM, it is the - * responsibility of the PIC driver to display a relevant - * message if needed. + * be mapped. Don't bother telling the user about it. */ if (ret != -EPERM) { - pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", - virq, hwirq, ret); - WARN_ON(1); + pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", + of_node_full_name(domain->of_node), hwirq, virq, ret); } irq_data->domain = NULL; irq_data->hwirq = 0; - goto err_unmap; + continue; } } -- cgit v1.2.3 From 9bbf877d3b6b8c5991000296f40a3f0fe66fa89b Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 12:10:24 +0100 Subject: irqdomain: Replace LEGACY mapping with LINEAR The LEGACY mapping unnecessarily complicates the irqdomain code and can easily be implemented with a linear mapping. By ripping it out and replacing it with the LINEAR mapping the object size of irqdomain.c shrinks by about 330 bytes (ARMv7) which offsets the additional allocation required by the linear map. It also makes it possible for current LEGACY map users to pre-allocate irq_descs for a subset of the hwirqs and dynamically allocate the rest as needed. Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Rob Herring --- include/linux/irqdomain.h | 7 ---- kernel/irq/irqdomain.c | 84 ++++------------------------------------------- 2 files changed, 6 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index ba2c708adcff..6f062416e5bf 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -93,11 +93,6 @@ struct irq_domain { /* type of reverse mapping_technique */ unsigned int revmap_type; union { - struct { - unsigned int size; - unsigned int first_irq; - irq_hw_number_t first_hwirq; - } legacy; struct { unsigned int size; unsigned int *revmap; @@ -117,8 +112,6 @@ struct irq_domain { struct irq_domain_chip_generic *gc; }; -#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. - * ie. legacy 8259, gets irqs 1..15 */ #define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ #define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ #define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 61d6d3c80fee..1ac8cf41b9a5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -82,13 +82,6 @@ void irq_domain_remove(struct irq_domain *domain) mutex_lock(&irq_domain_mutex); switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - /* - * Legacy domains don't manage their own irq_desc - * allocations, we expect the caller to handle irq_desc - * freeing on their own. - */ - break; case IRQ_DOMAIN_MAP_TREE: /* * radix_tree_delete() takes care of destroying the root @@ -122,17 +115,6 @@ void irq_domain_remove(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_domain_remove); -static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, - irq_hw_number_t hwirq) -{ - irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; - int size = domain->revmap_data.legacy.size; - - if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) - return 0; - return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; -} - /** * irq_domain_add_simple() - Allocate and register a simple irq_domain. * @of_node: pointer to interrupt controller's device tree node. @@ -213,57 +195,17 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, void *host_data) { struct irq_domain *domain; - unsigned int i; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); + pr_debug("Setting up legacy domain virq[%i:%i] ==> hwirq[%i:%i]\n", + first_irq, first_irq + size - 1, + (int)first_hwirq, (int)first_hwirq + size -1); + + domain = irq_domain_add_linear(of_node, first_hwirq + size, ops, host_data); if (!domain) return NULL; - domain->revmap_data.legacy.first_irq = first_irq; - domain->revmap_data.legacy.first_hwirq = first_hwirq; - domain->revmap_data.legacy.size = size; - - mutex_lock(&irq_domain_mutex); - /* Verify that all the irqs are available */ - for (i = 0; i < size; i++) { - int irq = first_irq + i; - struct irq_data *irq_data = irq_get_irq_data(irq); - - if (WARN_ON(!irq_data || irq_data->domain)) { - mutex_unlock(&irq_domain_mutex); - irq_domain_free(domain); - return NULL; - } - } + WARN_ON(irq_domain_associate_many(domain, first_irq, first_hwirq, size)); - /* Claim all of the irqs before registering a legacy domain */ - for (i = 0; i < size; i++) { - struct irq_data *irq_data = irq_get_irq_data(first_irq + i); - irq_data->hwirq = first_hwirq + i; - irq_data->domain = domain; - } - mutex_unlock(&irq_domain_mutex); - - for (i = 0; i < size; i++) { - int irq = first_irq + i; - int hwirq = first_hwirq + i; - - /* IRQ0 gets ignored */ - if (!irq) - continue; - - /* Legacy flags are left to default at this point, - * one can then use irq_create_mapping() to - * explicitly change them - */ - if (ops->map) - ops->map(domain, irq, hwirq); - - /* Clear norequest flags */ - irq_clear_status_flags(irq, IRQ_NOREQUEST); - } - - irq_domain_add(domain); return domain; } EXPORT_SYMBOL_GPL(irq_domain_add_legacy); @@ -492,10 +434,6 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, } return 0; - - err_unmap: - irq_domain_disassociate_many(domain, irq_base, i); - return -EINVAL; } EXPORT_SYMBOL_GPL(irq_domain_associate_many); @@ -575,10 +513,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return virq; } - /* Get a virtual interrupt number */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return irq_domain_legacy_revmap(domain, hwirq); - /* Allocate a virtual interrupt number */ hint = hwirq % nr_irqs; if (hint == 0) @@ -706,10 +640,6 @@ void irq_dispose_mapping(unsigned int virq) if (WARN_ON(domain == NULL)) return; - /* Never unmap legacy interrupts */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return; - irq_domain_disassociate_many(domain, virq, 1); irq_free_desc(virq); } @@ -732,8 +662,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, return 0; switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - return irq_domain_legacy_revmap(domain, hwirq); case IRQ_DOMAIN_MAP_LINEAR: return irq_linear_revmap(domain, hwirq); case IRQ_DOMAIN_MAP_TREE: -- cgit v1.2.3 From 0bb4afb45dd1add73ca643a865daa38716aeff0c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 14:23:30 +0100 Subject: irqdomain: Add a name field This patch adds a name field to the irq_domain structure to help mere mortals understand the mappings between irq domains and virqs. It also converts a number of places that have open-coded some kind of fudging an irqdomain name to use the new field. This means a more consistent display of names in irq domain log messages and debugfs output. Signed-off-by: Grant Likely --- include/linux/irqdomain.h | 1 + kernel/irq/generic-chip.c | 1 + kernel/irq/irqdomain.c | 19 ++++++------------- 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 6f062416e5bf..e5e513c2d104 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -89,6 +89,7 @@ struct irq_domain_chip_generic; */ struct irq_domain { struct list_head link; + const char *name; /* type of reverse mapping_technique */ unsigned int revmap_type; diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 95575d8d5392..ca98cc5d6308 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -305,6 +305,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, /* Calc pointer to the next generic chip */ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); } + d->name = name; return 0; } EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1ac8cf41b9a5..b1b5e6793fd2 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -410,12 +410,15 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, */ if (ret != -EPERM) { pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", - of_node_full_name(domain->of_node), hwirq, virq, ret); + domain->name, hwirq, virq, ret); } irq_data->domain = NULL; irq_data->hwirq = 0; continue; } + /* If not already assigned, give the domain the chip's name */ + if (!domain->name && irq_data->chip) + domain->name = irq_data->chip->name; } switch (domain->revmap_type) { @@ -708,8 +711,6 @@ static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; - const char *p; - static const char none[] = "none"; void *data; int i; @@ -731,20 +732,12 @@ static int virq_debug_show(struct seq_file *m, void *private) seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); chip = irq_desc_get_chip(desc); - if (chip && chip->name) - p = chip->name; - else - p = none; - seq_printf(m, "%-15s ", p); + seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); - if (desc->irq_data.domain) - p = of_node_full_name(desc->irq_data.domain->of_node); - else - p = none; - seq_printf(m, "%s\n", p); + seq_printf(m, "%s\n", desc->irq_data.domain->name); } raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3 From cef5075c8c238ffd04c86a77a5a9bdbd18031137 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 11 Jul 2012 17:24:31 +0100 Subject: irqdomain: merge linear and tree reverse mappings. Keeping them separate makes irq_domain more complex and adds a lot of code (as proven by the diffstat). Merging them simplifies the whole scheme. This change makes it so both the tree and linear methods can be used by the same irq_domain instance. If the hwirq is less than the ->linear_size, then the linear map is used to reverse map the hwirq. Otherwise the radix tree is used. The test for which map to use is no more expensive that the existing code, so the performance of fast path is preserved. It also means that complex interrupt controllers can use both the linear map and a tree in the same domain. This may be useful for an interrupt controller with a base set of core irqs and a large number of GPIOs which might be used as irqs. The linear map could cover the core irqs, and the tree used for thas irqs. The linear map could cover the core irqs, and the tree used for the gpios. v2: Drop reorganization of revmap data Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Rob Herring --- include/linux/irqdomain.h | 18 ++++---- kernel/irq/irqdomain.c | 107 +++++++++++++--------------------------------- 2 files changed, 39 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e5e513c2d104..1cbb7413c121 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -75,7 +75,6 @@ struct irq_domain_chip_generic; * @link: Element in global irq_domain list. * @revmap_type: Method used for reverse mapping hwirq numbers to linux irq. This * will be one of the IRQ_DOMAIN_MAP_* values. - * @revmap_data: Revmap method specific data. * @ops: pointer to irq_domain methods * @host_data: private data pointer for use by owner. Not touched by irq_domain * core code. @@ -93,10 +92,9 @@ struct irq_domain { /* type of reverse mapping_technique */ unsigned int revmap_type; - union { + struct { struct { unsigned int size; - unsigned int *revmap; } linear; struct { unsigned int max_irq; @@ -111,11 +109,13 @@ struct irq_domain { struct device_node *of_node; /* Optional pointer to generic interrupt chips */ struct irq_domain_chip_generic *gc; + + /* Linear reverse map */ + unsigned int linear_revmap[]; }; #define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ #define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ -#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ #ifdef CONFIG_IRQ_DOMAIN struct irq_domain *irq_domain_add_simple(struct device_node *of_node, @@ -137,10 +137,6 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, unsigned int max_irq, const struct irq_domain_ops *ops, void *host_data); -struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - const struct irq_domain_ops *ops, - void *host_data); - extern struct irq_domain *irq_find_host(struct device_node *node); extern void irq_set_default_host(struct irq_domain *host); @@ -152,6 +148,12 @@ static inline struct irq_domain *irq_domain_add_legacy_isa( return irq_domain_add_legacy(of_node, NUM_ISA_INTERRUPTS, 0, 0, ops, host_data); } +static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, + const struct irq_domain_ops *ops, + void *host_data) +{ + return irq_domain_add_linear(of_node, 0, ops, host_data); +} extern void irq_domain_remove(struct irq_domain *host); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b1b5e6793fd2..5a1d8ec8509e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -34,22 +34,24 @@ static struct irq_domain *irq_default_domain; * to IRQ domain, or NULL on failure. */ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - unsigned int revmap_type, + unsigned int revmap_type, int size, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; - domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, - of_node_to_nid(of_node)); + domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), + GFP_KERNEL, of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; /* Fill structure */ + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); domain->revmap_type = revmap_type; domain->ops = ops; domain->host_data = host_data; domain->of_node = of_node_get(of_node); + domain->revmap_data.linear.size = size; return domain; } @@ -81,22 +83,12 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_TREE: - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_data.tree.height); - break; - case IRQ_DOMAIN_MAP_LINEAR: - kfree(domain->revmap_data.linear.revmap); - domain->revmap_data.linear.size = 0; - break; - case IRQ_DOMAIN_MAP_NOMAP: - break; - } + /* + * radix_tree_delete() takes care of destroying the root + * node when all entries are removed. Shout if there are + * any mappings left. + */ + WARN_ON(domain->revmap_data.tree.height); list_del(&domain->link); @@ -223,20 +215,11 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, void *host_data) { struct irq_domain *domain; - unsigned int *revmap; - revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL, - of_node_to_nid(of_node)); - if (WARN_ON(!revmap)) + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, size, ops, host_data); + if (!domain) return NULL; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); - if (!domain) { - kfree(revmap); - return NULL; - } - domain->revmap_data.linear.size = size; - domain->revmap_data.linear.revmap = revmap; irq_domain_add(domain); return domain; } @@ -248,7 +231,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_NOMAP, ops, host_data); + IRQ_DOMAIN_MAP_NOMAP, 0, ops, host_data); if (domain) { domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; irq_domain_add(domain); @@ -257,28 +240,6 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, } EXPORT_SYMBOL_GPL(irq_domain_add_nomap); -/** - * irq_domain_add_tree() - * @of_node: pointer to interrupt controller's device tree node. - * @ops: map/unmap domain callbacks - * - * Note: The radix tree will be allocated later during boot automatically - * (the reverse mapping will use the slow path until that happens). - */ -struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_TREE, ops, host_data); - if (domain) { - INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - irq_domain_add(domain); - } - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_tree); - /** * irq_find_host() - Locates a domain for a given device node * @node: device-tree node of the interrupt controller @@ -359,17 +320,13 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, irq_data->domain = NULL; irq_data->hwirq = 0; - /* Clear reverse map */ - switch(domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = 0; - break; - case IRQ_DOMAIN_MAP_TREE: + /* Clear reverse map for this hwirq */ + if (hwirq < domain->revmap_data.linear.size) { + domain->linear_revmap[hwirq] = 0; + } else { mutex_lock(&revmap_trees_mutex); radix_tree_delete(&domain->revmap_data.tree, hwirq); mutex_unlock(&revmap_trees_mutex); - break; } } } @@ -421,16 +378,12 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, domain->name = irq_data->chip->name; } - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = virq; - break; - case IRQ_DOMAIN_MAP_TREE: + if (hwirq < domain->revmap_data.linear.size) { + domain->linear_revmap[hwirq] = virq; + } else { mutex_lock(&revmap_trees_mutex); radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); - break; } irq_clear_status_flags(virq, IRQ_NOREQUEST); @@ -667,13 +620,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, switch (domain->revmap_type) { case IRQ_DOMAIN_MAP_LINEAR: return irq_linear_revmap(domain, hwirq); - case IRQ_DOMAIN_MAP_TREE: - rcu_read_lock(); - data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); - rcu_read_unlock(); - if (data) - return data->irq; - break; case IRQ_DOMAIN_MAP_NOMAP: data = irq_get_irq_data(hwirq); if (data && (data->domain == domain) && (data->hwirq == hwirq)) @@ -696,13 +642,18 @@ EXPORT_SYMBOL_GPL(irq_find_mapping); unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { + struct irq_data *data; BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); /* Check revmap bounds; complain if exceeded */ - if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) - return 0; + if (hwirq >= domain->revmap_data.linear.size) { + rcu_read_lock(); + data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + rcu_read_unlock(); + return data ? data->irq : 0; + } - return domain->revmap_data.linear.revmap[hwirq]; + return domain->linear_revmap[hwirq]; } EXPORT_SYMBOL_GPL(irq_linear_revmap); -- cgit v1.2.3 From 1aa0dd94ca07df818cf14588c9031ab1d7fd84d3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 8 Jun 2013 12:03:59 +0100 Subject: irqdomain: Eliminate revmap type The NOMAP irq_domain type is only used by a handful of interrupt controllers and it unnecessarily complicates the code by adding special cases on how to look up mappings and different revmap functions are used for each type which need to validate the correct type is passed to it before performing the reverse map. Eliminating the revmap_type and making a single reverse mapping function simplifies the code. It also shouldn't be any slower than having separate revmap functions because the type of the revmap needed to be checked anyway. The linear and tree revmap types were already merged in a previous patch. This patch rolls the NOMAP or direct mapping behaviour into the same domain code making is possible for an irq domain to do any mapping type; linear, tree or direct; and that the mapping will be transparent to the interrupt controller driver. With this change, direct mappings will get stored in the linear or tree mapping for consistency. Reverse mapping from the hwirq to virq will go through the normal lookup process. However, any controller using a direct mapping can take advantage of knowing that hwirq==virq for any mapped interrupts skip doing a revmap lookup when handling IRQs. Signed-off-by: Grant Likely --- include/linux/irqdomain.h | 48 +++++++++++++++++------------------------ kernel/irq/generic-chip.c | 5 +---- kernel/irq/irqdomain.c | 55 +++++++++++++++++++---------------------------- 3 files changed, 43 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 1cbb7413c121..51ef84a3c990 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -73,50 +73,42 @@ struct irq_domain_chip_generic; /** * struct irq_domain - Hardware interrupt number translation object * @link: Element in global irq_domain list. - * @revmap_type: Method used for reverse mapping hwirq numbers to linux irq. This - * will be one of the IRQ_DOMAIN_MAP_* values. + * @name: Name of interrupt domain * @ops: pointer to irq_domain methods * @host_data: private data pointer for use by owner. Not touched by irq_domain * core code. - * @irq_base: Start of irq_desc range assigned to the irq_domain. The creator - * of the irq_domain is responsible for allocating the array of - * irq_desc structures. - * @nr_irq: Number of irqs managed by the irq domain - * @hwirq_base: Starting number for hwirqs managed by the irq domain - * @of_node: (optional) Pointer to device tree nodes associated with the - * irq_domain. Used when decoding device tree interrupt specifiers. + * + * Optional elements + * @of_node: Pointer to device tree nodes associated with the irq_domain. Used + * when decoding device tree interrupt specifiers. + * @gc: Pointer to a list of generic chips. There is a helper function for + * setting up one or more generic chips for interrupt controllers + * drivers using the generic chip library which uses this pointer. + * + * Revmap data, used internally by irq_domain + * @revmap_direct_max_irq: The largest hwirq that can be set for controllers that + * support direct mapping + * @revmap_size: Size of the linear map table @linear_revmap[] + * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map + * @linear_revmap: Linear table of hwirq->virq reverse mappings */ struct irq_domain { struct list_head link; const char *name; - - /* type of reverse mapping_technique */ - unsigned int revmap_type; - struct { - struct { - unsigned int size; - } linear; - struct { - unsigned int max_irq; - } nomap; - struct radix_tree_root tree; - } revmap_data; const struct irq_domain_ops *ops; void *host_data; - irq_hw_number_t inval_irq; - /* Optional device node pointer */ + /* Optional data */ struct device_node *of_node; - /* Optional pointer to generic interrupt chips */ struct irq_domain_chip_generic *gc; - /* Linear reverse map */ + /* reverse map data. The linear map gets appended to the irq_domain */ + unsigned int revmap_direct_max_irq; + unsigned int revmap_size; + struct radix_tree_root revmap_tree; unsigned int linear_revmap[]; }; -#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ -#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ - #ifdef CONFIG_IRQ_DOMAIN struct irq_domain *irq_domain_add_simple(struct device_node *of_node, unsigned int size, diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index ca98cc5d6308..4b011064e146 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -270,10 +270,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, if (d->gc) return -EBUSY; - if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) - return -EINVAL; - - numchips = d->revmap_data.linear.size / irqs_per_chip; + numchips = d->revmap_size / irqs_per_chip; if (!numchips) return -EINVAL; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a1d8ec8509e..c38be78fceb4 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -25,7 +25,6 @@ static struct irq_domain *irq_default_domain; /** * irq_domain_alloc() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller - * @revmap_type: type of reverse mapping to use * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -34,7 +33,7 @@ static struct irq_domain *irq_default_domain; * to IRQ domain, or NULL on failure. */ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - unsigned int revmap_type, int size, + int size, const struct irq_domain_ops *ops, void *host_data) { @@ -46,12 +45,11 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, return NULL; /* Fill structure */ - INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - domain->revmap_type = revmap_type; + INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = ops; domain->host_data = host_data; domain->of_node = of_node_get(of_node); - domain->revmap_data.linear.size = size; + domain->revmap_size = size; return domain; } @@ -67,8 +65,7 @@ static void irq_domain_add(struct irq_domain *domain) mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); - pr_debug("Allocated domain of type %d @0x%p\n", - domain->revmap_type, domain); + pr_debug("Added domain %s\n", domain->name); } /** @@ -88,7 +85,7 @@ void irq_domain_remove(struct irq_domain *domain) * node when all entries are removed. Shout if there are * any mappings left. */ - WARN_ON(domain->revmap_data.tree.height); + WARN_ON(domain->revmap_tree.height); list_del(&domain->link); @@ -100,8 +97,7 @@ void irq_domain_remove(struct irq_domain *domain) mutex_unlock(&irq_domain_mutex); - pr_debug("Removed domain of type %d @0x%p\n", - domain->revmap_type, domain); + pr_debug("Removed domain %s\n", domain->name); irq_domain_free(domain); } @@ -216,7 +212,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, { struct irq_domain *domain; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, size, ops, host_data); + domain = irq_domain_alloc(of_node, size, ops, host_data); if (!domain) return NULL; @@ -230,10 +226,9 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_NOMAP, 0, ops, host_data); + struct irq_domain *domain = irq_domain_alloc(of_node, 0, ops, host_data); if (domain) { - domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; + domain->revmap_direct_max_irq = max_irq ? max_irq : ~0; irq_domain_add(domain); } return domain; @@ -321,11 +316,11 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, irq_data->hwirq = 0; /* Clear reverse map for this hwirq */ - if (hwirq < domain->revmap_data.linear.size) { + if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = 0; } else { mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&domain->revmap_data.tree, hwirq); + radix_tree_delete(&domain->revmap_tree, hwirq); mutex_unlock(&revmap_trees_mutex); } } @@ -378,11 +373,11 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, domain->name = irq_data->chip->name; } - if (hwirq < domain->revmap_data.linear.size) { + if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = virq; } else { mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); + radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); } @@ -399,7 +394,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many); * * This routine is used for irq controllers which can choose the hardware * interrupt numbers they generate. In such a case it's simplest to use - * the linux irq as the hardware interrupt number. + * the linux irq as the hardware interrupt number. It still uses the linear + * or radix tree to store the mapping, but the irq controller can optimize + * the revmap path by using the hwirq directly. */ unsigned int irq_create_direct_mapping(struct irq_domain *domain) { @@ -408,17 +405,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) if (domain == NULL) domain = irq_default_domain; - if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP)) - return 0; - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; } - if (virq >= domain->revmap_data.nomap.max_irq) { + if (virq >= domain->revmap_direct_max_irq) { pr_err("ERROR: no free irqs available below %i maximum\n", - domain->revmap_data.nomap.max_irq); + domain->revmap_direct_max_irq); irq_free_desc(virq); return 0; } @@ -617,17 +611,13 @@ unsigned int irq_find_mapping(struct irq_domain *domain, if (domain == NULL) return 0; - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - return irq_linear_revmap(domain, hwirq); - case IRQ_DOMAIN_MAP_NOMAP: + if (hwirq < domain->revmap_direct_max_irq) { data = irq_get_irq_data(hwirq); if (data && (data->domain == domain) && (data->hwirq == hwirq)) return hwirq; - break; } - return 0; + return irq_linear_revmap(domain, hwirq); } EXPORT_SYMBOL_GPL(irq_find_mapping); @@ -643,12 +633,11 @@ unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { struct irq_data *data; - BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); /* Check revmap bounds; complain if exceeded */ - if (hwirq >= domain->revmap_data.linear.size) { + if (hwirq >= domain->revmap_size) { rcu_read_lock(); - data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + data = radix_tree_lookup(&domain->revmap_tree, hwirq); rcu_read_unlock(); return data ? data->irq : 0; } -- cgit v1.2.3 From fa40f377577752b83252b9d2b3165d4acee0eb7c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 8 Jun 2013 12:57:40 +0100 Subject: irqdomain: Clean up aftermath of irq_domain refactoring After refactoring the irqdomain code, there are a number of API functions that are merely empty wrappers around core code. Drop those wrappers out of the C file and replace them with static inlines in the header. Signed-off-by: Grant Likely --- arch/powerpc/platforms/cell/beat_interrupt.c | 2 +- arch/powerpc/platforms/powermac/smp.c | 2 +- include/linux/irqdomain.h | 31 +++++-- kernel/irq/irqdomain.c | 127 ++++++++------------------- 4 files changed, 62 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/cell/beat_interrupt.c b/arch/powerpc/platforms/cell/beat_interrupt.c index 8c6dc42ecf65..9e5dfbcc00af 100644 --- a/arch/powerpc/platforms/cell/beat_interrupt.c +++ b/arch/powerpc/platforms/cell/beat_interrupt.c @@ -239,7 +239,7 @@ void __init beatic_init_IRQ(void) ppc_md.get_irq = beatic_get_irq; /* Allocate an irq host */ - beatic_host = irq_domain_add_nomap(NULL, 0, &beatic_pic_host_ops, NULL); + beatic_host = irq_domain_add_nomap(NULL, ~0, &beatic_pic_host_ops, NULL); BUG_ON(beatic_host == NULL); irq_set_default_host(beatic_host); } diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index bdb738a69e41..f921067d924d 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -192,7 +192,7 @@ static int psurge_secondary_ipi_init(void) { int rc = -ENOMEM; - psurge_host = irq_domain_add_nomap(NULL, 0, &psurge_host_ops, NULL); + psurge_host = irq_domain_add_nomap(NULL, ~0, &psurge_host_ops, NULL); if (psurge_host) psurge_secondary_virq = irq_create_direct_mapping(psurge_host); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 51ef84a3c990..fd4b26f8f44c 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -110,6 +110,10 @@ struct irq_domain { }; #ifdef CONFIG_IRQ_DOMAIN +struct irq_domain *__irq_domain_add(struct device_node *of_node, + int size, int direct_max, + const struct irq_domain_ops *ops, + void *host_data); struct irq_domain *irq_domain_add_simple(struct device_node *of_node, unsigned int size, unsigned int first_irq, @@ -121,17 +125,30 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, irq_hw_number_t first_hwirq, const struct irq_domain_ops *ops, void *host_data); -struct irq_domain *irq_domain_add_linear(struct device_node *of_node, +extern struct irq_domain *irq_find_host(struct device_node *node); +extern void irq_set_default_host(struct irq_domain *host); + +/** + * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @size: Number of interrupts in the domain. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + */ +static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_node, unsigned int size, const struct irq_domain_ops *ops, - void *host_data); -struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + void *host_data) +{ + return __irq_domain_add(of_node, size, 0, ops, host_data); +} +static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, unsigned int max_irq, const struct irq_domain_ops *ops, - void *host_data); -extern struct irq_domain *irq_find_host(struct device_node *node); -extern void irq_set_default_host(struct irq_domain *host); - + void *host_data) +{ + return __irq_domain_add(of_node, 0, max_irq, ops, host_data); +} static inline struct irq_domain *irq_domain_add_legacy_isa( struct device_node *of_node, const struct irq_domain_ops *ops, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c38be78fceb4..e0db59e2eef6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; /** - * irq_domain_alloc() - Allocate a new irq_domain data structure + * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller + * @size: Size of linear map; 0 for radix mapping only + * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no + * direct mapping * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -32,10 +35,10 @@ static struct irq_domain *irq_default_domain; * register allocated irq_domain with irq_domain_register(). Returns pointer * to IRQ domain, or NULL on failure. */ -static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - int size, - const struct irq_domain_ops *ops, - void *host_data) +struct irq_domain *__irq_domain_add(struct device_node *of_node, + int size, int direct_max, + const struct irq_domain_ops *ops, + void *host_data) { struct irq_domain *domain; @@ -50,23 +53,16 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, domain->host_data = host_data; domain->of_node = of_node_get(of_node); domain->revmap_size = size; + domain->revmap_direct_max_irq = direct_max; - return domain; -} - -static void irq_domain_free(struct irq_domain *domain) -{ - of_node_put(domain->of_node); - kfree(domain); -} - -static void irq_domain_add(struct irq_domain *domain) -{ mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); + pr_debug("Added domain %s\n", domain->name); + return domain; } +EXPORT_SYMBOL_GPL(__irq_domain_add); /** * irq_domain_remove() - Remove an irq domain. @@ -99,30 +95,28 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - irq_domain_free(domain); + of_node_put(domain->of_node); + kfree(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); /** - * irq_domain_add_simple() - Allocate and register a simple irq_domain. + * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs * @of_node: pointer to interrupt controller's device tree node. * @size: total number of irqs in mapping * @first_irq: first number of irq block assigned to the domain, - * pass zero to assign irqs on-the-fly. This will result in a - * linear IRQ domain so it is important to use irq_create_mapping() - * for each used IRQ, especially when SPARSE_IRQ is enabled. + * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then + * pre-map all of the irqs in the domain to virqs starting at first_irq. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * - * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. For the legacy domain, IRQ descriptors will also - * be allocated. + * Allocates an irq_domain, and optionally if first_irq is positive then also + * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq. * * This is intended to implement the expected behaviour for most - * interrupt controllers which is that a linear mapping should - * normally be used unless the system requires a legacy mapping in - * order to support supplying interrupt numbers during non-DT - * registration of devices. + * interrupt controllers. If device tree is used, then first_irq will be 0 and + * irqs get mapped dynamically on the fly. However, if the controller requires + * static virq assignments (non-DT boot) then it will set that up correctly. */ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, unsigned int size, @@ -130,33 +124,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - if (first_irq > 0) { - int irq_base; + struct irq_domain *domain; + + domain = __irq_domain_add(of_node, size, 0, ops, host_data); + if (!domain) + return NULL; + if (first_irq > 0) { if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { - /* - * Set the descriptor allocator to search for a - * 1-to-1 mapping, such as irq_alloc_desc_at(). - * Use of_node_to_nid() which is defined to - * numa_node_id() on platforms that have no custom - * implementation. - */ - irq_base = irq_alloc_descs(first_irq, first_irq, size, - of_node_to_nid(of_node)); - if (irq_base < 0) { + /* attempt to allocated irq_descs */ + int rc = irq_alloc_descs(first_irq, first_irq, size, + of_node_to_nid(of_node)); + if (rc < 0) pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", first_irq); - irq_base = first_irq; - } - } else - irq_base = first_irq; - - return irq_domain_add_legacy(of_node, size, irq_base, 0, - ops, host_data); + } + WARN_ON(irq_domain_associate_many(domain, first_irq, 0, size)); } - /* A linear domain is the default */ - return irq_domain_add_linear(of_node, size, ops, host_data); + return domain; } EXPORT_SYMBOL_GPL(irq_domain_add_simple); @@ -184,11 +170,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, { struct irq_domain *domain; - pr_debug("Setting up legacy domain virq[%i:%i] ==> hwirq[%i:%i]\n", - first_irq, first_irq + size - 1, - (int)first_hwirq, (int)first_hwirq + size -1); - - domain = irq_domain_add_linear(of_node, first_hwirq + size, ops, host_data); + domain = __irq_domain_add(of_node, first_hwirq + size, 0, ops, host_data); if (!domain) return NULL; @@ -198,43 +180,6 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, } EXPORT_SYMBOL_GPL(irq_domain_add_legacy); -/** - * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. - * @of_node: pointer to interrupt controller's device tree node. - * @size: Number of interrupts in the domain. - * @ops: map/unmap domain callbacks - * @host_data: Controller private data pointer - */ -struct irq_domain *irq_domain_add_linear(struct device_node *of_node, - unsigned int size, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain; - - domain = irq_domain_alloc(of_node, size, ops, host_data); - if (!domain) - return NULL; - - irq_domain_add(domain); - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_linear); - -struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, - unsigned int max_irq, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain = irq_domain_alloc(of_node, 0, ops, host_data); - if (domain) { - domain->revmap_direct_max_irq = max_irq ? max_irq : ~0; - irq_domain_add(domain); - } - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_nomap); - /** * irq_find_host() - Locates a domain for a given device node * @node: device-tree node of the interrupt controller -- cgit v1.2.3 From 1400ea86025a22862f97e7fe544433751b43ecec Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 22:20:44 +0100 Subject: irqdomain: Beef up debugfs output This patch increases the amount of output produced by the irq_domain_mapping debugfs file by first listing all of the registered irq domains at the beginning of the output, and then by including all mapped IRQs in the output, not just the active ones. It is very useful when debugging irqdomain issues to be able to see the entire list of mapped irqs, not just the ones that happen to be connected to devices. Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e0db59e2eef6..280b8047d8db 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -596,12 +596,29 @@ static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; - void *data; + struct irq_domain *domain; + struct radix_tree_iter iter; + void *data, **slot; int i; - seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", + seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", + "name", "mapped", "linear-max", "direct-max", "devtree-node"); + mutex_lock(&irq_domain_mutex); + list_for_each_entry(domain, &irq_domain_list, link) { + int count = 0; + radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) + count++; + seq_printf(m, "%c%-16s %6u %10u %10u %s\n", + domain == irq_default_domain ? '*' : ' ', domain->name, + domain->revmap_size + count, domain->revmap_size, + domain->revmap_direct_max_irq, + domain->of_node ? of_node_full_name(domain->of_node) : ""); + } + mutex_unlock(&irq_domain_mutex); + + seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq", "chip name", (int)(2 * sizeof(void *) + 2), "chip data", - "domain name"); + "active", "type", "domain"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); @@ -609,12 +626,15 @@ static int virq_debug_show(struct seq_file *m, void *private) continue; raw_spin_lock_irqsave(&desc->lock, flags); + domain = desc->irq_data.domain; - if (desc->action && desc->action->handler) { + if (domain) { struct irq_chip *chip; + int hwirq = desc->irq_data.hwirq; + bool direct; seq_printf(m, "%5d ", i); - seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); + seq_printf(m, "0x%05x ", hwirq); chip = irq_desc_get_chip(desc); seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); @@ -622,6 +642,11 @@ static int virq_debug_show(struct seq_file *m, void *private) data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); + seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); + direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); + seq_printf(m, "%6s%-8s ", + (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", + direct ? "(DIRECT)" : ""); seq_printf(m, "%s\n", desc->irq_data.domain->name); } -- cgit v1.2.3 From 9350de06be45a5a8b927ac6577c9d35de61c90ca Mon Sep 17 00:00:00 2001 From: Bernie Thompson Date: Sat, 1 Jun 2013 00:47:43 +0000 Subject: PM / wakeup: Adjust messaging for wake events during suspend This adds in a new message to the wakeup code which adds an indication to the log that suspend was cancelled due to a wake event occouring during the suspend sequence. It also adjusts the message printed in suspend.c to reflect the potential that a suspend was aborted, as opposed to a device failing to suspend. Without these message adjustments one can end up with a kernel log that says that a device failed to suspend with no actual device suspend failures, which can be confusing to the log examiner. Signed-off-by: Bernie Thompson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 4 +++- kernel/power/suspend.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 79715e7fa43e..407a2efa10bb 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -707,8 +707,10 @@ bool pm_wakeup_pending(void) } spin_unlock_irqrestore(&events_lock, flags); - if (ret) + if (ret) { + pr_info("PM: Wakeup pending, aborting suspend\n"); print_active_wakeup_sources(); + } return ret; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index bef86d121eb2..ece04223bb1e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -269,7 +269,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); + pr_err("PM: Some devices failed to suspend, or early wake event detected\n"); goto Recover_platform; } suspend_test_finish("suspend devices"); -- cgit v1.2.3 From ad71d889b88055e61e3970a6744a271a51a94f42 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 30 Apr 2013 15:46:14 -0400 Subject: tracing: Add function probe to trigger a ftrace dump to console Add the "dump" command to have the ftrace buffer dumped to console if a function is hit. This is useful when debugging a tripple fault, where you have an idea of a function that is called just before the tripple fault occurs, and can tell ftrace to dump its content out to the console before it continues. Format is: :dump echo 'bad_address:dump' > /debug/tracing/set_ftrace_filter To remove this: echo '!bad_address:dump' > /debug/tracing/set_ftrace_filter Requested-by: Luis Claudio R. Goncalves Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 7 +++++ kernel/trace/trace_functions.c | 59 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 61 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index bfe8c29b1f1d..cc9ec57e157c 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -2430,6 +2430,13 @@ The following commands are supported: echo '!schedule:disable_event:sched:sched_switch' > \ set_ftrace_filter +- dump + When the function is hit, it will dump the contents of the ftrace + ring buffer to the console. This is useful if you need to debug + something, and want to dump the trace when a certain function + is hit. Perhaps its a function that is called before a tripple + fault happens and does not allow you to get a regular dump. + trace_pipe ---------- diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c4d6d7191988..d7c8719734b8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -290,6 +290,13 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) trace_dump_stack(STACK_SKIP); } +static void +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ALL); +} + static int ftrace_probe_print(const char *name, struct seq_file *m, unsigned long ip, void *data) @@ -327,6 +334,13 @@ ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, return ftrace_probe_print("stacktrace", m, ip, data); } +static int +ftrace_dump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("dump", m, ip, data); +} + static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, @@ -342,6 +356,11 @@ static struct ftrace_probe_ops stacktrace_count_probe_ops = { .print = ftrace_stacktrace_print, }; +static struct ftrace_probe_ops dump_probe_ops = { + .func = ftrace_dump_probe, + .print = ftrace_dump_print, +}; + static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_traceon_print, @@ -425,6 +444,19 @@ ftrace_stacktrace_callback(struct ftrace_hash *hash, param, enable); } +static int +ftrace_dump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &dump_probe_ops; + + /* Only dump once. */ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -440,6 +472,11 @@ static struct ftrace_func_command ftrace_stacktrace_cmd = { .func = ftrace_stacktrace_callback, }; +static struct ftrace_func_command ftrace_dump_cmd = { + .name = "dump", + .func = ftrace_dump_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -450,13 +487,25 @@ static int __init init_func_cmd_traceon(void) ret = register_ftrace_command(&ftrace_traceon_cmd); if (ret) - unregister_ftrace_command(&ftrace_traceoff_cmd); + goto out_free_traceoff; ret = register_ftrace_command(&ftrace_stacktrace_cmd); - if (ret) { - unregister_ftrace_command(&ftrace_traceoff_cmd); - unregister_ftrace_command(&ftrace_traceon_cmd); - } + if (ret) + goto out_free_traceon; + + ret = register_ftrace_command(&ftrace_dump_cmd); + if (ret) + goto out_free_stacktrace; + + return 0; + + out_free_stacktrace: + unregister_ftrace_command(&ftrace_stacktrace_cmd); + out_free_traceon: + unregister_ftrace_command(&ftrace_traceon_cmd); + out_free_traceoff: + unregister_ftrace_command(&ftrace_traceoff_cmd); + return ret; } #else -- cgit v1.2.3 From 90e3c03c3a09a7b176b3fe59d78f5d9755ac8e37 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 30 Apr 2013 19:00:46 -0400 Subject: tracing: Add function probe to trigger a ftrace dump of current CPU trace Add the "cpudump" command to have the current CPU ftrace buffer dumped to console if a function is hit. This is useful when debugging a tripple fault, where you have an idea of a function that is called just before the tripple fault occurs, and can tell ftrace to dump its content out to the console before it continues. This differs from the "dump" command as it only dumps the content of the ring buffer for the currently executing CPU, and does not show the contents of the other CPUs. Format is: :cpudump echo 'bad_address:cpudump' > /debug/tracing/set_ftrace_filter To remove this: echo '!bad_address:cpudump' > /debug/tracing/set_ftrace_filter Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 6 ++++++ kernel/trace/trace_functions.c | 44 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index cc9ec57e157c..b937c6e2163c 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -2437,6 +2437,12 @@ The following commands are supported: is hit. Perhaps its a function that is called before a tripple fault happens and does not allow you to get a regular dump. +- cpudump + When the function is hit, it will dump the contents of the ftrace + ring buffer for the current CPU to the console. Unlike the "dump" + command, it only prints out the contents of the ring buffer for the + CPU that executed the function that triggered the dump. + trace_pipe ---------- diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d7c8719734b8..b863f93b30f3 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -297,6 +297,14 @@ ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) ftrace_dump(DUMP_ALL); } +/* Only dump the current CPU buffer. */ +static void +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ORIG); +} + static int ftrace_probe_print(const char *name, struct seq_file *m, unsigned long ip, void *data) @@ -341,6 +349,13 @@ ftrace_dump_print(struct seq_file *m, unsigned long ip, return ftrace_probe_print("dump", m, ip, data); } +static int +ftrace_cpudump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("cpudump", m, ip, data); +} + static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, @@ -361,6 +376,11 @@ static struct ftrace_probe_ops dump_probe_ops = { .print = ftrace_dump_print, }; +static struct ftrace_probe_ops cpudump_probe_ops = { + .func = ftrace_cpudump_probe, + .print = ftrace_cpudump_print, +}; + static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_traceon_print, @@ -457,6 +477,19 @@ ftrace_dump_callback(struct ftrace_hash *hash, "1", enable); } +static int +ftrace_cpudump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &cpudump_probe_ops; + + /* Only dump once. */ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -477,6 +510,11 @@ static struct ftrace_func_command ftrace_dump_cmd = { .func = ftrace_dump_callback, }; +static struct ftrace_func_command ftrace_cpudump_cmd = { + .name = "cpudump", + .func = ftrace_cpudump_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -497,8 +535,14 @@ static int __init init_func_cmd_traceon(void) if (ret) goto out_free_stacktrace; + ret = register_ftrace_command(&ftrace_cpudump_cmd); + if (ret) + goto out_free_dump; + return 0; + out_free_dump: + unregister_ftrace_command(&ftrace_dump_cmd); out_free_stacktrace: unregister_ftrace_command(&ftrace_stacktrace_cmd); out_free_traceon: -- cgit v1.2.3 From 8092e808a31839c502a52d391b15f31c1d8764f5 Mon Sep 17 00:00:00 2001 From: Harsh Prateek Bora Date: Fri, 24 May 2013 12:52:17 +0530 Subject: tracing/trivial: Consolidate error return condition Consolidate the checks for !enabled and !param to return -EINVAL in event_enable_func(). Link: http://lkml.kernel.org/r/1369380137-12452-1-git-send-email-harsh@linux.vnet.ibm.com Signed-off-by: Harsh Prateek Bora Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 27963e2bf4bf..db086f172cf5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2011,10 +2011,7 @@ event_enable_func(struct ftrace_hash *hash, int ret; /* hash funcs only work with set_ftrace_filter */ - if (!enabled) - return -EINVAL; - - if (!param) + if (!enabled || !param) return -EINVAL; system = strsep(¶m, ":"); -- cgit v1.2.3 From 238ae93d699d59876b470bf6455de22bcfaa9a1b Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Sun, 26 May 2013 16:52:01 +0800 Subject: tracing: Fix file mode of free_buffer Commit 4f271a2a60c748599b30bb4dafff30d770439b96 (tracing: Add a proc file to stop tracing and free buffer) implement a method to free up ring buffer in kernel memory in the release code path of free_buffer's fd. Then we don't need read/write support for free_buffer, indeed we just have a dummy write fop, and don't implement read fop. So the 0200 is more reasonable file mode for free_buffer than the current file mode 0644. Link: http://lkml.kernel.org/r/20130526085201.GA3183@udknight Acked-by: Vaibhav Nagarnaik Acked-by: David Sharp Signed-off-by: Wang YanQing Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a41023a1f88..5f4a09c12e0b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5935,7 +5935,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_total_size_kb", 0444, d_tracer, tr, &tracing_total_entries_fops); - trace_create_file("free_buffer", 0644, d_tracer, + trace_create_file("free_buffer", 0200, d_tracer, tr, &tracing_free_buffer_fops); trace_create_file("trace_marker", 0220, d_tracer, -- cgit v1.2.3 From 7614c3dc74733dff4b0e774f7a894b9ea6ec508c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 20:01:16 -0400 Subject: ftrace: Use schedule_on_each_cpu() as a heavy synchronize_sched() The function tracer uses preempt_disable/enable_notrace() for synchronization between reading registered ftrace_ops and unregistering them. Most of the ftrace_ops are global permanent structures that do not require this synchronization. That is, ops may be added and removed from the hlist but are never freed, and wont hurt if a synchronization is missed. But this is not true for dynamically created ftrace_ops or control_ops, which are used by the perf function tracing. The problem here is that the function tracer can be used to trace kernel/user context switches as well as going to and from idle. Basically, it can be used to trace blind spots of the RCU subsystem. This means that even though preempt_disable() is done, a synchronize_sched() will ignore CPUs that haven't made it out of user space or idle. These can include functions that are being traced just before entering or exiting the kernel sections. To implement the RCU synchronization, instead of using synchronize_sched() the use of schedule_on_each_cpu() is performed. This means that when a dynamically allocated ftrace_ops, or a control ops is being unregistered, all CPUs must be touched and execute a ftrace_sync() stub function via the work queues. This will rip CPUs out from idle or in dynamic tick mode. This only happens when a user disables perf function tracing or other dynamically allocated function tracers, but it allows us to continue to debug RCU and context tracking with function tracing. Link: http://lkml.kernel.org/r/1369785676.15552.55.camel@gandalf.local.home Cc: "Paul E. McKenney" Cc: Tejun Heo Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c508ff33c62..800a8a2fbddb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops) return 0; } +static void ftrace_sync(struct work_struct *work) +{ + /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. + */ +} + static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; @@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) * so there'll be no new users. We must ensure * all current users are done before we free * the control data. + * Note synchronize_sched() is not enough, as we + * use preempt_disable() to do RCU, but the function + * tracer can be called where RCU is not active + * (before user_exit()). */ - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); control_ops_free(ops); } } else @@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. */ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); + return 0; } -- cgit v1.2.3 From aaf6ac0f0871cb7fc0f28f3a00edf329bc7adc29 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 7 Jun 2013 15:07:48 +0900 Subject: tracing: Do not call kmem_cache_free() on allocation failure There's no point calling it when _alloc() failed. Link: http://lkml.kernel.org/r/1370585268-29169-1-git-send-email-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db086f172cf5..f57b01574a30 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -97,7 +97,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) - goto err; + return -ENOMEM; field->name = name; field->type = type; @@ -114,11 +114,6 @@ static int __trace_define_field(struct list_head *head, const char *type, list_add(&field->link, head); return 0; - -err: - kmem_cache_free(field_cachep, field); - - return -ENOMEM; } int trace_define_field(struct ftrace_event_call *call, const char *type, -- cgit v1.2.3 From 11682a41618f8094cb7a9330b4b6a12ffaef5774 Mon Sep 17 00:00:00 2001 From: Marcus Gelderie Date: Tue, 4 Jun 2013 09:32:09 +0200 Subject: alarmtimer: Export symbols of functions declared in linux/alarmtimer.h Export symbols so they can be used by drivers/staging/android/alarm-dev.c if it is built as a module. So far alarm-dev is built-in but module support is planned (see drivers/staging/android/TODO). Signed-off-by: Marcus Gelderie [jstultz: tweaked commit message, also export newly added functions] Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 3e5cba274475..eec50fcef9e4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -204,6 +204,7 @@ ktime_t alarm_expires_remaining(const struct alarm *alarm) struct alarm_base *base = &alarm_bases[alarm->type]; return ktime_sub(alarm->node.expires, base->gettime()); } +EXPORT_SYMBOL_GPL(alarm_expires_remaining); #ifdef CONFIG_RTC_CLASS /** @@ -309,6 +310,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; } +EXPORT_SYMBOL_GPL(alarm_init); /** * alarm_start - Sets an absolute alarm to fire @@ -329,6 +331,7 @@ int alarm_start(struct alarm *alarm, ktime_t start) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_start); /** * alarm_start_relative - Sets a relative alarm to fire @@ -342,6 +345,7 @@ int alarm_start_relative(struct alarm *alarm, ktime_t start) start = ktime_add(start, base->gettime()); return alarm_start(alarm, start); } +EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { @@ -354,6 +358,7 @@ void alarm_restart(struct alarm *alarm) alarmtimer_enqueue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); } +EXPORT_SYMBOL_GPL(alarm_restart); /** * alarm_try_to_cancel - Tries to cancel an alarm timer @@ -375,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_try_to_cancel); /** @@ -392,6 +398,7 @@ int alarm_cancel(struct alarm *alarm) cpu_relax(); } } +EXPORT_SYMBOL_GPL(alarm_cancel); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) @@ -424,6 +431,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) alarm->node.expires = ktime_add(alarm->node.expires, interval); return overrun; } +EXPORT_SYMBOL_GPL(alarm_forward); u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { @@ -431,7 +439,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) return alarm_forward(alarm, base->gettime(), interval); } - +EXPORT_SYMBOL_GPL(alarm_forward_now); /** -- cgit v1.2.3 From 38ff87f77af0b5a93fc8581cff1d6e5692ab8970 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sat, 1 Jun 2013 23:39:40 -0700 Subject: sched_clock: Make ARM's sched_clock generic for all architectures Nothing about the sched_clock implementation in the ARM port is specific to the architecture. Generalize the code so that other architectures can use it by selecting GENERIC_SCHED_CLOCK. Signed-off-by: Stephen Boyd [jstultz: Merge minor collisions with other patches in my tree] Signed-off-by: John Stultz --- arch/arm/Kconfig | 1 + arch/arm/common/timer-sp.c | 2 +- arch/arm/include/asm/sched_clock.h | 16 --- arch/arm/kernel/Makefile | 2 +- arch/arm/kernel/arch_timer.c | 2 +- arch/arm/kernel/sched_clock.c | 216 ------------------------------ arch/arm/kernel/time.c | 4 +- arch/arm/mach-davinci/time.c | 2 +- arch/arm/mach-imx/time.c | 2 +- arch/arm/mach-integrator/integrator_ap.c | 2 +- arch/arm/mach-ixp4xx/common.c | 2 +- arch/arm/mach-mmp/time.c | 2 +- arch/arm/mach-msm/timer.c | 2 +- arch/arm/mach-omap1/time.c | 2 +- arch/arm/mach-omap2/timer.c | 2 +- arch/arm/mach-pxa/time.c | 2 +- arch/arm/mach-sa1100/time.c | 2 +- arch/arm/mach-u300/timer.c | 2 +- arch/arm/plat-iop/time.c | 2 +- arch/arm/plat-omap/counter_32k.c | 2 +- arch/arm/plat-orion/time.c | 2 +- arch/arm/plat-samsung/samsung-time.c | 2 +- arch/arm/plat-versatile/sched-clock.c | 2 +- drivers/clocksource/bcm2835_timer.c | 2 +- drivers/clocksource/clksrc-dbx500-prcmu.c | 3 +- drivers/clocksource/dw_apb_timer_of.c | 3 +- drivers/clocksource/mxs_timer.c | 2 +- drivers/clocksource/nomadik-mtu.c | 2 +- drivers/clocksource/samsung_pwm_timer.c | 2 +- drivers/clocksource/tegra20_timer.c | 2 +- drivers/clocksource/time-armada-370-xp.c | 2 +- drivers/clocksource/timer-marco.c | 2 +- drivers/clocksource/timer-prima2.c | 2 +- include/linux/sched_clock.h | 21 +++ init/Kconfig | 3 + init/main.c | 2 + kernel/time/Makefile | 1 + kernel/time/sched_clock.c | 215 +++++++++++++++++++++++++++++ 38 files changed, 273 insertions(+), 266 deletions(-) delete mode 100644 arch/arm/include/asm/sched_clock.h delete mode 100644 arch/arm/kernel/sched_clock.c create mode 100644 include/linux/sched_clock.h create mode 100644 kernel/time/sched_clock.c (limited to 'kernel') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 49d993cee512..53d3a356f61f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -14,6 +14,7 @@ config ARM select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP + select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_IDLE_POLL_SETUP select GENERIC_STRNCPY_FROM_USER diff --git a/arch/arm/common/timer-sp.c b/arch/arm/common/timer-sp.c index ddc740769601..023ee63827a2 100644 --- a/arch/arm/common/timer-sp.c +++ b/arch/arm/common/timer-sp.c @@ -28,8 +28,8 @@ #include #include #include +#include -#include #include #include diff --git a/arch/arm/include/asm/sched_clock.h b/arch/arm/include/asm/sched_clock.h deleted file mode 100644 index 3d520ddca61b..000000000000 --- a/arch/arm/include/asm/sched_clock.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * sched_clock.h: support for extending counters to full 64-bit ns counter - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef ASM_SCHED_CLOCK -#define ASM_SCHED_CLOCK - -extern void sched_clock_postinit(void); -extern void setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate); - -extern unsigned long long (*sched_clock_func)(void); - -#endif diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 5f3338eacad2..97cb0576d07c 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -16,7 +16,7 @@ CFLAGS_REMOVE_return_address.o = -pg # Object file lists. obj-y := elf.o entry-armv.o entry-common.o irq.o opcodes.o \ - process.o ptrace.o return_address.o sched_clock.o \ + process.o ptrace.o return_address.o \ setup.o signal.o stacktrace.o sys_arm.o time.o traps.o obj-$(CONFIG_ATAGS) += atags_parse.o diff --git a/arch/arm/kernel/arch_timer.c b/arch/arm/kernel/arch_timer.c index 59dcdced6e30..221f07b11ccb 100644 --- a/arch/arm/kernel/arch_timer.c +++ b/arch/arm/kernel/arch_timer.c @@ -11,9 +11,9 @@ #include #include #include +#include #include -#include #include diff --git a/arch/arm/kernel/sched_clock.c b/arch/arm/kernel/sched_clock.c deleted file mode 100644 index a781c59b93c0..000000000000 --- a/arch/arm/kernel/sched_clock.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * sched_clock.c: support for extending counters to full 64-bit ns counter - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -struct clock_data { - u64 epoch_ns; - u32 epoch_cyc; - u32 epoch_cyc_copy; - unsigned long rate; - u32 mult; - u32 shift; - bool suspended; -}; - -static void sched_clock_poll(unsigned long wrap_ticks); -static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); -static int irqtime = -1; - -core_param(irqtime, irqtime, int, 0400); - -static struct clock_data cd = { - .mult = NSEC_PER_SEC / HZ, -}; - -static u32 __read_mostly sched_clock_mask = 0xffffffff; - -static u32 notrace jiffy_sched_clock_read(void) -{ - return (u32)(jiffies - INITIAL_JIFFIES); -} - -static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; - -static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) -{ - return (cyc * mult) >> shift; -} - -static unsigned long long notrace cyc_to_sched_clock(u32 cyc, u32 mask) -{ - u64 epoch_ns; - u32 epoch_cyc; - - /* - * Load the epoch_cyc and epoch_ns atomically. We do this by - * ensuring that we always write epoch_cyc, epoch_ns and - * epoch_cyc_copy in strict order, and read them in strict order. - * If epoch_cyc and epoch_cyc_copy are not equal, then we're in - * the middle of an update, and we should repeat the load. - */ - do { - epoch_cyc = cd.epoch_cyc; - smp_rmb(); - epoch_ns = cd.epoch_ns; - smp_rmb(); - } while (epoch_cyc != cd.epoch_cyc_copy); - - return epoch_ns + cyc_to_ns((cyc - epoch_cyc) & mask, cd.mult, cd.shift); -} - -/* - * Atomically update the sched_clock epoch. - */ -static void notrace update_sched_clock(void) -{ - unsigned long flags; - u32 cyc; - u64 ns; - - cyc = read_sched_clock(); - ns = cd.epoch_ns + - cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); - /* - * Write epoch_cyc and epoch_ns in a way that the update is - * detectable in cyc_to_fixed_sched_clock(). - */ - raw_local_irq_save(flags); - cd.epoch_cyc_copy = cyc; - smp_wmb(); - cd.epoch_ns = ns; - smp_wmb(); - cd.epoch_cyc = cyc; - raw_local_irq_restore(flags); -} - -static void sched_clock_poll(unsigned long wrap_ticks) -{ - mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); - update_sched_clock(); -} - -void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) -{ - unsigned long r, w; - u64 res, wrap; - char r_unit; - - if (cd.rate > rate) - return; - - BUG_ON(bits > 32); - WARN_ON(!irqs_disabled()); - read_sched_clock = read; - sched_clock_mask = (1 << bits) - 1; - cd.rate = rate; - - /* calculate the mult/shift to convert counter ticks to ns. */ - clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); - - r = rate; - if (r >= 4000000) { - r /= 1000000; - r_unit = 'M'; - } else if (r >= 1000) { - r /= 1000; - r_unit = 'k'; - } else - r_unit = ' '; - - /* calculate how many ns until we wrap */ - wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); - do_div(wrap, NSEC_PER_MSEC); - w = wrap; - - /* calculate the ns resolution of this counter */ - res = cyc_to_ns(1ULL, cd.mult, cd.shift); - pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", - bits, r, r_unit, res, w); - - /* - * Start the timer to keep sched_clock() properly updated and - * sets the initial epoch. - */ - sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); - update_sched_clock(); - - /* - * Ensure that sched_clock() starts off at 0ns - */ - cd.epoch_ns = 0; - - /* Enable IRQ time accounting if we have a fast enough sched_clock */ - if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) - enable_sched_clock_irqtime(); - - pr_debug("Registered %pF as sched_clock source\n", read); -} - -static unsigned long long notrace sched_clock_32(void) -{ - u32 cyc = read_sched_clock(); - return cyc_to_sched_clock(cyc, sched_clock_mask); -} - -unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; - -unsigned long long notrace sched_clock(void) -{ - if (cd.suspended) - return cd.epoch_ns; - - return sched_clock_func(); -} - -void __init sched_clock_postinit(void) -{ - /* - * If no sched_clock function has been provided at that point, - * make it the final one one. - */ - if (read_sched_clock == jiffy_sched_clock_read) - setup_sched_clock(jiffy_sched_clock_read, 32, HZ); - - sched_clock_poll(sched_clock_timer.data); -} - -static int sched_clock_suspend(void) -{ - sched_clock_poll(sched_clock_timer.data); - cd.suspended = true; - return 0; -} - -static void sched_clock_resume(void) -{ - cd.epoch_cyc = read_sched_clock(); - cd.epoch_cyc_copy = cd.epoch_cyc; - cd.suspended = false; -} - -static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, -}; - -static int __init sched_clock_syscore_init(void) -{ - register_syscore_ops(&sched_clock_ops); - return 0; -} -device_initcall(sched_clock_syscore_init); diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index abff4e9aaee0..98aee3258398 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -24,9 +24,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -120,6 +120,4 @@ void __init time_init(void) machine_desc->init_time(); else clocksource_of_init(); - - sched_clock_postinit(); } diff --git a/arch/arm/mach-davinci/time.c b/arch/arm/mach-davinci/time.c index bad361ec1666..7a55b5c95971 100644 --- a/arch/arm/mach-davinci/time.c +++ b/arch/arm/mach-davinci/time.c @@ -18,8 +18,8 @@ #include #include #include +#include -#include #include #include diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c index fea91313678b..cd46529e9eaa 100644 --- a/arch/arm/mach-imx/time.c +++ b/arch/arm/mach-imx/time.c @@ -26,8 +26,8 @@ #include #include #include +#include -#include #include #include "common.h" diff --git a/arch/arm/mach-integrator/integrator_ap.c b/arch/arm/mach-integrator/integrator_ap.c index b23c8e4f28e8..aa4346227c41 100644 --- a/arch/arm/mach-integrator/integrator_ap.c +++ b/arch/arm/mach-integrator/integrator_ap.c @@ -41,6 +41,7 @@ #include #include #include +#include #include