From 5762ba1873b0bb9faa631aaa02f533c2b9837f82 Mon Sep 17 00:00:00 2001 From: Sebastien Dugue Date: Mon, 1 Dec 2008 14:09:07 +0100 Subject: hrtimers: allow the hot-unplugging of all cpus Impact: fix CPU hotplug hang on Power6 testbox On architectures that support offlining all cpus (at least powerpc/pseries), hot-unpluging the tick_do_timer_cpu can result in a system hang. This comes from the fact that if the cpu going down happens to be the cpu doing the tick, then as the tick_do_timer_cpu handover happens after the cpu is dead (via the CPU_DEAD notification), we're left without ticks, jiffies are frozen and any task relying on timers (msleep, ...) is stuck. That's particularly the case for the cpu looping in __cpu_die() waiting for the dying cpu to be dead. This patch addresses this by having the tick_do_timer_cpu handover happen earlier during the CPU_DYING notification. For this, a new clockevent notification type is introduced (CLOCK_EVT_NOTIFY_CPU_DYING) which is triggered in hrtimer_cpu_notify(). Signed-off-by: Sebastien Dugue Cc: Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index ed3a5d473e52..c6de413c5dd1 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -36,6 +36,7 @@ enum clock_event_nofitiers { CLOCK_EVT_NOTIFY_BROADCAST_EXIT, CLOCK_EVT_NOTIFY_SUSPEND, CLOCK_EVT_NOTIFY_RESUME, + CLOCK_EVT_NOTIFY_CPU_DYING, CLOCK_EVT_NOTIFY_CPU_DEAD, }; -- cgit v1.2.3 From 74019224ac34b044b44a31dd89a54e3477db4896 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Feb 2009 12:23:29 +0100 Subject: timers: add mod_timer_pending() Impact: new timer API Based on an idea from Martin Josefsson with the help of Patrick McHardy and Stephen Hemminger: introduce the mod_timer_pending() API which is a mod_timer() offspring that is an invariant on already removed timers. (regular mod_timer() re-activates non-pending timers.) This is useful for the networking code in that it can allow unserialized mod_timer_pending() timer-forwarding calls, but a single del_timer*() will stop the timer from being reactivated again. Also while at it: - optimize the regular mod_timer() path some more, the timer-stat and a debug check was needlessly duplicated in __mod_timer(). - make the exports come straight after the function, as most other exports in timer.c already did. - eliminate __mod_timer() as an external API, change the users to mod_timer(). The regular mod_timer() code path is not impacted significantly, due to inlining optimizations and due to the simplifications. Based-on-patch-from: Stephen Hemminger Acked-by: Stephen Hemminger Cc: "David S. Miller" Cc: Patrick McHardy Cc: netdev@vger.kernel.org Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Ingo Molnar --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- drivers/infiniband/hw/ipath/ipath_driver.c | 6 +- include/linux/timer.h | 22 +----- kernel/relay.c | 2 +- kernel/timer.c | 110 +++++++++++++++++++---------- 5 files changed, 80 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 6a0ad196aeb3..f085369301b1 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -508,7 +508,7 @@ static void __spu_add_to_rq(struct spu_context *ctx) list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]); set_bit(ctx->prio, spu_prio->bitmap); if (!spu_prio->nr_waiting++) - __mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); + mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); } } diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 69c0ce321b4e..cb9daa6ac029 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -2715,7 +2715,7 @@ static void ipath_hol_signal_up(struct ipath_devdata *dd) * to prevent HoL blocking, then start the HoL timer that * periodically continues, then stop procs, so they can detect * link down if they want, and do something about it. - * Timer may already be running, so use __mod_timer, not add_timer. + * Timer may already be running, so use mod_timer, not add_timer. */ void ipath_hol_down(struct ipath_devdata *dd) { @@ -2724,7 +2724,7 @@ void ipath_hol_down(struct ipath_devdata *dd) dd->ipath_hol_next = IPATH_HOL_DOWNCONT; dd->ipath_hol_timer.expires = jiffies + msecs_to_jiffies(ipath_hol_timeout_ms); - __mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); } /* @@ -2763,7 +2763,7 @@ void ipath_hol_event(unsigned long opaque) else { dd->ipath_hol_timer.expires = jiffies + msecs_to_jiffies(ipath_hol_timeout_ms); - __mod_timer(&dd->ipath_hol_timer, + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); } } diff --git a/include/linux/timer.h b/include/linux/timer.h index daf9685b861c..e2d662e3416e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -86,8 +86,8 @@ static inline int timer_pending(const struct timer_list * timer) extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); -extern int __mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer(struct timer_list *timer, unsigned long expires); +extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); /* * The jiffies value which is added to now, when there is no timer @@ -146,25 +146,7 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) } #endif -/** - * add_timer - start a timer - * @timer: the timer to be added - * - * The kernel will do a ->function(->data) callback from the - * timer interrupt at the ->expires point in the future. The - * current time is 'jiffies'. - * - * The timer's ->expires, ->function (and if the handler uses it, ->data) - * fields must be set prior calling this function. - * - * Timers with an ->expires field in the past will be executed in the next - * timer tick. - */ -static inline void add_timer(struct timer_list *timer) -{ - BUG_ON(timer_pending(timer)); - __mod_timer(timer, timer->expires); -} +extern void add_timer(struct timer_list *timer); #ifdef CONFIG_SMP extern int try_to_del_timer_sync(struct timer_list *timer); diff --git a/kernel/relay.c b/kernel/relay.c index 9d79b7854fa6..8f2179c8056f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -750,7 +750,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) * from the scheduler (trying to re-grab * rq->lock), so defer it. */ - __mod_timer(&buf->timer, jiffies + 1); + mod_timer(&buf->timer, jiffies + 1); } old = buf->data; diff --git a/kernel/timer.c b/kernel/timer.c index 13dd64fe143d..9b77fc9a9ac8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -589,11 +589,14 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } } -int __mod_timer(struct timer_list *timer, unsigned long expires) +static inline int +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0; + int ret; + + ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -603,6 +606,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer)) { detach_timer(timer, 0); ret = 1; + } else { + if (pending_only) + goto out_unlock; } debug_timer_activate(timer); @@ -629,42 +635,28 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) timer->expires = expires; internal_add_timer(base, timer); + +out_unlock: spin_unlock_irqrestore(&base->lock, flags); return ret; } -EXPORT_SYMBOL(__mod_timer); - /** - * add_timer_on - start a timer on a particular CPU - * @timer: the timer to be added - * @cpu: the CPU to start it on + * mod_timer_pending - modify a pending timer's timeout + * @timer: the pending timer to be modified + * @expires: new timeout in jiffies * - * This is not very scalable on SMP. Double adds are not possible. + * mod_timer_pending() is the same for pending timers as mod_timer(), + * but will not re-activate and modify already deleted timers. + * + * It is useful for unserialized use of timers. */ -void add_timer_on(struct timer_list *timer, int cpu) +int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - struct tvec_base *base = per_cpu(tvec_bases, cpu); - unsigned long flags; - - timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); - debug_timer_activate(timer); - internal_add_timer(base, timer); - /* - * Check whether the other CPU is idle and needs to be - * triggered to reevaluate the timer wheel when nohz is - * active. We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to idle can not evaluate - * the timer wheel. - */ - wake_up_idle_cpu(cpu); - spin_unlock_irqrestore(&base->lock, flags); + return __mod_timer(timer, expires, true); } +EXPORT_SYMBOL(mod_timer_pending); /** * mod_timer - modify a timer's timeout @@ -688,9 +680,6 @@ void add_timer_on(struct timer_list *timer, int cpu) */ int mod_timer(struct timer_list *timer, unsigned long expires) { - BUG_ON(!timer->function); - - timer_stats_timer_set_start_info(timer); /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -699,11 +688,61 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer->expires == expires && timer_pending(timer)) return 1; - return __mod_timer(timer, expires); + return __mod_timer(timer, expires, false); } - EXPORT_SYMBOL(mod_timer); +/** + * add_timer - start a timer + * @timer: the timer to be added + * + * The kernel will do a ->function(->data) callback from the + * timer interrupt at the ->expires point in the future. The + * current time is 'jiffies'. + * + * The timer's ->expires, ->function (and if the handler uses it, ->data) + * fields must be set prior calling this function. + * + * Timers with an ->expires field in the past will be executed in the next + * timer tick. + */ +void add_timer(struct timer_list *timer) +{ + BUG_ON(timer_pending(timer)); + mod_timer(timer, timer->expires); +} +EXPORT_SYMBOL(add_timer); + +/** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + struct tvec_base *base = per_cpu(tvec_bases, cpu); + unsigned long flags; + + timer_stats_timer_set_start_info(timer); + BUG_ON(timer_pending(timer) || !timer->function); + spin_lock_irqsave(&base->lock, flags); + timer_set_base(timer, base); + debug_timer_activate(timer); + internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); +} + /** * del_timer - deactive a timer. * @timer: the timer to be deactivated @@ -733,7 +772,6 @@ int del_timer(struct timer_list *timer) return ret; } - EXPORT_SYMBOL(del_timer); #ifdef CONFIG_SMP @@ -767,7 +805,6 @@ out: return ret; } - EXPORT_SYMBOL(try_to_del_timer_sync); /** @@ -796,7 +833,6 @@ int del_timer_sync(struct timer_list *timer) cpu_relax(); } } - EXPORT_SYMBOL(del_timer_sync); #endif @@ -1268,7 +1304,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire); + __mod_timer(&timer, expire, false); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.2.3 From 2b9d1496e7835a603c340e8f0dd81f4b74d5f248 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 15:48:43 +0100 Subject: time: ntp: make 64-bit constants more robust Impact: cleanup, no functionality changed - make PPM_SCALE an explicit s64 constant, to remove (s64) casts from usage sites. kernel/time/ntp.o: text data bss dec hex filename 2536 114 136 2786 ae2 ntp.o.before 2536 114 136 2786 ae2 ntp.o.after md5: 40a7728d1188aa18e83e21a81fa7b150 ntp.o.before.asm 40a7728d1188aa18e83e21a81fa7b150 ntp.o.after.asm Signed-off-by: Ingo Molnar --- include/linux/timex.h | 2 +- kernel/time/ntp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 998a55d80acf..aa3475fcff64 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -190,7 +190,7 @@ struct timex { * offset and maximum frequency tolerance. */ #define SHIFT_USEC 16 /* frequency offset scale (shift) */ -#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) +#define PPM_SCALE ((s64)NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) #define PPM_SCALE_INV_SHIFT 19 #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ PPM_SCALE + 1) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4346ed6e623f..7447d57e021a 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -408,7 +408,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts time_status &= ~STA_NANO; if (txc->modes & ADJ_FREQUENCY) { - time_freq = (s64)txc->freq * PPM_SCALE; + time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); } @@ -505,7 +505,7 @@ int do_adjtimex(struct timex *txc) result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * - (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); + PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; -- cgit v1.2.3