From b5227d03b7191a9a44bf75a4c228a6a9ddbe781b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 31 May 2016 16:23:02 -0500 Subject: timers: Clarify usleep_range() function comment Update the usleep_range() function comment to make it clear that it can only be used in non-atomic context. Previously we claimed usleep_range() was a drop-in replacement for udelay() where wakeup is flexible. But that's only true in non-atomic contexts, where it's possible to sleep instead of delay. Signed-off-by: Bjorn Helgaas Cc: John Stultz Link: http://lkml.kernel.org/r/20160531212302.28502.44995.stgit@bhelgaas-glaptop2.roam.corp.google.com Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3a95f9728778..67dd6103003a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1702,9 +1702,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) } /** - * usleep_range - Drop in replacement for udelay where wakeup is flexible + * usleep_range - Sleep for an approximate time * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep + * + * In non-atomic context where the exact wakeup time is flexible, use + * usleep_range() instead of udelay(). The sleep improves responsiveness + * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces + * power usage by allowing hrtimers to take advantage of an already- + * scheduled interrupt instead of scheduling a new one just for this sleep. */ void __sched usleep_range(unsigned long min, unsigned long max) { -- cgit v1.2.3 From 86721ab63b61ef1dd7305308e4049f644703decf Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Tue, 1 Mar 2016 22:58:49 +0530 Subject: hrtimer: Remove redundant #ifdef block Only need CONFIG_NO_HZ_COMMON as this block is already in a CONFIG_SMP block. Signed-off-by: Pratyush Patel Link: http://lkml.kernel.org/r/20160301172849.GA18152@cyborg Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e99df0ff1d42..d13c9aebf7a3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) -- cgit v1.2.3 From 0fb71d340d355156818bb53eb36ae79a3f88bda9 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Mon, 25 Apr 2016 17:20:28 +0800 Subject: clocksource: Make clocksource insert entry more efficient In clocksource_enqueue(), it is unnecessary to continue looping the list, if we find there is an entry that the value of rating is smaller than the new one. It is safe to be out the loop, because all of entry are inserted in descending order. 
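For illustration, the same early-exit insertion can be sketched as a small
standalone C function (the struct and helper names are invented for this
example and are not the kernel's clocksource code): walk the list, which is
kept in descending rating order, stop at the first entry whose rating is
smaller than the new one, and link the new entry in at that point.

  struct src {
          struct src *next;
          int rating;
  };

  /* Insert @cs into a singly linked list kept in descending ->rating order. */
  static void enqueue_sorted(struct src **head, struct src *cs)
  {
          struct src **pos = head;

          /*
           * All existing entries are already in descending order, so the
           * first entry with a smaller rating marks the insertion point
           * and the walk can stop there -- the early exit this patch adds.
           */
          while (*pos && (*pos)->rating >= cs->rating)
                  pos = &(*pos)->next;

          cs->next = *pos;
          *pos = cs;
  }

The patch below expresses the same idea on the kernel's doubly linked list
with list_for_each_entry() and a break.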
Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Minfei Huang Signed-off-by: John Stultz --- kernel/time/clocksource.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 56ece145a814..6a5a310a1a53 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -669,10 +669,12 @@ static void clocksource_enqueue(struct clocksource *cs) struct list_head *entry = &clocksource_list; struct clocksource *tmp; - list_for_each_entry(tmp, &clocksource_list, list) + list_for_each_entry(tmp, &clocksource_list, list) { /* Keep track of the place, where to insert */ - if (tmp->rating >= cs->rating) - entry = &tmp->list; + if (tmp->rating < cs->rating) + break; + entry = &tmp->list; + } list_add(&cs->list, entry); } -- cgit v1.2.3 From 0209b937569a133dedfe930cdfff3a0d1d68c9e9 Mon Sep 17 00:00:00 2001 From: Thomas Graziadei Date: Tue, 31 May 2016 15:06:06 +0200 Subject: timekeeping: Fix 1ns/tick drift with GENERIC_TIME_VSYSCALL_OLD The user notices the problem in a raw and real time drift, calling clock_gettime with CLOCK_REALTIME / CLOCK_MONOTONIC_RAW on a system with no ntp correction taking place (no ntpd or ptp stuff running). The problem is, that old_vsyscall_fixup adds an extra 1ns even though xtime_nsec is already held in full nsecs and the remainder in this case is 0. Do the rounding up buisness only if needed. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Thomas Graziadei Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25cd3d4f..a196e08324e7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -480,10 +480,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) * users are removed, this can be killed. */ remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); - tk->tkr_mono.xtime_nsec -= remainder; - tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; - tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; + if (remainder != 0) { + tk->tkr_mono.xtime_nsec -= remainder; + tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; + } } #else #define old_vsyscall_fixup(tk) -- cgit v1.2.3 From af4afb40085f04f8b2ea8d28df878f7f0be02f89 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Tue, 14 Jun 2016 11:00:42 +0200 Subject: alarmtimer: Fix comments describing structure fields Updated struct alarm and struct alarm_timer descriptions. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Pratyush Patel Signed-off-by: John Stultz --- include/linux/alarmtimer.h | 6 +++--- kernel/time/alarmtimer.c | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 52f3b7da4f2d..9d8031257a90 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -26,10 +26,10 @@ enum alarmtimer_restart { * struct alarm - Alarm timer structure * @node: timerqueue node for adding to the event list this value * also includes the expiration time. 
- * @period: Period for recuring alarms + * @timer: hrtimer used to schedule events while running * @function: Function pointer to be executed when the timer fires. - * @type: Alarm type (BOOTTIME/REALTIME) - * @enabled: Flag that represents if the alarm is set to fire or not + * @type: Alarm type (BOOTTIME/REALTIME). + * @state: Flag that represents if the alarm is set to fire or not. * @data: Internal data value. */ struct alarm { diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e840ed867a5d..c3aad685bbc0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -30,7 +30,6 @@ * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base * @timerqueue: Timerqueue head managing the list of events - * @timer: hrtimer used to schedule events while running * @gettime: Function to read the time correlating to the base * @base_clockid: clockid for the base */ -- cgit v1.2.3 From e6c2682a1da36a2e79d9bab470412374434ce89e Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 8 Jun 2016 22:04:59 -0700 Subject: time: Add time64_to_tm() time_to_tm() takes time_t as an argument. time_t is not y2038 safe. Add time64_to_tm() that takes time64_t as an argument which is y2038 safe. The plan is to eventually replace all calls to time_to_tm() by time64_to_tm(). Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Deepa Dinamani Signed-off-by: John Stultz --- include/linux/time.h | 15 ++++++++++++++- kernel/time/timeconv.c | 11 ++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/time.h b/include/linux/time.h index 297f09f23896..4cea09d94208 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -205,7 +205,20 @@ struct tm { int tm_yday; }; -void time_to_tm(time_t totalsecs, int offset, struct tm *result); +void time64_to_tm(time64_t totalsecs, int offset, struct tm *result); + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + time64_to_tm(totalsecs, offset, result); +} /** * timespec_to_ns - Convert timespec to nanoseconds diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 86628e755f38..7142580ad94f 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c @@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = { #define SECS_PER_DAY (SECS_PER_HOUR * 24) /** - * time_to_tm - converts the calendar time to local broken-down time + * time64_to_tm - converts the calendar time to local broken-down time * * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). * @offset offset seconds adding to totalsecs. 
* @result pointer to struct tm variable to receive broken-down time */ -void time_to_tm(time_t totalsecs, int offset, struct tm *result) +void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { long days, rem, y; + int remainder; const unsigned short *ip; - days = totalsecs / SECS_PER_DAY; - rem = totalsecs % SECS_PER_DAY; + days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); + rem = remainder; rem += offset; while (rem < 0) { rem += SECS_PER_DAY; @@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result) result->tm_mon = y; result->tm_mday = days + 1; } -EXPORT_SYMBOL(time_to_tm); +EXPORT_SYMBOL(time64_to_tm); -- cgit v1.2.3 From 4a19bd3d22d51a0c89db10879dacaffa0f52aecf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 18:03:02 +0200 Subject: time: Avoid timespec in udelay_test udelay_test_single() uses ktime_get_ts() to get two timespec values and calculate the difference between them, while udelay_test_show() uses the same to printk() the current monotonic time. Both of these are y2038 safe on all machines, but we want to get rid of struct timespec anyway, so this converts the code to use ktime_get_ns() and ktime_get_ts64() respectively. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/test_udelay.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index e622ba365a13..b0928ab3270f 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) int allowed_error_ns = usecs * 5; for (i = 0; i < iters; ++i) { - struct timespec ts1, ts2; + s64 kt1, kt2; int time_passed; - ktime_get_ts(&ts1); + kt1 = ktime_get_ns(); udelay(usecs); - ktime_get_ts(&ts2); - time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); + kt2 = ktime_get_ns(); + time_passed = kt2 - kt1; if (i == 0 || time_passed < min) min = time_passed; @@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v) if (usecs > 0 && iters > 0) { return udelay_test_single(s, usecs, iters); } else if (usecs == 0) { - struct timespec ts; + struct timespec64 ts; - ktime_get_ts(&ts); - seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", - loops_per_jiffy, ts.tv_sec, ts.tv_nsec); + ktime_get_ts64(&ts); + seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n", + loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec); seq_puts(s, "usage:\n"); seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); -- cgit v1.2.3 From 7c71feb0a6766c7c3a262e3cc33ae231f3953cb6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 17:30:47 +0200 Subject: timer: Avoid using timespec The tstats_show() function prints a ktime_t variable by converting it to struct timespec first. The algorithm is ok, but we want to stop using timespec in general because of the 32-bit time_t overflow problem. This changes the code to use struct timespec64, without any functional change. 
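As a minimal sketch of the y2038-safe pattern applied here (illustrative
only -- the variable names are invented, the real change is in the diff
below): keep the measurement in ktime_t, convert once to struct timespec64
for printing, and cast tv_sec explicitly to match the printk format.

  ktime_t delta = ktime_sub(time_stop, time_start);
  struct timespec64 period = ktime_to_timespec64(delta);
  unsigned long msecs = period.tv_nsec / NSEC_PER_MSEC;

  pr_info("Sample period: %lld.%03lu s\n",
          (long long)period.tv_sec, msecs);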
Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/timer_stats.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1adecb4b87c8..087204c733eb 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr) static int tstats_show(struct seq_file *m, void *v) { - struct timespec period; + struct timespec64 period; struct entry *entry; unsigned long ms; long events = 0; @@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v) time = ktime_sub(time_stop, time_start); - period = ktime_to_timespec(time); + period = ktime_to_timespec64(time); ms = period.tv_nsec / 1000000; seq_puts(m, "Timer Stats Version: v0.3\n"); - seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); + seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms); if (atomic_read(&overflow_count)) seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); -- cgit v1.2.3 From 6168f8ed01dc46a277908938294f1132d723f58d Mon Sep 17 00:00:00 2001 From: Wei Jiangang Date: Wed, 29 Jun 2016 12:51:50 +0800 Subject: timers/nohz: Fix several typos Signed-off-by: Wei Jiangang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Link: http://lkml.kernel.org/r/1467175910-2966-2-git-send-email-weijg.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 536ada80f6dd..6d83e9c4a302 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now) if (delta.tv64 < tick_period.tv64) return; - /* Reevalute with jiffies_lock held */ + /* Reevaluate with jiffies_lock held */ write_seqlock(&jiffies_lock); delta = ktime_sub(now, last_jiffies_update); @@ -117,7 +117,7 @@ static void tick_sched_do_timer(ktime_t now) /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to + * into a long sleep. If two cpus happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. */ @@ -571,7 +571,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) * @last_update_time: variable to store update time in. Do not update * counters if NULL. * - * Return the cummulative idle time (since boot) for a given + * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, @@ -612,7 +612,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); * @last_update_time: variable to store update time in. Do not update * counters if NULL. * - * Return the cummulative iowait time (since boot) for a given + * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, @@ -733,7 +733,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * do_timer() never invoked. 
Keep track of the fact that it * was the one which had the do_timer() duty last. If this cpu * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferement value. + * sleep time to the timekeeping max_deferment value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); -- cgit v1.2.3 From 0de7611a1031f25b713fda7d36de44f17c2ed790 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 1 Jul 2016 12:42:35 +0200 Subject: timers/nohz: Capitalize 'CPU' consistently While reviewing another patch I noticed that kernel/time/tick-sched.c had a charmingly (confusingly, annoyingly) rich set of variants for spelling 'CPU': cpu cpus CPU CPUs per CPU per-CPU per cpu ... sometimes these were mixed even within the same comment block! Compress these variants down to a single consistent set of: CPU CPUs per-CPU Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6d83e9c4a302..db57d1ba73eb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -31,7 +31,7 @@ #include /* - * Per cpu nohz control structure + * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); @@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now) #ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themselves to + * concurrency: This happens only when the CPU in charge went + * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. */ @@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: - * perf events, posix cpu timers, ... + * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { @@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void) * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy - * value. We do this unconditionally on any cpu, as we don't know whether the - * cpu, which has the update task assigned is in a long sleep. + * value. We do this unconditionally on any CPU, as we don't know whether the + * CPU, which has the update task assigned is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { @@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now) } /* - * Updates the per cpu time idle statistics counters + * Updates the per-CPU time idle statistics counters */ static void update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) @@ -566,7 +566,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) } /** - * get_cpu_idle_time_us - get the total idle time of a cpu + * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. 
@@ -607,7 +607,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** - * get_cpu_iowait_time_us - get the total iowait time of a cpu + * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. @@ -726,12 +726,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } /* - * If this cpu is the one which updates jiffies, then give up - * the assignment and let it be taken by the cpu which runs - * the tick timer next, which might be this cpu as well. If we + * If this CPU is the one which updates jiffies, then give up + * the assignment and let it be taken by the CPU which runs + * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it - * was the one which had the do_timer() duty last. If this cpu + * was the one which had the do_timer() duty last. If this CPU * is the one which had the do_timer() duty last, we limit the * sleep time to the timekeeping max_deferment value. * Otherwise we can sleep as long as we want. @@ -841,9 +841,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* - * If this cpu is offline and it is the one which updates + * If this CPU is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop + * the CPU which runs the tick timer next. If we don't drop * this here the jiffies might be stale and do_timer() never * invoked. */ @@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void) WARN_ON_ONCE(irqs_disabled()); /* - * Update the idle state in the scheduler domain hierarchy - * when tick_nohz_stop_sched_tick() is called from the idle loop. - * State will be updated to busy during the first busy tick after - * exiting idle. - */ + * Update the idle state in the scheduler domain hierarchy + * when tick_nohz_stop_sched_tick() is called from the idle loop. + * State will be updated to busy during the first busy tick after + * exiting idle. + */ set_cpu_sd_state_idle(); local_irq_disable(); @@ -1211,7 +1211,7 @@ void tick_setup_sched_timer(void) hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - /* Get the next period (per cpu) */ + /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert jiffies_lock contention. */ -- cgit v1.2.3 From 5130213721d01b6632c255d4295a8102cbb58379 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 5 Jul 2016 16:57:51 +0800 Subject: tick/broadcast-hrtimer: Set name of the ce_broadcast_hrtimer This is to avoid the "null" name when we either ~ # cat /sys/devices/system/clockevents/broadcast/current_device (null) or ~ # cat /proc/timer_list ... Tick Device: mode: 1 Broadcast device Clock Event Device: (null) ... 
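With the name filled in, the same queries would be expected to report the
device instead of (null), e.g. (illustrative output):

~ # cat /sys/devices/system/clockevents/broadcast/current_device
bc_hrtimer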
Signed-off-by: Jisheng Zhang Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1467709071-3667-1-git-send-email-jszhang@marvell.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast-hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 53d7184da0be..690b797f522e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) } static struct clock_event_device ce_broadcast_hrtimer = { + .name = "bc_hrtimer", .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | -- cgit v1.2.3 From e675447bda51c1ea72d1ac9132ce3bed974f1da3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:15 +0000 Subject: timers: Make 'pinned' a timer property We want to move the timer migration logic from a 'push' to a 'pull' model. Under the current 'push' model pinned timers are handled via a runtime API variant: mod_timer_pinned(). The 'pull' model requires us to store the pinned attribute of a timer in the timer_list structure itself, as a new TIMER_PINNED bit in timer->flags. This flag must be set at initialization time and the timer APIs recognize the flag. This patch: - Implements the new flag and associated new-style initialization methods - makes mod_timer() recognize new-style pinned timers, - and adds some migration helper facility to allow step by step conversion of old-style to new-style pinned timers. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.049338558@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 25 ++++++++++++++++++++++--- kernel/time/timer.c | 10 +++++----- 2 files changed, 27 insertions(+), 8 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/timer.h b/include/linux/timer.h index 20ac746f3eb3..046d6cf26498 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -62,7 +62,8 @@ struct timer_list { #define TIMER_MIGRATING 0x00080000 #define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) #define TIMER_DEFERRABLE 0x00100000 -#define TIMER_IRQSAFE 0x00200000 +#define TIMER_PINNED 0x00200000 +#define TIMER_IRQSAFE 0x00400000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ @@ -78,9 +79,15 @@ struct timer_list { #define TIMER_INITIALIZER(_function, _expires, _data) \ __TIMER_INITIALIZER((_function), (_expires), (_data), 0) +#define TIMER_PINNED_INITIALIZER(_function, _expires, _data) \ + __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_PINNED) + #define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) \ __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE) +#define TIMER_PINNED_DEFERRED_INITIALIZER(_function, _expires, _data) \ + __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE | TIMER_PINNED) + #define DEFINE_TIMER(_name, _function, _expires, _data) \ struct timer_list _name = \ TIMER_INITIALIZER(_function, _expires, _data) @@ -124,8 +131,12 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define init_timer(timer) \ __init_timer((timer), 0) +#define init_timer_pinned(timer) \ + __init_timer((timer), TIMER_PINNED) #define init_timer_deferrable(timer) \ __init_timer((timer), TIMER_DEFERRABLE) +#define init_timer_pinned_deferrable(timer) \ + __init_timer((timer), TIMER_DEFERRABLE | TIMER_PINNED) #define init_timer_on_stack(timer) \ __init_timer_on_stack((timer), 0) @@ -145,12 +156,20 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_timer(timer, fn, data) \ __setup_timer((timer), (fn), (data), 0) +#define setup_pinned_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), TIMER_PINNED) #define setup_deferrable_timer(timer, fn, data) \ __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE) +#define setup_pinned_deferrable_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) #define setup_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), 0) +#define setup_pinned_timer_on_stack(timer, fn, data) \ + __setup_timer_on_stack((timer), (fn), (data), TIMER_PINNED) #define setup_deferrable_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE) +#define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ + __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) /** * timer_pending - is a timer pending? 
@@ -175,8 +194,8 @@ extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); extern void set_timer_slack(struct timer_list *time, int slack_hz); -#define TIMER_NOT_PINNED 0 -#define TIMER_PINNED 1 +#define MOD_TIMER_NOT_PINNED 0 +#define MOD_TIMER_PINNED 1 /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3a95f9728778..693f6d14058e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -782,7 +782,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = get_target_base(base, pinned); + new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED); if (base != new_base) { /* @@ -825,7 +825,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); + return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer_pending); @@ -900,7 +900,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; - return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); + return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); @@ -928,7 +928,7 @@ int mod_timer_pinned(struct timer_list *timer, unsigned long expires) if (timer->expires == expires && timer_pending(timer)) return 1; - return __mod_timer(timer, expires, false, TIMER_PINNED); + return __mod_timer(timer, expires, false, MOD_TIMER_PINNED); } EXPORT_SYMBOL(mod_timer_pinned); @@ -1512,7 +1512,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); + __mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.2.3 From 177ec0a0a531695210b277d734b2f92ee5796303 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:24 +0000 Subject: timers: Remove the deprecated mod_timer_pinned() API We switched all users to initialize the timers as pinned and call mod_timer(). Remove the now unused timer API function. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.706205231@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 3 --- kernel/time/timer.c | 39 +++++---------------------------------- 2 files changed, 5 insertions(+), 37 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/timer.h b/include/linux/timer.h index 046d6cf26498..a8f6c70eb414 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -190,12 +190,9 @@ extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); -extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); extern void set_timer_slack(struct timer_list *time, int slack_hz); -#define MOD_TIMER_NOT_PINNED 0 -#define MOD_TIMER_PINNED 1 /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 693f6d14058e..ba49c1cf80f5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -764,8 +764,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { struct tvec_base *base, *new_base; unsigned long flags; @@ -782,7 +781,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED); + new_base = get_target_base(base, timer->flags & TIMER_PINNED); if (base != new_base) { /* @@ -825,7 +824,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED); + return __mod_timer(timer, expires, true); } EXPORT_SYMBOL(mod_timer_pending); @@ -900,38 +899,10 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; - return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED); + return __mod_timer(timer, expires, false); } EXPORT_SYMBOL(mod_timer); -/** - * mod_timer_pinned - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer_pinned() is a way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * and to ensure that the timer is scheduled on the current CPU. - * - * Note that this does not prevent the timer from being migrated - * when the current CPU goes offline. If this is a problem for - * you, use CPU-hotplug notifiers to handle it correctly, for - * example, cancelling the timer when the corresponding CPU goes - * offline. 
- * - * mod_timer_pinned(timer, expires) is equivalent to: - * - * del_timer(timer); timer->expires = expires; add_timer(timer); - */ -int mod_timer_pinned(struct timer_list *timer, unsigned long expires) -{ - if (timer->expires == expires && timer_pending(timer)) - return 1; - - return __mod_timer(timer, expires, false, MOD_TIMER_PINNED); -} -EXPORT_SYMBOL(mod_timer_pinned); - /** * add_timer - start a timer * @timer: the timer to be added @@ -1512,7 +1483,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED); + __mod_timer(&timer, expire, false); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.2.3 From 494af3ed7848de08640d98ee5aff57a45c137c3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:28 +0000 Subject: timers: Give a few structs and members proper names Some of the names in the internal implementation of the timer code are not longer correct and others are simply too long to type. Clean it up before we switch the wheel implementation over to the new scheme. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.948752516@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 118 ++++++++++++++++++++++++++-------------------------- 1 file changed, 59 insertions(+), 59 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ba49c1cf80f5..f259a3ef4577 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -77,10 +77,10 @@ struct tvec_root { struct hlist_head vec[TVR_SIZE]; }; -struct tvec_base { +struct timer_base { spinlock_t lock; struct timer_list *running_timer; - unsigned long timer_jiffies; + unsigned long clk; unsigned long next_timer; unsigned long active_timers; unsigned long all_timers; @@ -95,7 +95,7 @@ struct tvec_base { } ____cacheline_aligned; -static DEFINE_PER_CPU(struct tvec_base, tvec_bases); +static DEFINE_PER_CPU(struct timer_base, timer_bases); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) unsigned int sysctl_timer_migration = 1; @@ -106,15 +106,15 @@ void timers_update_migration(bool update_nohz) unsigned int cpu; /* Avoid the loop, if nothing to update */ - if (this_cpu_read(tvec_bases.migration_enabled) == on) + if (this_cpu_read(timer_bases.migration_enabled) == on) return; for_each_possible_cpu(cpu) { - per_cpu(tvec_bases.migration_enabled, cpu) = on; + per_cpu(timer_bases.migration_enabled, cpu) = on; per_cpu(hrtimer_bases.migration_enabled, cpu) = on; if (!update_nohz) continue; - per_cpu(tvec_bases.nohz_active, cpu) = true; + per_cpu(timer_bases.nohz_active, cpu) = true; per_cpu(hrtimer_bases.nohz_active, cpu) = true; } } @@ -134,18 +134,18 @@ int timer_migration_handler(struct ctl_table *table, int write, return ret; } -static inline struct tvec_base *get_target_base(struct tvec_base *base, +static inline struct timer_base *get_target_base(struct timer_base *base, int pinned) { if (pinned || !base->migration_enabled) - return this_cpu_ptr(&tvec_bases); - return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); + return this_cpu_ptr(&timer_bases); + return per_cpu_ptr(&timer_bases, get_nohz_timer_target()); } 
#else -static inline struct tvec_base *get_target_base(struct tvec_base *base, +static inline struct timer_base *get_target_base(struct timer_base *base, int pinned) { - return this_cpu_ptr(&tvec_bases); + return this_cpu_ptr(&timer_bases); } #endif @@ -371,10 +371,10 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) EXPORT_SYMBOL_GPL(set_timer_slack); static void -__internal_add_timer(struct tvec_base *base, struct timer_list *timer) +__internal_add_timer(struct timer_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; - unsigned long idx = expires - base->timer_jiffies; + unsigned long idx = expires - base->clk; struct hlist_head *vec; if (idx < TVR_SIZE) { @@ -394,7 +394,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) * Can happen if you add a timer with expires == jiffies, * or you set a timer to go off in the past */ - vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + vec = base->tv1.vec + (base->clk & TVR_MASK); } else { int i; /* If the timeout is larger than MAX_TVAL (on 64-bit @@ -403,7 +403,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) */ if (idx > MAX_TVAL) { idx = MAX_TVAL; - expires = idx + base->timer_jiffies; + expires = idx + base->clk; } i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; vec = base->tv5.vec + i; @@ -412,11 +412,11 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) hlist_add_head(&timer->entry, vec); } -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { /* Advance base->jiffies, if the base is empty */ if (!base->all_timers++) - base->timer_jiffies = jiffies; + base->clk = jiffies; __internal_add_timer(base, timer); /* @@ -707,7 +707,7 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending) } static inline void -detach_expired_timer(struct timer_list *timer, struct tvec_base *base) +detach_expired_timer(struct timer_list *timer, struct timer_base *base) { detach_timer(timer, true); if (!(timer->flags & TIMER_DEFERRABLE)) @@ -715,7 +715,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) base->all_timers--; } -static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, +static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { if (!timer_pending(timer)) @@ -725,16 +725,16 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, if (!(timer->flags & TIMER_DEFERRABLE)) { base->active_timers--; if (timer->expires == base->next_timer) - base->next_timer = base->timer_jiffies; + base->next_timer = base->clk; } /* If this was the last timer, advance base->jiffies */ if (!--base->all_timers) - base->timer_jiffies = jiffies; + base->clk = jiffies; return 1; } /* - * We are using hashed locking: holding per_cpu(tvec_bases).lock + * We are using hashed locking: holding per_cpu(timer_bases).lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. 
* @@ -744,16 +744,16 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, * When the timer's base is locked and removed from the list, the * TIMER_MIGRATING flag is set, FIXME */ -static struct tvec_base *lock_timer_base(struct timer_list *timer, +static struct timer_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { for (;;) { u32 tf = timer->flags; - struct tvec_base *base; + struct timer_base *base; if (!(tf & TIMER_MIGRATING)) { - base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); + base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK); spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; @@ -766,7 +766,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { - struct tvec_base *base, *new_base; + struct timer_base *base, *new_base; unsigned long flags; int ret = 0; @@ -933,8 +933,8 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); - struct tvec_base *base; + struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu); + struct timer_base *base; unsigned long flags; timer_stats_timer_set_start_info(timer); @@ -975,7 +975,7 @@ EXPORT_SYMBOL_GPL(add_timer_on); */ int del_timer(struct timer_list *timer) { - struct tvec_base *base; + struct timer_base *base; unsigned long flags; int ret = 0; @@ -1001,7 +1001,7 @@ EXPORT_SYMBOL(del_timer); */ int try_to_del_timer_sync(struct timer_list *timer) { - struct tvec_base *base; + struct timer_base *base; unsigned long flags; int ret = -1; @@ -1085,7 +1085,7 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static int cascade(struct tvec_base *base, struct tvec *tv, int index) +static int cascade(struct timer_base *base, struct tvec *tv, int index) { /* cascade all the timers from tv up one level */ struct timer_list *timer; @@ -1149,7 +1149,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), } } -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) +#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) /** * __run_timers - run all expired timers (if any) on this CPU. @@ -1158,23 +1158,23 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), * This function cascades all vectors and executes all expired timer * vectors. 
*/ -static inline void __run_timers(struct tvec_base *base) +static inline void __run_timers(struct timer_base *base) { struct timer_list *timer; spin_lock_irq(&base->lock); - while (time_after_eq(jiffies, base->timer_jiffies)) { + while (time_after_eq(jiffies, base->clk)) { struct hlist_head work_list; struct hlist_head *head = &work_list; int index; if (!base->all_timers) { - base->timer_jiffies = jiffies; + base->clk = jiffies; break; } - index = base->timer_jiffies & TVR_MASK; + index = base->clk & TVR_MASK; /* * Cascade timers: @@ -1184,7 +1184,7 @@ static inline void __run_timers(struct tvec_base *base) (!cascade(base, &base->tv3, INDEX(1))) && !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; + ++base->clk; hlist_move_list(base->tv1.vec + index, head); while (!hlist_empty(head)) { void (*fn)(unsigned long); @@ -1222,16 +1222,16 @@ static inline void __run_timers(struct tvec_base *base) * is used on S/390 to stop all activity when a CPU is idle. * This function needs to be called with interrupts disabled. */ -static unsigned long __next_timer_interrupt(struct tvec_base *base) +static unsigned long __next_timer_interrupt(struct timer_base *base) { - unsigned long timer_jiffies = base->timer_jiffies; - unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; + unsigned long clk = base->clk; + unsigned long expires = clk + NEXT_TIMER_MAX_DELTA; int index, slot, array, found = 0; struct timer_list *nte; struct tvec *varray[4]; /* Look for timer events in tv1. */ - index = slot = timer_jiffies & TVR_MASK; + index = slot = clk & TVR_MASK; do { hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { if (nte->flags & TIMER_DEFERRABLE) @@ -1250,8 +1250,8 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base) cascade: /* Calculate the next cascade event */ if (index) - timer_jiffies += TVR_SIZE - index; - timer_jiffies >>= TVR_BITS; + clk += TVR_SIZE - index; + clk >>= TVR_BITS; /* Check tv2-tv5. 
*/ varray[0] = &base->tv2; @@ -1262,7 +1262,7 @@ cascade: for (array = 0; array < 4; array++) { struct tvec *varp = varray[array]; - index = slot = timer_jiffies & TVN_MASK; + index = slot = clk & TVN_MASK; do { hlist_for_each_entry(nte, varp->vec + slot, entry) { if (nte->flags & TIMER_DEFERRABLE) @@ -1286,8 +1286,8 @@ cascade: } while (slot != index); if (index) - timer_jiffies += TVN_SIZE - index; - timer_jiffies >>= TVN_BITS; + clk += TVN_SIZE - index; + clk >>= TVN_BITS; } return expires; } @@ -1335,7 +1335,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { - struct tvec_base *base = this_cpu_ptr(&tvec_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases); u64 expires = KTIME_MAX; unsigned long nextevt; @@ -1348,7 +1348,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) spin_lock(&base->lock); if (base->active_timers) { - if (time_before_eq(base->next_timer, base->timer_jiffies)) + if (time_before_eq(base->next_timer, base->clk)) base->next_timer = __next_timer_interrupt(base); nextevt = base->next_timer; if (time_before_eq(nextevt, basej)) @@ -1387,9 +1387,9 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = this_cpu_ptr(&tvec_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases); - if (time_after_eq(jiffies, base->timer_jiffies)) + if (time_after_eq(jiffies, base->clk)) __run_timers(base); } @@ -1534,7 +1534,7 @@ signed long __sched schedule_timeout_idle(signed long timeout) EXPORT_SYMBOL(schedule_timeout_idle); #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) +static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { struct timer_list *timer; int cpu = new_base->cpu; @@ -1550,13 +1550,13 @@ static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *he static void migrate_timers(int cpu) { - struct tvec_base *old_base; - struct tvec_base *new_base; + struct timer_base *old_base; + struct timer_base *new_base; int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu_ptr(&tvec_bases, cpu); - new_base = get_cpu_ptr(&tvec_bases); + old_base = per_cpu_ptr(&timer_bases, cpu); + new_base = get_cpu_ptr(&timer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
@@ -1580,7 +1580,7 @@ static void migrate_timers(int cpu) spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); - put_cpu_ptr(&tvec_bases); + put_cpu_ptr(&timer_bases); } static int timer_cpu_notify(struct notifier_block *self, @@ -1608,13 +1608,13 @@ static inline void timer_register_cpu_notifier(void) { } static void __init init_timer_cpu(int cpu) { - struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); + struct timer_base *base = per_cpu_ptr(&timer_bases, cpu); base->cpu = cpu; spin_lock_init(&base->lock); - base->timer_jiffies = jiffies; - base->next_timer = base->timer_jiffies; + base->clk = jiffies; + base->next_timer = base->clk; } static void __init init_timer_cpus(void) -- cgit v1.2.3 From 500462a9de657f86edaa102f8ab6bff7f7e43fc2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:30 +0000 Subject: timers: Switch to a non-cascading wheel The current timer wheel has some drawbacks: 1) Cascading: Cascading can be an unbound operation and is completely pointless in most cases because the vast majority of the timer wheel timers are canceled or rearmed before expiration. (They are used as timeout safeguards, not as real timers to measure time.) 2) No fast lookup of the next expiring timer: In NOHZ scenarios the first timer soft interrupt after a long NOHZ period must fast forward the base time to the current value of jiffies. As we have no way to find the next expiring timer fast, the code loops linearly and increments the base time one by one and checks for expired timers in each step. This causes unbound overhead spikes exactly in the moment when we should wake up as fast as possible. After a thorough analysis of real world data gathered on laptops, workstations, webservers and other machines (thanks Chris!) I came to the conclusion that the current 'classic' timer wheel implementation can be modified to address the above issues. The vast majority of timer wheel timers is canceled or rearmed before expiry. Most of them are timeouts for networking and other I/O tasks. The nature of timeouts is to catch the exception from normal operation (TCP ack timed out, disk does not respond, etc.). For these kinds of timeouts the accuracy of the timeout is not really a concern. Timeouts are very often approximate worst-case values and in case the timeout fires, we already waited for a long time and performance is down the drain already. The few timers which actually expire can be split into two categories: 1) Short expiry times which expect halfways accurate expiry 2) Long term expiry times are inaccurate today already due to the batching which is done for NOHZ automatically and also via the set_timer_slack() API. So for long term expiry timers we can avoid the cascading property and just leave them in the less granular outer wheels until expiry or cancelation. Timers which are armed with a timeout larger than the wheel capacity are no longer cascaded. We expire them with the longest possible timeout (6+ days). We have not observed such timeouts in our data collection, but at least we handle them, applying the rule of the least surprise. To avoid extending the wheel levels for HZ=1000 so we can accomodate the longest observed timeouts (5 days in the network conntrack code) we reduce the first level granularity on HZ=1000 to 4ms, which effectively is the same as the HZ=250 behaviour. From our data analysis there is nothing which relies on that 1ms granularity and as a side effect we get better batching and timer locality for the networking code as well. 
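To make the level arithmetic concrete, here is a small self-contained
userspace sketch. The constants mirror the LVL_* macros added by this patch
(64 buckets per level, a divisor of 8 between levels); the function and
variable names are invented for the example. It maps a relative expiry in
ticks to the wheel level and bucket granularity it would land in:

  #include <stdio.h>

  #define LVL_CLK_SHIFT 3                      /* each level is 8x coarser */
  #define LVL_BITS      6                      /* 64 buckets per level */
  #define LVL_SIZE      (1UL << LVL_BITS)
  #define LVL_SHIFT(n)  ((n) * LVL_CLK_SHIFT)
  #define LVL_GRAN(n)   (1UL << LVL_SHIFT(n))
  #define LVL_START(n)  ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

  /* Report which level a relative expiry (in ticks) falls into. */
  static void classify(unsigned long delta)
  {
          unsigned int lvl = 0;

          /* A delta below LVL_START(n + 1) stays in level n. */
          while (lvl < 7 && delta >= LVL_START(lvl + 1))
                  lvl++;
          printf("delta %6lu ticks -> level %u, bucket granularity %lu ticks\n",
                 delta, lvl, LVL_GRAN(lvl));
  }

  int main(void)
  {
          classify(10);    /* level 0: 1-tick buckets (4 ms at HZ=250)    */
          classify(100);   /* level 1: 8-tick buckets (32 ms at HZ=250)   */
          classify(5000);  /* level 3: 512-tick buckets (~2 s at HZ=250)  */
          return 0;
  }

A timer stays in the bucket it was queued into; it is never re-sorted into a
finer level, so it simply fires with the granularity of the level that
matched its original distance -- which is where the implicit batching
described above comes from.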
Contrary to the classic wheel the granularity of the next wheel is not the capacity of the first wheel. The granularities of the wheels are in the currently chosen setting 8 times the granularity of the previous wheel. So for HZ=250 we end up with the following granularity levels: Level Offset Granularity Range 0 0 4 ms 0 ms - 252 ms 1 64 32 ms 256 ms - 2044 ms (256ms - ~2s) 2 128 256 ms 2048 ms - 16380 ms (~2s - ~16s) 3 192 2048 ms (~2s) 16384 ms - 131068 ms (~16s - ~2m) 4 256 16384 ms (~16s) 131072 ms - 1048572 ms (~2m - ~17m) 5 320 131072 ms (~2m) 1048576 ms - 8388604 ms (~17m - ~2h) 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) That's a worst case inaccuracy of 12.5% for the timers which are queued at the beginning of a level. So the new wheel concept addresses the old issues: 1) Cascading is avoided completely 2) By keeping the timers in the bucket until expiry/cancelation we can track the buckets which have timers enqueued in a bucket bitmap and therefore can look up the next expiring timer very fast and O(1). A further benefit of the concept is that the slack calculation which is done on every timer start is no longer necessary because the granularity levels provide natural batching already. Our extensive testing with various loads did not show any performance degradation vs. the current wheel implementation. This patch does not address the 'fast lookup' issue as we wanted to make sure that there is no regression introduced by the wheel redesign. The optimizations are in follow up patches. This patch contains fixes from Anna-Maria Gleixner and Richard Cochran. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.108621834@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 2 + kernel/time/timer.c | 829 ++++++++++++++++++++++++++++---------------------- 2 files changed, 469 insertions(+), 362 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/timer.h b/include/linux/timer.h index 989f33d16ebf..5869ab9848fe 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -64,6 +64,8 @@ struct timer_list { #define TIMER_DEFERRABLE 0x00080000 #define TIMER_PINNED 0x00100000 #define TIMER_IRQSAFE 0x00200000 +#define TIMER_ARRAYSHIFT 22 +#define TIMER_ARRAYMASK 0xFFC00000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f259a3ef4577..86e95b72665d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -59,43 +59,151 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); /* - * per-CPU timer vector definitions: + * The timer wheel has LVL_DEPTH array levels. Each level provides an array of + * LVL_SIZE buckets. Each level is driven by its own clock and therefor each + * level has a different granularity. + * + * The level granularity is: LVL_CLK_DIV ^ lvl + * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) + * + * The array level of a newly armed timer depends on the relative expiry + * time. The farther the expiry time is away the higher the array level and + * therefor the granularity becomes. 
+ * + * Contrary to the original timer wheel implementation, which aims for 'exact' + * expiry of the timers, this implementation removes the need for recascading + * the timers into the lower array levels. The previous 'classic' timer wheel + * implementation of the kernel already violated the 'exact' expiry by adding + * slack to the expiry time to provide batched expiration. The granularity + * levels provide implicit batching. + * + * This is an optimization of the original timer wheel implementation for the + * majority of the timer wheel use cases: timeouts. The vast majority of + * timeout timers (networking, disk I/O ...) are canceled before expiry. If + * the timeout expires it indicates that normal operation is disturbed, so it + * does not matter much whether the timeout comes with a slight delay. + * + * The only exception to this are networking timers with a small expiry + * time. They rely on the granularity. Those fit into the first wheel level, + * which has HZ granularity. + * + * We don't have cascading anymore. timers with a expiry time above the + * capacity of the last wheel level are force expired at the maximum timeout + * value of the last wheel level. From data sampling we know that the maximum + * value observed is 5 days (network connection tracking), so this should not + * be an issue. + * + * The currently chosen array constants values are a good compromise between + * array size and granularity. + * + * This results in the following granularity and range levels: + * + * HZ 1000 steps + * Level Offset Granularity Range + * 0 0 1 ms 0 ms - 63 ms + * 1 64 8 ms 64 ms - 511 ms + * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) + * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) + * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) + * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) + * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) + * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) + * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) + * + * HZ 300 + * Level Offset Granularity Range + * 0 0 3 ms 0 ms - 210 ms + * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) + * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) + * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) + * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) + * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) + * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) + * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) + * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) + * + * HZ 250 + * Level Offset Granularity Range + * 0 0 4 ms 0 ms - 255 ms + * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) + * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) + * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) + * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) + * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) + * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) + * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) + * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) + * + * HZ 100 + * Level Offset Granularity Range + * 0 0 10 ms 0 ms - 630 ms + * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) + * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) + * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) + * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) + * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) + * 6 384 2621440 ms (~43m) 20971520 
ms - 167772150 ms (~5h - ~1d) + * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) */ -#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) -#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) -#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) - -struct tvec { - struct hlist_head vec[TVN_SIZE]; -}; -struct tvec_root { - struct hlist_head vec[TVR_SIZE]; -}; +/* Clock divisor for the next level */ +#define LVL_CLK_SHIFT 3 +#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) +#define LVL_CLK_MASK (LVL_CLK_DIV - 1) +#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) +#define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) + +/* + * The time start value for each level to select the bucket at enqueue + * time. + */ +#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) + +/* Size of each clock level */ +#define LVL_BITS 6 +#define LVL_SIZE (1UL << LVL_BITS) +#define LVL_MASK (LVL_SIZE - 1) +#define LVL_OFFS(n) ((n) * LVL_SIZE) + +/* Level depth */ +#if HZ > 100 +# define LVL_DEPTH 9 +# else +# define LVL_DEPTH 8 +#endif + +/* The cutoff (max. capacity of the wheel) */ +#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) +#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) + +/* + * The resulting wheel size. If NOHZ is configured we allocate two + * wheels so we have a separate storage for the deferrable timers. + */ +#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) + +#ifdef CONFIG_NO_HZ_COMMON +# define NR_BASES 2 +# define BASE_STD 0 +# define BASE_DEF 1 +#else +# define NR_BASES 1 +# define BASE_STD 0 +# define BASE_DEF 0 +#endif struct timer_base { - spinlock_t lock; - struct timer_list *running_timer; - unsigned long clk; - unsigned long next_timer; - unsigned long active_timers; - unsigned long all_timers; - int cpu; - bool migration_enabled; - bool nohz_active; - struct tvec_root tv1; - struct tvec tv2; - struct tvec tv3; - struct tvec tv4; - struct tvec tv5; + spinlock_t lock; + struct timer_list *running_timer; + unsigned long clk; + unsigned int cpu; + bool migration_enabled; + bool nohz_active; + DECLARE_BITMAP(pending_map, WHEEL_SIZE); + struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; - -static DEFINE_PER_CPU(struct timer_base, timer_bases); +static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) unsigned int sysctl_timer_migration = 1; @@ -106,15 +214,17 @@ void timers_update_migration(bool update_nohz) unsigned int cpu; /* Avoid the loop, if nothing to update */ - if (this_cpu_read(timer_bases.migration_enabled) == on) + if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on) return; for_each_possible_cpu(cpu) { - per_cpu(timer_bases.migration_enabled, cpu) = on; + per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; + per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on; per_cpu(hrtimer_bases.migration_enabled, cpu) = on; if (!update_nohz) continue; - per_cpu(timer_bases.nohz_active, cpu) = true; + per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; + per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true; per_cpu(hrtimer_bases.nohz_active, cpu) = true; } } @@ -133,20 +243,6 @@ int timer_migration_handler(struct ctl_table *table, int write, mutex_unlock(&mutex); return ret; } - -static inline struct timer_base *get_target_base(struct timer_base *base, - int pinned) -{ - if (pinned || !base->migration_enabled) - 
return this_cpu_ptr(&timer_bases); - return per_cpu_ptr(&timer_bases, get_nohz_timer_target()); -} -#else -static inline struct timer_base *get_target_base(struct timer_base *base, - int pinned) -{ - return this_cpu_ptr(&timer_bases); -} #endif static unsigned long round_jiffies_common(unsigned long j, int cpu, @@ -370,78 +466,91 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); +static inline unsigned int timer_get_idx(struct timer_list *timer) +{ + return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; +} + +static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) +{ + timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | + idx << TIMER_ARRAYSHIFT; +} + +/* + * Helper function to calculate the array index for a given expiry + * time. + */ +static inline unsigned calc_index(unsigned expires, unsigned lvl) +{ + expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + return LVL_OFFS(lvl) + (expires & LVL_MASK); +} + static void __internal_add_timer(struct timer_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; - unsigned long idx = expires - base->clk; + unsigned long delta = expires - base->clk; struct hlist_head *vec; - - if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - vec = base->tv1.vec + i; - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - vec = base->tv2.vec + i; - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = base->tv3.vec + i; - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = base->tv4.vec + i; - } else if ((signed long) idx < 0) { - /* - * Can happen if you add a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - vec = base->tv1.vec + (base->clk & TVR_MASK); + unsigned int idx; + + if (delta < LVL_START(1)) { + idx = calc_index(expires, 0); + } else if (delta < LVL_START(2)) { + idx = calc_index(expires, 1); + } else if (delta < LVL_START(3)) { + idx = calc_index(expires, 2); + } else if (delta < LVL_START(4)) { + idx = calc_index(expires, 3); + } else if (delta < LVL_START(5)) { + idx = calc_index(expires, 4); + } else if (delta < LVL_START(6)) { + idx = calc_index(expires, 5); + } else if (delta < LVL_START(7)) { + idx = calc_index(expires, 6); + } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { + idx = calc_index(expires, 7); + } else if ((long) delta < 0) { + idx = base->clk & LVL_MASK; } else { - int i; - /* If the timeout is larger than MAX_TVAL (on 64-bit - * architectures or with CONFIG_BASE_SMALL=1) then we - * use the maximum timeout. + /* + * Force expire obscene large timeouts to expire at the + * capacity limit of the wheel. */ - if (idx > MAX_TVAL) { - idx = MAX_TVAL; - expires = idx + base->clk; - } - i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = base->tv5.vec + i; - } + if (expires >= WHEEL_TIMEOUT_CUTOFF) + expires = WHEEL_TIMEOUT_MAX; + idx = calc_index(expires, LVL_DEPTH - 1); + } + /* + * Enqueue the timer into the array bucket, mark it pending in + * the bitmap and store the index in the timer flags. 
+ */ + vec = base->vectors + idx; hlist_add_head(&timer->entry, vec); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { - /* Advance base->jiffies, if the base is empty */ - if (!base->all_timers++) - base->clk = jiffies; - __internal_add_timer(base, timer); - /* - * Update base->active_timers and base->next_timer - */ - if (!(timer->flags & TIMER_DEFERRABLE)) { - if (!base->active_timers++ || - time_before(timer->expires, base->next_timer)) - base->next_timer = timer->expires; - } /* * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. - * We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to stop its tick can not - * evaluate the timer wheel. + * to be triggered to reevaluate the timer wheel. We are + * protected against the other CPU fiddling with the timer by + * holding the timer base lock. This also makes sure that a + * CPU on the way to stop its tick can not evaluate the timer + * wheel. * * Spare the IPI for deferrable timers on idle targets though. * The next busy ticks will take care of it. Except full dynticks * require special care against races with idle_cpu(), lets deal * with that later. */ - if (base->nohz_active) { + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) { if (!(timer->flags & TIMER_DEFERRABLE) || tick_nohz_full_cpu(base->cpu)) wake_up_nohz_cpu(base->cpu); @@ -706,54 +815,87 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending) entry->next = LIST_POISON2; } -static inline void -detach_expired_timer(struct timer_list *timer, struct timer_base *base) -{ - detach_timer(timer, true); - if (!(timer->flags & TIMER_DEFERRABLE)) - base->active_timers--; - base->all_timers--; -} - static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { + unsigned idx = timer_get_idx(timer); + if (!timer_pending(timer)) return 0; + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) + __clear_bit(idx, base->pending_map); + detach_timer(timer, clear_pending); - if (!(timer->flags & TIMER_DEFERRABLE)) { - base->active_timers--; - if (timer->expires == base->next_timer) - base->next_timer = base->clk; - } - /* If this was the last timer, advance base->jiffies */ - if (!--base->all_timers) - base->clk = jiffies; return 1; } +static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) +{ + struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); + + /* + * If the timer is deferrable and nohz is active then we need to use + * the deferrable base. + */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && + (tflags & TIMER_DEFERRABLE)) + base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); + return base; +} + +static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * If the timer is deferrable and nohz is active then we need to use + * the deferrable base. 
+ */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && + (tflags & TIMER_DEFERRABLE)) + base = this_cpu_ptr(&timer_bases[BASE_DEF]); + return base; +} + +static inline struct timer_base *get_timer_base(u32 tflags) +{ + return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); +} + +static inline struct timer_base *get_target_base(struct timer_base *base, + unsigned tflags) +{ +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) + if ((tflags & TIMER_PINNED) || !base->migration_enabled) + return get_timer_this_cpu_base(tflags); + return get_timer_cpu_base(tflags, get_nohz_timer_target()); +#else + return get_timer_this_cpu_base(tflags); +#endif +} + /* - * We are using hashed locking: holding per_cpu(timer_bases).lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. + * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means + * that all timers which are tied to this base are locked, and the base itself + * is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could - * be found on ->tvX lists. + * be found in the base->vectors array. * - * When the timer's base is locked and removed from the list, the - * TIMER_MIGRATING flag is set, FIXME + * When a timer is migrating then the TIMER_MIGRATING flag is set and we need + * to wait until the migration is done. */ static struct timer_base *lock_timer_base(struct timer_list *timer, - unsigned long *flags) + unsigned long *flags) __acquires(timer->base->lock) { for (;;) { - u32 tf = timer->flags; struct timer_base *base; + u32 tf = timer->flags; if (!(tf & TIMER_MIGRATING)) { - base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK); + base = get_timer_base(tf); spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; @@ -770,6 +912,27 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) unsigned long flags; int ret = 0; + /* + * TODO: Calculate the array bucket of the timer right here w/o + * holding the base lock. This allows to check not only + * timer->expires == expires below, but also whether the timer + * ends up in the same bucket. If we really need to requeue + * the timer then we check whether base->clk have + * advanced between here and locking the timer base. If + * jiffies advanced we have to recalc the array bucket with the + * lock held. + */ + + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer_pending(timer)) { + if (timer->expires == expires) + return 1; + } + timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -781,15 +944,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) debug_activate(timer, expires); - new_base = get_target_base(base, timer->flags & TIMER_PINNED); + new_base = get_target_base(base, timer->flags); if (base != new_base) { /* - * We are trying to schedule the timer on the local CPU. + * We are trying to schedule the timer on the new base. * However we can't change timer's base while it is running, * otherwise del_timer_sync() can't detect that the timer's - * handler yet has not finished. This also guarantees that - * the timer is serialized wrt itself. + * handler yet has not finished. This also guarantees that the + * timer is serialized wrt itself. 
*/ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ @@ -828,45 +991,6 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires) } EXPORT_SYMBOL(mod_timer_pending); -/* - * Decide where to put the timer while taking the slack into account - * - * Algorithm: - * 1) calculate the maximum (absolute) time - * 2) calculate the highest bit where the expires and new max are different - * 3) use this bit to make a mask - * 4) use the bitmask to round down the maximum time, so that all last - * bits are zeros - */ -static inline -unsigned long apply_slack(struct timer_list *timer, unsigned long expires) -{ - unsigned long expires_limit, mask; - int bit; - - if (timer->slack >= 0) { - expires_limit = expires + timer->slack; - } else { - long delta = expires - jiffies; - - if (delta < 256) - return expires; - - expires_limit = expires + delta / 256; - } - mask = expires ^ expires_limit; - if (mask == 0) - return expires; - - bit = __fls(mask); - - mask = (1UL << bit) - 1; - - expires_limit = expires_limit & ~(mask); - - return expires_limit; -} - /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -889,16 +1013,6 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) */ int mod_timer(struct timer_list *timer, unsigned long expires) { - expires = apply_slack(timer, expires); - - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: - */ - if (timer_pending(timer) && timer->expires == expires) - return 1; - return __mod_timer(timer, expires, false); } EXPORT_SYMBOL(mod_timer); @@ -933,13 +1047,14 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu); - struct timer_base *base; + struct timer_base *new_base, *base; unsigned long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); + new_base = get_timer_cpu_base(timer->flags, cpu); + /* * If @timer was on a different CPU, it should be migrated with the * old base locked to prevent other operations proceeding with the @@ -1085,27 +1200,6 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static int cascade(struct timer_base *base, struct tvec *tv, int index) -{ - /* cascade all the timers from tv up one level */ - struct timer_list *timer; - struct hlist_node *tmp; - struct hlist_head tv_list; - - hlist_move_list(tv->vec + index, &tv_list); - - /* - * We are removing _all_ timers from the list, so we - * don't have to detach them individually. 
- */ - hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { - /* No accounting, while moving them */ - __internal_add_timer(base, timer); - } - - return index; -} - static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), unsigned long data) { @@ -1149,68 +1243,80 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), } } -#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) +static void expire_timers(struct timer_base *base, struct hlist_head *head) +{ + while (!hlist_empty(head)) { + struct timer_list *timer; + void (*fn)(unsigned long); + unsigned long data; + + timer = hlist_entry(head->first, struct timer_list, entry); + timer_stats_account_timer(timer); + + base->running_timer = timer; + detach_timer(timer, true); + + fn = timer->function; + data = timer->data; + + if (timer->flags & TIMER_IRQSAFE) { + spin_unlock(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock(&base->lock); + } else { + spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock_irq(&base->lock); + } + } +} + +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + unsigned long clk = base->clk; + struct hlist_head *vec; + int i, levels = 0; + unsigned int idx; + + for (i = 0; i < LVL_DEPTH; i++) { + idx = (clk & LVL_MASK) + i * LVL_SIZE; + + if (__test_and_clear_bit(idx, base->pending_map)) { + vec = base->vectors + idx; + hlist_move_list(vec, heads++); + levels++; + } + /* Is it time to look at the next level? */ + if (clk & LVL_CLK_MASK) + break; + /* Shift clock for the next level granularity */ + clk >>= LVL_CLK_SHIFT; + } + return levels; +} /** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. - * - * This function cascades all vectors and executes all expired timer - * vectors. */ static inline void __run_timers(struct timer_base *base) { - struct timer_list *timer; + struct hlist_head heads[LVL_DEPTH]; + int levels; + + if (!time_after_eq(jiffies, base->clk)) + return; spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->clk)) { - struct hlist_head work_list; - struct hlist_head *head = &work_list; - int index; - if (!base->all_timers) { - base->clk = jiffies; - break; - } - - index = base->clk & TVR_MASK; + levels = collect_expired_timers(base, heads); + base->clk++; - /* - * Cascade timers: - */ - if (!index && - (!cascade(base, &base->tv2, INDEX(0))) && - (!cascade(base, &base->tv3, INDEX(1))) && - !cascade(base, &base->tv4, INDEX(2))) - cascade(base, &base->tv5, INDEX(3)); - ++base->clk; - hlist_move_list(base->tv1.vec + index, head); - while (!hlist_empty(head)) { - void (*fn)(unsigned long); - unsigned long data; - bool irqsafe; - - timer = hlist_entry(head->first, struct timer_list, entry); - fn = timer->function; - data = timer->data; - irqsafe = timer->flags & TIMER_IRQSAFE; - - timer_stats_account_timer(timer); - - base->running_timer = timer; - detach_expired_timer(timer, base); - - if (irqsafe) { - spin_unlock(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock(&base->lock); - } else { - spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock_irq(&base->lock); - } - } + while (levels--) + expire_timers(base, heads + levels); } base->running_timer = NULL; spin_unlock_irq(&base->lock); @@ -1218,78 +1324,87 @@ static inline void __run_timers(struct timer_base *base) #ifdef CONFIG_NO_HZ_COMMON /* - * Find out when the next timer event is due to happen. 
This - * is used on S/390 to stop all activity when a CPU is idle. - * This function needs to be called with interrupts disabled. + * Find the next pending bucket of a level. Search from @offset + @clk upwards + * and if nothing there, search from start of the level (@offset) up to + * @offset + clk. + */ +static int next_pending_bucket(struct timer_base *base, unsigned offset, + unsigned clk) +{ + unsigned pos, start = offset + clk; + unsigned end = offset + LVL_SIZE; + + pos = find_next_bit(base->pending_map, end, start); + if (pos < end) + return pos - start; + + pos = find_next_bit(base->pending_map, start, offset); + return pos < start ? pos + LVL_SIZE - start : -1; +} + +/* + * Search the first expiring timer in the various clock levels. */ static unsigned long __next_timer_interrupt(struct timer_base *base) { - unsigned long clk = base->clk; - unsigned long expires = clk + NEXT_TIMER_MAX_DELTA; - int index, slot, array, found = 0; - struct timer_list *nte; - struct tvec *varray[4]; - - /* Look for timer events in tv1. */ - index = slot = clk & TVR_MASK; - do { - hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (nte->flags & TIMER_DEFERRABLE) - continue; - - found = 1; - expires = nte->expires; - /* Look at the cascade bucket(s)? */ - if (!index || slot < index) - goto cascade; - return expires; + unsigned long clk, next, adj; + unsigned lvl, offset = 0; + + spin_lock(&base->lock); + next = base->clk + NEXT_TIMER_MAX_DELTA; + clk = base->clk; + for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { + int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + + if (pos >= 0) { + unsigned long tmp = clk + (unsigned long) pos; + + tmp <<= LVL_SHIFT(lvl); + if (time_before(tmp, next)) + next = tmp; } - slot = (slot + 1) & TVR_MASK; - } while (slot != index); - -cascade: - /* Calculate the next cascade event */ - if (index) - clk += TVR_SIZE - index; - clk >>= TVR_BITS; - - /* Check tv2-tv5. */ - varray[0] = &base->tv2; - varray[1] = &base->tv3; - varray[2] = &base->tv4; - varray[3] = &base->tv5; - - for (array = 0; array < 4; array++) { - struct tvec *varp = varray[array]; - - index = slot = clk & TVN_MASK; - do { - hlist_for_each_entry(nte, varp->vec + slot, entry) { - if (nte->flags & TIMER_DEFERRABLE) - continue; - - found = 1; - if (time_before(nte->expires, expires)) - expires = nte->expires; - } - /* - * Do we still search for the first timer or are - * we looking up the cascade buckets ? - */ - if (found) { - /* Look at the cascade bucket(s)? */ - if (!index || slot < index) - break; - return expires; - } - slot = (slot + 1) & TVN_MASK; - } while (slot != index); - - if (index) - clk += TVN_SIZE - index; - clk >>= TVN_BITS; + /* + * Clock for the next level. If the current level clock lower + * bits are zero, we look at the next level as is. If not we + * need to advance it by one because that's going to be the + * next expiring bucket in that level. base->clk is the next + * expiring jiffie. So in case of: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 0 + * + * we have to look at all levels @index 0. With + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 2 + * + * LVL0 has the next expiring bucket @index 2. The upper + * levels have the next expiring bucket @index 1. 
+ * + * In case that the propagation wraps the next level the same + * rules apply: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 F 2 + * + * So after looking at LVL0 we get: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 + * 0 0 0 1 0 + * + * So no propagation from LVL1 to LVL2 because that happened + * with the add already, but then we need to propagate further + * from LVL2 to LVL3. + * + * So the simple check whether the lower bits of the current + * level are 0 or not is sufficient for all cases. + */ + adj = clk & LVL_CLK_MASK ? 1 : 0; + clk >>= LVL_CLK_SHIFT; + clk += adj; } - return expires; + spin_unlock(&base->lock); + return next; } /* @@ -1335,7 +1450,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { - struct timer_base *base = this_cpu_ptr(&timer_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); u64 expires = KTIME_MAX; unsigned long nextevt; @@ -1346,17 +1461,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (cpu_is_offline(smp_processor_id())) return expires; - spin_lock(&base->lock); - if (base->active_timers) { - if (time_before_eq(base->next_timer, base->clk)) - base->next_timer = __next_timer_interrupt(base); - nextevt = base->next_timer; - if (time_before_eq(nextevt, basej)) - expires = basem; - else - expires = basem + (nextevt - basej) * TICK_NSEC; - } - spin_unlock(&base->lock); + nextevt = __next_timer_interrupt(base); + if (time_before_eq(nextevt, basej)) + expires = basem; + else + expires = basem + (nextevt - basej) * TICK_NSEC; return cmp_next_hrtimer_event(basem, expires); } @@ -1387,10 +1496,11 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct timer_base *base = this_cpu_ptr(&timer_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); - if (time_after_eq(jiffies, base->clk)) - __run_timers(base); + __run_timers(base); + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) + __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); } /* @@ -1541,7 +1651,6 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h while (!hlist_empty(head)) { timer = hlist_entry(head->first, struct timer_list, entry); - /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); @@ -1552,35 +1661,29 @@ static void migrate_timers(int cpu) { struct timer_base *old_base; struct timer_base *new_base; - int i; + int b, i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu_ptr(&timer_bases, cpu); - new_base = get_cpu_ptr(&timer_bases); - /* - * The caller is globally serialized and nobody else - * takes two locks at once, deadlock is not possible. 
- */ - spin_lock_irq(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - - BUG_ON(old_base->running_timer); - - for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); - for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); - } - old_base->active_timers = 0; - old_base->all_timers = 0; + for (b = 0; b < NR_BASES; b++) { + old_base = per_cpu_ptr(&timer_bases[b], cpu); + new_base = get_cpu_ptr(&timer_bases[b]); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + BUG_ON(old_base->running_timer); + + for (i = 0; i < WHEEL_SIZE; i++) + migrate_timer_list(new_base, old_base->vectors + i); - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); - put_cpu_ptr(&timer_bases); + spin_unlock(&old_base->lock); + spin_unlock_irq(&new_base->lock); + put_cpu_ptr(&timer_bases); + } } static int timer_cpu_notify(struct notifier_block *self, @@ -1608,13 +1711,15 @@ static inline void timer_register_cpu_notifier(void) { } static void __init init_timer_cpu(int cpu) { - struct timer_base *base = per_cpu_ptr(&timer_bases, cpu); - - base->cpu = cpu; - spin_lock_init(&base->lock); + struct timer_base *base; + int i; - base->clk = jiffies; - base->next_timer = base->clk; + for (i = 0; i < NR_BASES; i++) { + base = per_cpu_ptr(&timer_bases[i], cpu); + base->cpu = cpu; + spin_lock_init(&base->lock); + base->clk = jiffies; + } } static void __init init_timer_cpus(void) -- cgit v1.2.3 From 53bf837b78d155b8e1110b3c25b4d0d6391b8ff3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:31 +0000 Subject: timers: Remove set_timer_slack() leftovers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We now have implicit batching in the timer wheel. The slack API is no longer used, so remove it. Signed-off-by: Thomas Gleixner Cc: Alan Stern Cc: Andrew F. Davis Cc: Arjan van de Ven Cc: Chris Mason Cc: David S. Miller Cc: David Woodhouse Cc: Dmitry Eremin-Solenikov Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Greg Kroah-Hartman Cc: Jaehoon Chung Cc: Jens Axboe Cc: John Stultz Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Mathias Nyman Cc: Pali Rohár Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sebastian Reichel Cc: Ulf Hansson Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: linux-usb@vger.kernel.org Cc: netdev@vger.kernel.org Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.189813118@linutronix.de Signed-off-by: Ingo Molnar --- block/genhd.c | 5 ----- drivers/mmc/host/jz4740_mmc.c | 2 -- drivers/power/bq27xxx_battery.c | 5 +---- drivers/usb/host/ohci-hcd.c | 1 - drivers/usb/host/xhci.c | 2 -- include/linux/timer.h | 4 ---- kernel/time/timer.c | 19 ------------------- lib/random32.c | 1 - 8 files changed, 1 insertion(+), 38 deletions(-) (limited to 'kernel/time') diff --git a/block/genhd.c b/block/genhd.c index 9f42526b4d62..f06d7f3b075b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1523,12 +1523,7 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now) if (--ev->block) goto out_unlock; - /* - * Not exactly a latency critical operation, set poll timer - * slack to 25% and kick event check. - */ intv = disk_events_poll_jiffies(disk); - set_timer_slack(&ev->dwork.timer, intv / 4); if (check_now) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, 0); diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index 03ddf0ecf402..684087db170b 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -1068,8 +1068,6 @@ static int jz4740_mmc_probe(struct platform_device* pdev) jz4740_mmc_clock_disable(host); setup_timer(&host->timeout_timer, jz4740_mmc_timeout, (unsigned long)host); - /* It is not important when it times out, it just needs to timeout. */ - set_timer_slack(&host->timeout_timer, HZ); host->use_dma = true; if (host->use_dma && jz4740_mmc_acquire_dma_channels(host) != 0) diff --git a/drivers/power/bq27xxx_battery.c b/drivers/power/bq27xxx_battery.c index 45f6ebf88df6..e90b3f307e0f 100644 --- a/drivers/power/bq27xxx_battery.c +++ b/drivers/power/bq27xxx_battery.c @@ -735,11 +735,8 @@ static void bq27xxx_battery_poll(struct work_struct *work) bq27xxx_battery_update(di); - if (poll_interval > 0) { - /* The timer does not have to be accurate. 
*/ - set_timer_slack(&di->work.timer, poll_interval * HZ / 4); + if (poll_interval > 0) schedule_delayed_work(&di->work, poll_interval * HZ); - } } /* diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index 0449235d4f22..1700908b84ef 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -500,7 +500,6 @@ static int ohci_init (struct ohci_hcd *ohci) setup_timer(&ohci->io_watchdog, io_watchdog_func, (unsigned long) ohci); - set_timer_slack(&ohci->io_watchdog, msecs_to_jiffies(20)); ohci->hcca = dma_alloc_coherent (hcd->self.controller, sizeof(*ohci->hcca), &ohci->hcca_dma, GFP_KERNEL); diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index f2f9518c53ab..a986fe763d87 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -490,8 +490,6 @@ static void compliance_mode_recovery_timer_init(struct xhci_hcd *xhci) xhci->comp_mode_recovery_timer.expires = jiffies + msecs_to_jiffies(COMP_MODE_RCVRY_MSECS); - set_timer_slack(&xhci->comp_mode_recovery_timer, - msecs_to_jiffies(COMP_MODE_RCVRY_MSECS)); add_timer(&xhci->comp_mode_recovery_timer); xhci_dbg_trace(xhci, trace_xhci_dbg_quirks, "Compliance mode recovery timer initialized"); diff --git a/include/linux/timer.h b/include/linux/timer.h index 5869ab9848fe..4419506b564e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -19,7 +19,6 @@ struct timer_list { void (*function)(unsigned long); unsigned long data; u32 flags; - int slack; #ifdef CONFIG_TIMER_STATS int start_pid; @@ -73,7 +72,6 @@ struct timer_list { .expires = (_expires), \ .data = (_data), \ .flags = (_flags), \ - .slack = -1, \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ } @@ -193,8 +191,6 @@ extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); -extern void set_timer_slack(struct timer_list *time, int slack_hz); - /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 86e95b72665d..a83e23d0bc25 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -447,24 +447,6 @@ unsigned long round_jiffies_up_relative(unsigned long j) } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); -/** - * set_timer_slack - set the allowed slack for a timer - * @timer: the timer to be modified - * @slack_hz: the amount of time (in jiffies) allowed for rounding - * - * Set the amount of time, in jiffies, that a certain timer has - * in terms of slack. By setting this value, the timer subsystem - * will schedule the actual timer somewhere between - * the time mod_timer() asks for, and that time plus the slack. - * - * By setting the slack to -1, a percentage of the delay is used - * instead. 
- */ -void set_timer_slack(struct timer_list *timer, int slack_hz) -{ - timer->slack = slack_hz; -} -EXPORT_SYMBOL_GPL(set_timer_slack); static inline unsigned int timer_get_idx(struct timer_list *timer) { @@ -775,7 +757,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, { timer->entry.pprev = NULL; timer->flags = flags | raw_smp_processor_id(); - timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; timer->start_pid = -1; diff --git a/lib/random32.c b/lib/random32.c index 510d1ce7d4d2..69ed593aab07 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -233,7 +233,6 @@ static void __prandom_timer(unsigned long dontcare) static void __init __prandom_start_seed_timer(void) { - set_timer_slack(&seed_timer, HZ); seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC); add_timer(&seed_timer); } -- cgit v1.2.3 From 73420fea80c6c376d91a69defe64013baa0d7e95 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:33 +0000 Subject: timers: Move __run_timers() function Move __run_timers() below __next_timer_interrupt() and next_pending_bucket() in preparation for __run_timers() NOHZ optimization. No functional change. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.271872665@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a83e23d0bc25..c16c48de01c5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1277,32 +1277,6 @@ static int collect_expired_timers(struct timer_base *base, return levels; } -/** - * __run_timers - run all expired timers (if any) on this CPU. - * @base: the timer vector to be processed. - */ -static inline void __run_timers(struct timer_base *base) -{ - struct hlist_head heads[LVL_DEPTH]; - int levels; - - if (!time_after_eq(jiffies, base->clk)) - return; - - spin_lock_irq(&base->lock); - - while (time_after_eq(jiffies, base->clk)) { - - levels = collect_expired_timers(base, heads); - base->clk++; - - while (levels--) - expire_timers(base, heads + levels); - } - base->running_timer = NULL; - spin_unlock_irq(&base->lock); -} - #ifdef CONFIG_NO_HZ_COMMON /* * Find the next pending bucket of a level. Search from @offset + @clk upwards @@ -1472,6 +1446,32 @@ void update_process_times(int user_tick) run_posix_cpu_timers(p); } +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + */ +static inline void __run_timers(struct timer_base *base) +{ + struct hlist_head heads[LVL_DEPTH]; + int levels; + + if (!time_after_eq(jiffies, base->clk)) + return; + + spin_lock_irq(&base->lock); + + while (time_after_eq(jiffies, base->clk)) { + + levels = collect_expired_timers(base, heads); + base->clk++; + + while (levels--) + expire_timers(base, heads + levels); + } + base->running_timer = NULL; + spin_unlock_irq(&base->lock); +} + /* * This function runs timers and the timer-tq in bottom half context. 
*/ -- cgit v1.2.3 From 236968383cf5cd48835ff0d8a265e299e220d140 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:34 +0000 Subject: timers: Optimize collect_expired_timers() for NOHZ After a NOHZ idle sleep the timer wheel must be forwarded to current jiffies. There might be expired timers so the current code loops and checks the expired buckets for timers. This can take quite some time for long NOHZ idle periods. The pending bitmask in the timer base allows us to do a quick search for the next expiring timer and therefore a fast forward of the base time which prevents pointless long lasting loops. For a 3 seconds idle sleep this reduces the catchup time from ~1ms to 5us. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.351296290@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index c16c48de01c5..658051c97a3c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1252,8 +1252,8 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) } } -static int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) +static int __collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) { unsigned long clk = base->clk; struct hlist_head *vec; @@ -1279,9 +1279,9 @@ static int collect_expired_timers(struct timer_base *base, #ifdef CONFIG_NO_HZ_COMMON /* - * Find the next pending bucket of a level. Search from @offset + @clk upwards - * and if nothing there, search from start of the level (@offset) up to - * @offset + clk. + * Find the next pending bucket of a level. Search from level start (@offset) + * + @clk upwards and if nothing there, search from start of the level + * (@offset) up to @offset + clk. */ static int next_pending_bucket(struct timer_base *base, unsigned offset, unsigned clk) @@ -1298,14 +1298,14 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset, } /* - * Search the first expiring timer in the various clock levels. + * Search the first expiring timer in the various clock levels. Caller must + * hold base->lock. */ static unsigned long __next_timer_interrupt(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; - spin_lock(&base->lock); next = base->clk + NEXT_TIMER_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { @@ -1358,7 +1358,6 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk >>= LVL_CLK_SHIFT; clk += adj; } - spin_unlock(&base->lock); return next; } @@ -1416,7 +1415,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (cpu_is_offline(smp_processor_id())) return expires; + spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); + spin_unlock(&base->lock); + if (time_before_eq(nextevt, basej)) expires = basem; else @@ -1424,6 +1426,37 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) return cmp_next_hrtimer_event(basem, expires); } + +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + /* + * NOHZ optimization. 
After a long idle sleep we need to forward the + * base to current jiffies. Avoid a loop by searching the bitfield for + * the next expiring timer. + */ + if ((long)(jiffies - base->clk) > 2) { + unsigned long next = __next_timer_interrupt(base); + + /* + * If the next timer is ahead of time forward to current + * jiffies, otherwise forward to the next expiry time. + */ + if (time_after(next, jiffies)) { + /* The call site will increment clock! */ + base->clk = jiffies - 1; + return 0; + } + base->clk = next; + } + return __collect_expired_timers(base, heads); +} +#else +static inline int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + return __collect_expired_timers(base, heads); +} #endif /* -- cgit v1.2.3 From ff00673292bd42a3688b33de47252a6a3c3f424c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:35 +0000 Subject: timers/nohz: Remove pointless tick_nohz_kick_tick() function This was a failed attempt to optimize the timer expiry in idle, which was disabled and never revisited. Remove the cruft. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.431073782@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 536ada80f6dd..69abc7bfe80f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1092,35 +1092,6 @@ static void tick_nohz_switch_to_nohz(void) tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } -/* - * When NOHZ is enabled and the tick is stopped, we need to kick the - * tick timer from irq_enter() so that the jiffies update is kept - * alive during long running softirqs. That's ugly as hell, but - * correctness is key even if we need to fix the offending softirq in - * the first place. - * - * Note, this is different to tick_nohz_restart. We just kick the - * timer and do not touch the other magic bits which need to be done - * when idle is left. - */ -static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) -{ -#if 0 - /* Switch back to 2.6.27 behaviour */ - ktime_t delta; - - /* - * Do not touch the tick device, when the next expiry is either - * already reached or less/equal than the tick period. - */ - delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); - if (delta.tv64 <= tick_period.tv64) - return; - - tick_nohz_restart(ts, now); -#endif -} - static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); @@ -1131,10 +1102,8 @@ static inline void tick_nohz_irq_enter(void) now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) { + if (ts->tick_stopped) tick_nohz_update_jiffies(now); - tick_nohz_kick_tick(ts, now); - } } #else -- cgit v1.2.3 From a683f390b93f4d1292f849fc48d28e322046120f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:36 +0000 Subject: timers: Forward the wheel clock whenever possible The wheel clock is stale when a CPU goes into a long idle sleep. This has the side effect that timers which are queued end up in the outer wheel levels. That results in coarser granularity. 
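The effect can be seen directly from the level selection in __internal_add_timer() above, which picks the wheel level from delta = expires - base->clk. The following standalone user-space sketch is for illustration only: the LVL_* constants are copied from the earlier patch, while level_for_delta() and the sample numbers are made up for the example, and the negative-delta and WHEEL_TIMEOUT_CUTOFF cases are left out.

/*
 * Illustration only: sketch of the level selection used by
 * __internal_add_timer(). Not kernel code.
 */
#include <stdio.h>

#define LVL_CLK_SHIFT	3
#define LVL_BITS	6
#define LVL_SIZE	(1UL << LVL_BITS)
#define LVL_GRAN(n)	(1UL << ((n) * LVL_CLK_SHIFT))
#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
#define LVL_DEPTH	9

/* Pick the wheel level for a timer expiring @delta jiffies after base->clk */
static unsigned int level_for_delta(unsigned long delta)
{
	unsigned int lvl;

	for (lvl = 0; lvl < LVL_DEPTH - 1; lvl++) {
		if (delta < LVL_START(lvl + 1))
			return lvl;
	}
	return LVL_DEPTH - 1;
}

int main(void)
{
	unsigned long timeout = 40;	/* expires 40 jiffies after 'now' */
	unsigned long idle = 1000;	/* jiffies by which base->clk is stale */
	unsigned int fresh = level_for_delta(timeout);
	unsigned int stale = level_for_delta(timeout + idle);

	printf("fresh clk: delta %4lu -> level %u, granularity %lu jiffies\n",
	       timeout, fresh, LVL_GRAN(fresh));
	printf("stale clk: delta %4lu -> level %u, granularity %lu jiffies\n",
	       timeout + idle, stale, LVL_GRAN(stale));
	return 0;
}

With a fresh base clock the 40-jiffy timeout selects level 0 (1-jiffy granularity); with a base clock that lags by 1000 jiffies, the inflated delta selects level 2, whose granularity is 64 jiffies.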
To solve this, we keep track of the idle state and forward the wheel clock whenever possible. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.512039360@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 1 + kernel/time/tick-sched.c | 12 +++++ kernel/time/timer.c | 128 ++++++++++++++++++++++++++++++++++++-------- 3 files changed, 120 insertions(+), 21 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 966a5a6fdd0a..f738251000fe 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { } DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); +void timer_clear_idle(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69abc7bfe80f..5d81f9aa30d2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { tick.tv64 = 0; + + /* + * Tell the timer code that the base is not idle, i.e. undo + * the effect of get_next_timer_interrupt(): + */ + timer_clear_idle(); /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. @@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) tick_do_update_jiffies64(now); cpu_load_update_nohz_stop(); + /* + * Clear the timer idle flag, so we avoid IPIs on remote queueing and + * the clock forward checks in the enqueue path: + */ + timer_clear_idle(); + calc_load_exit_idle(); touch_softlockup_watchdog_sched(); /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 658051c97a3c..9339d71ee998 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -196,9 +196,11 @@ struct timer_base { spinlock_t lock; struct timer_list *running_timer; unsigned long clk; + unsigned long next_expiry; unsigned int cpu; bool migration_enabled; bool nohz_active; + bool is_idle; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -519,24 +521,37 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer { __internal_add_timer(base, timer); + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + return; + /* - * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. We are - * protected against the other CPU fiddling with the timer by - * holding the timer base lock. This also makes sure that a - * CPU on the way to stop its tick can not evaluate the timer - * wheel. - * - * Spare the IPI for deferrable timers on idle targets though. - * The next busy ticks will take care of it. Except full dynticks - * require special care against races with idle_cpu(), lets deal - * with that later. + * TODO: This wants some optimizing similar to the code below, but we + * will do that when we switch from push to pull for deferrable timers. 
*/ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) { - if (!(timer->flags & TIMER_DEFERRABLE) || - tick_nohz_full_cpu(base->cpu)) + if (timer->flags & TIMER_DEFERRABLE) { + if (tick_nohz_full_cpu(base->cpu)) wake_up_nohz_cpu(base->cpu); + return; } + + /* + * We might have to IPI the remote CPU if the base is idle and the + * timer is not deferrable. If the other CPU is on the way to idle + * then it can't set base->is_idle as we hold the base lock: + */ + if (!base->is_idle) + return; + + /* Check whether this is the new first expiring timer: */ + if (time_after_eq(timer->expires, base->next_expiry)) + return; + + /* + * Set the next expiry time and kick the CPU so it can reevaluate the + * wheel: + */ + base->next_expiry = timer->expires; + wake_up_nohz_cpu(base->cpu); } #ifdef CONFIG_TIMER_STATS @@ -844,10 +859,11 @@ static inline struct timer_base *get_timer_base(u32 tflags) return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } -static inline struct timer_base *get_target_base(struct timer_base *base, - unsigned tflags) +#ifdef CONFIG_NO_HZ_COMMON +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) { -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP if ((tflags & TIMER_PINNED) || !base->migration_enabled) return get_timer_this_cpu_base(tflags); return get_timer_cpu_base(tflags, get_nohz_timer_target()); @@ -856,6 +872,43 @@ static inline struct timer_base *get_target_base(struct timer_base *base, #endif } +static inline void forward_timer_base(struct timer_base *base) +{ + /* + * We only forward the base when it's idle and we have a delta between + * base clock and jiffies. + */ + if (!base->is_idle || (long) (jiffies - base->clk) < 2) + return; + + /* + * If the next expiry value is > jiffies, then we fast forward to + * jiffies otherwise we forward to the next expiry value. + */ + if (time_after(base->next_expiry, jiffies)) + base->clk = jiffies; + else + base->clk = base->next_expiry; +} +#else +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) +{ + return get_timer_this_cpu_base(tflags); +} + +static inline void forward_timer_base(struct timer_base *base) { } +#endif + +static inline struct timer_base * +get_target_base(struct timer_base *base, unsigned tflags) +{ + struct timer_base *target = __get_target_base(base, tflags); + + forward_timer_base(target); + return target; +} + /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means * that all timers which are tied to this base are locked, and the base itself @@ -1417,16 +1470,49 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); - spin_unlock(&base->lock); + base->next_expiry = nextevt; + /* + * We have a fresh next event. 
Check whether we can forward the base: + */ + if (time_after(nextevt, jiffies)) + base->clk = jiffies; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; - if (time_before_eq(nextevt, basej)) + if (time_before_eq(nextevt, basej)) { expires = basem; - else + base->is_idle = false; + } else { expires = basem + (nextevt - basej) * TICK_NSEC; + /* + * If we expect to sleep more than a tick, mark the base idle: + */ + if ((expires - basem) > TICK_NSEC) + base->is_idle = true; + } + spin_unlock(&base->lock); return cmp_next_hrtimer_event(basem, expires); } +/** + * timer_clear_idle - Clear the idle state of the timer base + * + * Called with interrupts disabled + */ +void timer_clear_idle(void) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * We do this unlocked. The worst outcome is a remote enqueue sending + * a pointless IPI, but taking the lock would just make the window for + * sending the IPI a few instructions smaller for the cost of taking + * the lock in the exit from idle path. + */ + base->is_idle = false; +} + static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { @@ -1440,7 +1526,7 @@ static int collect_expired_timers(struct timer_base *base, /* * If the next timer is ahead of time forward to current - * jiffies, otherwise forward to the next expiry time. + * jiffies, otherwise forward to the next expiry time: */ if (time_after(next, jiffies)) { /* The call site will increment clock! */ -- cgit v1.2.3 From 4e85876a9d2a977b4a07389da8c07edf76d10825 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:37 +0000 Subject: timers: Only wake softirq if necessary With the wheel forwading in place and with the HZ=1000 4ms folding we can avoid running the softirq at all. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.607650550@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 9339d71ee998..8d830f1f6a6a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1608,7 +1608,18 @@ static void run_timer_softirq(struct softirq_action *h) */ void run_local_timers(void) { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + hrtimer_run_queues(); + /* Raise the softirq only if required. */ + if (time_before(jiffies, base->clk)) { + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + return; + /* CPU is awake, so check the deferrable base. */ + base++; + if (time_before(jiffies, base->clk)) + return; + } raise_softirq(TIMER_SOFTIRQ); } -- cgit v1.2.3 From ffdf047728f8f93df896b58049c7513856027141 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:39 +0000 Subject: timers: Split out index calculation For further optimizations we need to seperate index calculation from queueing. No functional change. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.691159619@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8d830f1f6a6a..8d7c23e55c85 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -471,12 +471,9 @@ static inline unsigned calc_index(unsigned expires, unsigned lvl) return LVL_OFFS(lvl) + (expires & LVL_MASK); } -static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer) +static int calc_wheel_index(unsigned long expires, unsigned long clk) { - unsigned long expires = timer->expires; - unsigned long delta = expires - base->clk; - struct hlist_head *vec; + unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { @@ -496,7 +493,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer) } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { idx = calc_index(expires, 7); } else if ((long) delta < 0) { - idx = base->clk & LVL_MASK; + idx = clk & LVL_MASK; } else { /* * Force expire obscene large timeouts to expire at the @@ -507,20 +504,33 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer) idx = calc_index(expires, LVL_DEPTH - 1); } - /* - * Enqueue the timer into the array bucket, mark it pending in - * the bitmap and store the index in the timer flags. - */ - vec = base->vectors + idx; - hlist_add_head(&timer->entry, vec); + return idx; +} + +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap and store the index in the timer flags. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx) +{ + hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); } -static void internal_add_timer(struct timer_base *base, struct timer_list *timer) +static void +__internal_add_timer(struct timer_base *base, struct timer_list *timer) { - __internal_add_timer(base, timer); + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk); + enqueue_timer(base, timer, idx); +} +static void +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) +{ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) return; @@ -551,7 +561,14 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer * wheel: */ base->next_expiry = timer->expires; - wake_up_nohz_cpu(base->cpu); + wake_up_nohz_cpu(base->cpu); +} + +static void +internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ + __internal_add_timer(base, timer); + trigger_dyntick_cpu(base, timer); } #ifdef CONFIG_TIMER_STATS -- cgit v1.2.3 From f00c0afdfa625165a609513bc74164d56752ec3e Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:40 +0000 Subject: timers: Implement optimization for same expiry time in mod_timer() The existing optimization for same expiry time in mod_timer() checks whether the timer expiry time is the same as the new requested expiry time. In the old timer wheel implementation this does not take the slack batching into account, neither does the new implementation evaluate whether the new expiry time will requeue the timer to the same bucket. 
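In the coarser levels many distinct expiry times collapse into one bucket, which is what makes such a check pay off. As an illustration, here is a standalone user-space sketch: calc_index() and the LVL_* constants are copied from the patches above, while bucket_for() is a simplified stand-in for the kernel's calc_wheel_index() (no negative-delta or cutoff handling) and the clk/expiry numbers are made up.

/*
 * Illustration only: nearby expiry times collapse into the same wheel
 * bucket in the coarser levels. Not kernel code.
 */
#include <stdio.h>

#define LVL_CLK_SHIFT	3
#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n))
#define LVL_BITS	6
#define LVL_SIZE	(1UL << LVL_BITS)
#define LVL_MASK	(LVL_SIZE - 1)
#define LVL_OFFS(n)	((n) * LVL_SIZE)
#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
#define LVL_DEPTH	9

static unsigned int calc_index(unsigned long expires, unsigned int lvl)
{
	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
	return LVL_OFFS(lvl) + (expires & LVL_MASK);
}

/* Simplified counterpart of the kernel's calc_wheel_index() */
static unsigned int bucket_for(unsigned long expires, unsigned long clk)
{
	unsigned long delta = expires - clk;
	unsigned int lvl;

	for (lvl = 0; lvl < LVL_DEPTH - 1; lvl++) {
		if (delta < LVL_START(lvl + 1))
			return calc_index(expires, lvl);
	}
	return calc_index(expires, LVL_DEPTH - 1);
}

int main(void)
{
	unsigned long clk = 100000;	/* snapshot of base->clk */

	/* Two mod_timer() calls ~30 jiffies apart, both ~10000 jiffies out */
	printf("expires %lu -> bucket %u\n", clk + 10000, bucket_for(clk + 10000, clk));
	printf("expires %lu -> bucket %u\n", clk + 10030, bucket_for(clk + 10030, clk));
	return 0;
}

Both calls land in bucket 215 of level 3, because the 30-jiffy difference is far below that level's 512-jiffy granularity, so the second mod_timer() call would not need to requeue anything.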
To optimize that, we can calculate the resulting bucket and check whether it is the same as the bucket of the already pending timer. This calculation happens outside the base lock held region. If the resulting bucket is the same, we can avoid taking the base lock and requeueing the timer. If the timer needs to be requeued then we have to check under the base lock whether the base time has changed between the lockless calculation and taking the lock. If it has changed, we need to recalculate under the lock. This optimization takes effect for timers which are enqueued into the less granular wheel levels (1 and above). With a simple test case the functionality has been verified:
            Before      After
Match:       5.5%       86.6%
Requeue:    94.5%       13.4%
Recalc:                <0.01%
In the non-optimized case the timer is requeued in 94.5% of the cases. With the index optimization in place the requeue rate drops to 13.4%. The case where the lockless index calculation has to be redone is less than 0.01%. With a real world test case (networking) we observed the following changes:
            Before      After
Match:      97.8%       99.7%
Requeue:     2.2%        0.3%
Recalc:               <0.001%
That means two percent fewer lock/requeue/unlock operations in one of the hot path use cases of timers. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.778527749@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 51 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 16 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8d7c23e55c85..8f29abeb8c4d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -960,28 +960,36 @@ static inline int __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { struct timer_base *base, *new_base; - unsigned long flags; + unsigned int idx = UINT_MAX; + unsigned long clk = 0, flags; int ret = 0; /* - * TODO: Calculate the array bucket of the timer right here w/o - * holding the base lock. This allows to check not only - * timer->expires == expires below, but also whether the timer - * ends up in the same bucket. If we really need to requeue - * the timer then we check whether base->clk have - * advanced between here and locking the timer base. If - * jiffies advanced we have to recalc the array bucket with the - * lock held. - */ - - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: + * This is a common optimization triggered by the networking code - if + * the timer is re-modified to have the same timeout or ends up in the + * same array bucket then just return: */ if (timer_pending(timer)) { if (timer->expires == expires) return 1; + /* + * Take the current timer_jiffies of base, but without holding + * the lock! + */ + base = get_timer_base(timer->flags); + clk = base->clk; + + idx = calc_wheel_index(expires, clk); + + /* + * Retrieve and compare the array index of the pending + * timer. If it matches set the expiry to the new value so a + * subsequent call will exit in the expires check above.
+ */ + if (idx == timer_get_idx(timer)) { + timer->expires = expires; + return 1; + } } timer_stats_timer_set_start_info(timer); @@ -1018,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } timer->expires = expires; - internal_add_timer(base, timer); + /* + * If 'idx' was calculated above and the base time did not advance + * between calculating 'idx' and taking the lock, only enqueue_timer() + * and trigger_dyntick_cpu() is required. Otherwise we need to + * (re)calculate the wheel index via internal_add_timer(). + */ + if (idx != UINT_MAX && clk == base->clk) { + enqueue_timer(base, timer, idx); + trigger_dyntick_cpu(base, timer); + } else { + internal_add_timer(base, timer); + } out_unlock: spin_unlock_irqrestore(&base->lock, flags); -- cgit v1.2.3 From 775be506266a860f141f6b848c92c316c602a94f Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 17 Jun 2016 16:56:14 +0100 Subject: clockevents: Make clockevents_subsys static The clockevents_subsys struct is used for sysfs support and is not declared or used outside the file it is defined in. Fix the following warning by making it static: kernel/time/clockevents.c:648:17: warning: symbol 'clockevents_subsys' was not declared. Should it be static? Signed-off-by: Ben Dooks Cc: linux-kernel@lists.codethink.co.uk Link: http://lkml.kernel.org/r/1466178974-7105-1-git-send-email-ben.dooks@codethink.co.uk Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a9b76a40319e..2c5bc77c0bb0 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu) #endif #ifdef CONFIG_SYSFS -struct bus_type clockevents_subsys = { +static struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; -- cgit v1.2.3 From 1f3b0f8243cb934307f59bd4d8e43b868e61d4d9 Mon Sep 17 00:00:00 2001 From: Gaurav Jindal Date: Thu, 14 Jul 2016 12:04:20 +0000 Subject: tick/nohz: Optimize nohz idle enter tick_nohz_start_idle is called before checking whether the idle tick can be stopped. If the tick cannot be stopped, calling tick_nohz_start_idle() is pointless and just wasting CPU cycles. Only invoke tick_nohz_start_idle() when can_stop_idle_tick() returns true. A short one minute observation of the effect on ARM64 shows a reduction of calls by 1.5% thus optimizing the idle entry sequence. [tglx: Massaged changelog ] Co-developed-by: Sanjeev Yadav Signed-off-by: Gaurav Jindal Link: http://lkml.kernel.org/r/20160714120416.GB21099@gaurav.jindal@spreadtrum.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2ec7c00228f3..204fdc86863d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -908,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ktime_t now, expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(ts); - if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; + now = tick_nohz_start_idle(ts); ts->idle_calls++; expires = tick_nohz_stop_sched_tick(ts, now, cpu); -- cgit v1.2.3
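Looking back at the __mod_timer() change above, the underlying pattern (compute a value optimistically without holding the lock, then revalidate the input under the lock and recompute only if it moved) is worth spelling out. The sketch below is a plain userspace illustration under assumed names; the struct, its fields and the index math are invented for the example and are not the kernel's.

#include <pthread.h>
#include <stdio.h>

/*
 * Illustrative only: a tiny "wheel" whose clock other threads may advance.
 * A real implementation would need an atomic/READ_ONCE-style access for
 * the lockless read of ->clk.
 */
struct wheel {
	pthread_mutex_t lock;
	unsigned long clk;
	unsigned long last_idx;
};

static unsigned long calc_idx(unsigned long expires, unsigned long clk)
{
	return (expires - clk) & 63;	/* stand-in for the real index math */
}

static void mod_sketch(struct wheel *w, unsigned long expires)
{
	/* Optimistic snapshot and calculation outside the lock. */
	unsigned long clk = w->clk;
	unsigned long idx = calc_idx(expires, clk);

	pthread_mutex_lock(&w->lock);
	if (clk != w->clk) {
		/* The clock advanced in between: redo the math under the lock. */
		idx = calc_idx(expires, w->clk);
	}
	w->last_idx = idx;
	pthread_mutex_unlock(&w->lock);
}

int main(void)
{
	struct wheel w = { .clk = 1000 };

	pthread_mutex_init(&w.lock, NULL);
	mod_sketch(&w, 1042);
	printf("enqueued at index %lu\n", w.last_idx);
	pthread_mutex_destroy(&w.lock);
	return 0;
}

This mirrors the shape of the patch: idx is derived from a lockless snapshot of base->clk, and the locked internal_add_timer() path is only taken when the base clock advanced between the calculation and acquiring the lock.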