From 71c696b1d0310da3ab8033d743282959bd49d28b Mon Sep 17 00:00:00 2001
From: Phil Carmody <ext-phil.2.carmody@nokia.com>
Date: Tue, 22 Mar 2011 16:34:12 -0700
Subject: calibrate: extract fall-back calculation into own helper

The motivation for this patch series is that currently our OMAP calibrates
itself using the trial-and-error binary chop fallback that some other
architectures no longer need to perform.  This is a lengthy process,
taking 0.2s in an environment where boot time is of great interest.

Patch 2/4 has two optimisations.  Firstly, it replaces the initial
repeated- doubling to find the relevant power of 2 with a tight loop that
just does as much as it can in a jiffy.  Secondly, it doesn't binary chop
over an entire power of 2 range, it choses a much smaller range based on
how much it squeezed in, and failed to squeeze in, during the first stage.
 Both are significant optimisations, and bring our calibration down from
23 jiffies to 5, and, in the process, often arrive at a more accurate lpj
value.

The 'bands' and 'sub-logarithmic' growth may look over-engineered, but
they only cost a small level of inaccuracy in the initial guess (for all
architectures) in order to avoid the very large inaccuracies that appeared
during testing (on x86_64 architectures, and presumably others with less
metronomic operation).  Note that due to the existence of the TSC and
other timers, the x86_64 will not typically use this fallback routine, but
I wanted to code defensively, able to cope with all kinds of processor
behaviours and kernel command line options.

Patch 3/4 is an additional trap for the nightmare scenario where the
initial estimate is very inaccurate, possibly due to things like SMIs.
It simply retries with a larger bound.

Stephen said:

I tried this patch set out on an MSM7630.
:
: Before:
:
: Calibrating delay loop... 681.57 BogoMIPS (lpj=3407872)
:
: After:
:
: Calibrating delay loop... 680.75 BogoMIPS (lpj=3403776)
:
: But the really good news is calibration time dropped from ~247ms to ~56ms.
:  Sadly we won't be able to benefit from this should my udelay patches make
: it into ARM because we would be using calibrate_delay_direct() instead (at
: least on machines who choose to).  Can we somehow reapply the logic behind
: this to calibrate_delay_direct()?  That would be even better, but this is
: definitely a boot time improvement.
:
: Or maybe we could just replace calibrate_delay_direct() with this fallback
: calculation?  If __delay() is a thin wrapper around read_current_timer()
: it should work just as well (plus patch 3 makes it handle SMIs).  I'll try
: that out.

This patch:

... so that it can be modified more clinically.

This is almost entirely cosmetic. The only change to the operation
is that the global variable is only set once after the estimation is
completed, rather than taking on all the intermediate values. However,
there are no readers of that variable, so this change is unimportant.

Signed-off-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Tested-by: Stephen Boyd <sboyd@codeaurora.org>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/calibrate.c | 73 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 33 deletions(-)

(limited to 'init/calibrate.c')

diff --git a/init/calibrate.c b/init/calibrate.c
index 24fe022c55f9..b71643a7acae 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -119,10 +119,47 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
  */
 #define LPS_PREC 8
 
-void __cpuinit calibrate_delay(void)
+static unsigned long __cpuinit calibrate_delay_converge(void)
 {
-	unsigned long ticks, loopbit;
+	unsigned long lpj, ticks, loopbit;
 	int lps_precision = LPS_PREC;
+
+	lpj = (1<<12);
+	while ((lpj <<= 1) != 0) {
+		/* wait for "start of" clock tick */
+		ticks = jiffies;
+		while (ticks == jiffies)
+			/* nothing */;
+		/* Go .. */
+		ticks = jiffies;
+		__delay(lpj);
+		ticks = jiffies - ticks;
+		if (ticks)
+			break;
+	}
+
+	/*
+	 * Do a binary approximation to get lpj set to
+	 * equal one clock (up to lps_precision bits)
+	 */
+	lpj >>= 1;
+	loopbit = lpj;
+	while (lps_precision-- && (loopbit >>= 1)) {
+		lpj |= loopbit;
+		ticks = jiffies;
+		while (ticks == jiffies)
+			/* nothing */;
+		ticks = jiffies;
+		__delay(lpj);
+		if (jiffies != ticks)	/* longer than 1 tick */
+			lpj &= ~loopbit;
+	}
+
+	return lpj;
+}
+
+void __cpuinit calibrate_delay(void)
+{
 	static bool printed;
 
 	if (preset_lpj) {
@@ -139,39 +176,9 @@ void __cpuinit calibrate_delay(void)
 			pr_info("Calibrating delay using timer "
 				"specific routine.. ");
 	} else {
-		loops_per_jiffy = (1<<12);
-
 		if (!printed)
 			pr_info("Calibrating delay loop... ");
-		while ((loops_per_jiffy <<= 1) != 0) {
-			/* wait for "start of" clock tick */
-			ticks = jiffies;
-			while (ticks == jiffies)
-				/* nothing */;
-			/* Go .. */
-			ticks = jiffies;
-			__delay(loops_per_jiffy);
-			ticks = jiffies - ticks;
-			if (ticks)
-				break;
-		}
-
-		/*
-		 * Do a binary approximation to get loops_per_jiffy set to
-		 * equal one clock (up to lps_precision bits)
-		 */
-		loops_per_jiffy >>= 1;
-		loopbit = loops_per_jiffy;
-		while (lps_precision-- && (loopbit >>= 1)) {
-			loops_per_jiffy |= loopbit;
-			ticks = jiffies;
-			while (ticks == jiffies)
-				/* nothing */;
-			ticks = jiffies;
-			__delay(loops_per_jiffy);
-			if (jiffies != ticks)	/* longer than 1 tick */
-				loops_per_jiffy &= ~loopbit;
-		}
+		loops_per_jiffy = calibrate_delay_converge();
 	}
 	if (!printed)
 		pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n",
-- 
cgit v1.2.3


From 191e56880a6a638ce931859317f37deb084b6433 Mon Sep 17 00:00:00 2001
From: Phil Carmody <ext-phil.2.carmody@nokia.com>
Date: Tue, 22 Mar 2011 16:34:13 -0700
Subject: calibrate: home in on correct lpj value more quickly

Binary chop with a jiffy-resync on each step to find an upper bound is
slow, so just race in a tight-ish loop to find an underestimate.

If done with lots of individual steps, sometimes several hundreds of
iterations would be required, which would impose a significant overhead,
and make the initial estimate very low.  By taking slowly increasing steps
there will be less overhead.

E.g.  an x86_64 2.67GHz could have fitted in 613 individual small delays,
but in reality should have been able to fit in a single delay 644 times
longer, so underestimated by 31 steps.  To reach the equivalent of 644
small delays with the accelerating scheme now requires about 130
iterations, so has <1/4th of the overhead, and can therefore be expected
to underestimate by only 7 steps.

As now we have a better initial estimate we can binary chop over a smaller
range.  With the loop overhead in the initial estimate kept low, and the
step sizes moderate, we won't have under-estimated by much, so chose as
tight a range as we can.

Signed-off-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Tested-by: Stephen Boyd <sboyd@codeaurora.org>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/calibrate.c | 57 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 23 deletions(-)

(limited to 'init/calibrate.c')

diff --git a/init/calibrate.c b/init/calibrate.c
index b71643a7acae..f9000dfbe227 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -110,8 +110,8 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
 
 /*
  * This is the number of bits of precision for the loops_per_jiffy.  Each
- * bit takes on average 1.5/HZ seconds.  This (like the original) is a little
- * better than 1%
+ * time we refine our estimate after the first takes 1.5/HZ seconds, so try
+ * to start with a good estimate.
  * For the boot cpu we can skip the delay calibration and assign it a value
  * calculated based on the timer frequency.
  * For the rest of the CPUs we cannot assume that the timer frequency is same as
@@ -121,38 +121,49 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
 
 static unsigned long __cpuinit calibrate_delay_converge(void)
 {
-	unsigned long lpj, ticks, loopbit;
-	int lps_precision = LPS_PREC;
+	/* First stage - slowly accelerate to find initial bounds */
+	unsigned long lpj, ticks, loopadd, chop_limit;
+	int trials = 0, band = 0, trial_in_band = 0;
 
 	lpj = (1<<12);
-	while ((lpj <<= 1) != 0) {
-		/* wait for "start of" clock tick */
-		ticks = jiffies;
-		while (ticks == jiffies)
-			/* nothing */;
-		/* Go .. */
-		ticks = jiffies;
-		__delay(lpj);
-		ticks = jiffies - ticks;
-		if (ticks)
-			break;
-	}
+
+	/* wait for "start of" clock tick */
+	ticks = jiffies;
+	while (ticks == jiffies)
+		; /* nothing */
+	/* Go .. */
+	ticks = jiffies;
+	do {
+		if (++trial_in_band == (1<<band)) {
+			++band;
+			trial_in_band = 0;
+		}
+		__delay(lpj * band);
+		trials += band;
+	} while (ticks == jiffies);
+	/*
+	 * We overshot, so retreat to a clear underestimate. Then estimate
+	 * the largest likely undershoot. This defines our chop bounds.
+	 */
+	trials -= band;
+	loopadd = lpj * band;
+	lpj *= trials;
+	chop_limit = lpj >> (LPS_PREC + 1);
 
 	/*
 	 * Do a binary approximation to get lpj set to
-	 * equal one clock (up to lps_precision bits)
+	 * equal one clock (up to LPS_PREC bits)
 	 */
-	lpj >>= 1;
-	loopbit = lpj;
-	while (lps_precision-- && (loopbit >>= 1)) {
-		lpj |= loopbit;
+	while (loopadd > chop_limit) {
+		lpj += loopadd;
 		ticks = jiffies;
 		while (ticks == jiffies)
-			/* nothing */;
+			; /* nothing */
 		ticks = jiffies;
 		__delay(lpj);
 		if (jiffies != ticks)	/* longer than 1 tick */
-			lpj &= ~loopbit;
+			lpj -= loopadd;
+		loopadd >>= 1;
 	}
 
 	return lpj;
-- 
cgit v1.2.3


From b1b5f65e53af770ede22c113e249de2f6fa53706 Mon Sep 17 00:00:00 2001
From: Phil Carmody <ext-phil.2.carmody@nokia.com>
Date: Tue, 22 Mar 2011 16:34:15 -0700
Subject: calibrate: retry with wider bounds when converge seems to fail

Systems with unmaskable interrupts such as SMIs may massively
underestimate loops_per_jiffy, and fail to converge anywhere near the real
value.  A case seen on x86_64 was an initial estimate of 256<<12, which
converged to 511<<12 where the real value should have been over 630<<12.
This admitedly requires bypassing the TSC calibration (lpj_fine), and a
failure to settle in the direct calibration too, but is physically
possible.  This failure does not depend on my previous calibration
optimisation, but by luck is easy to fix with the optimisation in place
with a trivial retry loop.

In the context of the optimised converging method, as we can no longer
trust the starting estimate, enlarge the search bounds exponentially so
that the number of retries is logarithmically bounded.

[akpm@linux-foundation.org: mention x86_64 SMIs in comment]
Signed-off-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Tested-by: Stephen Boyd <sboyd@codeaurora.org>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/calibrate.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'init/calibrate.c')

diff --git a/init/calibrate.c b/init/calibrate.c
index f9000dfbe227..76ac9194cbc4 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -122,7 +122,7 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
 static unsigned long __cpuinit calibrate_delay_converge(void)
 {
 	/* First stage - slowly accelerate to find initial bounds */
-	unsigned long lpj, ticks, loopadd, chop_limit;
+	unsigned long lpj, lpj_base, ticks, loopadd, loopadd_base, chop_limit;
 	int trials = 0, band = 0, trial_in_band = 0;
 
 	lpj = (1<<12);
@@ -146,14 +146,18 @@ static unsigned long __cpuinit calibrate_delay_converge(void)
 	 * the largest likely undershoot. This defines our chop bounds.
 	 */
 	trials -= band;
-	loopadd = lpj * band;
-	lpj *= trials;
-	chop_limit = lpj >> (LPS_PREC + 1);
+	loopadd_base = lpj * band;
+	lpj_base = lpj * trials;
+
+recalibrate:
+	lpj = lpj_base;
+	loopadd = loopadd_base;
 
 	/*
 	 * Do a binary approximation to get lpj set to
 	 * equal one clock (up to LPS_PREC bits)
 	 */
+	chop_limit = lpj >> LPS_PREC;
 	while (loopadd > chop_limit) {
 		lpj += loopadd;
 		ticks = jiffies;
@@ -165,6 +169,16 @@ static unsigned long __cpuinit calibrate_delay_converge(void)
 			lpj -= loopadd;
 		loopadd >>= 1;
 	}
+	/*
+	 * If we incremented every single time possible, presume we've
+	 * massively underestimated initially, and retry with a higher
+	 * start, and larger range. (Only seen on x86_64, due to SMIs)
+	 */
+	if (lpj + loopadd * 2 == lpj_base + loopadd_base * 2) {
+		lpj_base = lpj;
+		loopadd_base <<= 2;
+		goto recalibrate;
+	}
 
 	return lpj;
 }
-- 
cgit v1.2.3