Merge branch 'reciprocal'

Hannes Frederic Sowa says: ==================== reciprocal_divide update This patch is on top of aee636c4809fa5 ("bpf: do not use reciprocal divide") from Eric that sits in net tree. It will not create a merge conflict, but it depends on this one, so we suggest, if possible, to merge net into net-next. We are proposing this change with only small modifications from the v2 version, namely updating the name of trim to reciprocal_scale (as commented on by Ben Hutchings and Eric Dumazet, thanks!). We thought about introducing the reciprocal_divide algorithm in parallel to the one already used by the kernel but faced organizational issues, leading us to the conclusion that it is best to just replace the old one: We could not come up with names for the different implementations and also with a way to describe the differences to guide developers which one to choose in which situation. This is because we cannot specify the correct semantics for the version which is currently used by the kernel. Although it seems to not be causing problems in the kernel, we cannot surely say so in the case of flex_array for the future. Current usage seems ok, but future users could run into problems. Changelog: v1->v2: - changed name to prandom_u32_max in p1 - changed name to trim in p2 - reworked code in p3 v2->v3: - p1 and p3 stays unchanged, only small update in commit message in p3 - changed name to reciprocal_scale in p2 - fixed kernel doc format v3->v4: - pseduo -> pseudo (thanks to Tilman Schmidt) v4->v5: - fix pseduo -> pseudo for real now, sorry for the noise ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
author: David S. Miller <davem@davemloft.net> 2014-01-22 11:19:26 +0400
committer: David S. Miller <davem@davemloft.net> 2014-01-22 11:19:26 +0400
commit: 374d1125237e94f16ffa3185cff62df03977a988 (patch)
tree: 3bb15ec5b897df4ea197339478bb5d76049a2761 /include/linux
parent: 6cd28f044b47aeeba91807d97d6f3ea5a048e88d (diff)
parent: 809fa972fd90ff27225294b17a027e908b2d7b7a (diff)
download: linux-374d1125237e94f16ffa3185cff62df03977a988.tar.xz
5 files changed, 62 insertions, 21 deletions
diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
index 6843cf193a44..b6efb0c64408 100644
--- a/include/linux/flex_array.h
+++ b/include/linux/flex_array.h
@@ -2,6 +2,7 @@
 #define _FLEX_ARRAY_H
 
 #include <linux/types.h>
+#include <linux/reciprocal_div.h>
 #include <asm/page.h>
 
 #define FLEX_ARRAY_PART_SIZE PAGE_SIZE
@@ -22,7 +23,7 @@ struct flex_array {
 			int element_size;
 			int total_nr_elements;
 			int elems_per_part;
-			u32 reciprocal_elems;
+			struct reciprocal_value reciprocal_elems;
 			struct flex_array_part *parts[];
 		};
 		/*
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index ecb87544cc5d..03d8a6b0e2e8 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -193,6 +193,25 @@ extern int _cond_resched(void);
 		(__x < 0) ? -__x : __x;		\
 	})
 
+/**
+ * reciprocal_scale - "scale" a value into range [0, ep_ro)
+ * @val: value
+ * @ep_ro: right open interval endpoint
+ *
+ * Perform a "reciprocal multiplication" in order to "scale" a value into
+ * range [0, ep_ro), where the upper interval endpoint is right-open.
+ * This is useful, e.g. for accessing an index of an array containing
+ * ep_ro elements, for example. Think of it as sort of modulus, only that
+ * the result isn't that of modulo. ;) Note that if initial input is a
+ * small value, then result will return 0.
+ *
+ * Return: a result based on val in interval [0, ep_ro).
+ */
+static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
+{
+	return (u32)(((u64) val * ep_ro) >> 32);
+}
+
 #if defined(CONFIG_MMU) && \
 	(defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP))
 void might_fault(void);
diff --git a/include/linux/random.h b/include/linux/random.h
index 4002b3df4c85..1cfce0e24dbd 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -8,7 +8,6 @@
 
 #include <uapi/linux/random.h>
 
-
 extern void add_device_randomness(const void *, unsigned int);
 extern void add_input_randomness(unsigned int type, unsigned int code,
 				 unsigned int value);
@@ -38,6 +37,23 @@ struct rnd_state {
 u32 prandom_u32_state(struct rnd_state *state);
 void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes);
 
+/**
+ * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro)
+ * @ep_ro: right open interval endpoint
+ *
+ * Returns a pseudo-random number that is in interval [0, ep_ro). Note
+ * that the result depends on PRNG being well distributed in [0, ~0U]
+ * u32 space. Here we use maximally equidistributed combined Tausworthe
+ * generator, that is, prandom_u32(). This is useful when requesting a
+ * random index of an array containing ep_ro elements, for example.
+ *
+ * Returns: pseudo-random number in interval [0, ep_ro)
+ */
+static inline u32 prandom_u32_max(u32 ep_ro)
+{
+	return (u32)(((u64) prandom_u32() * ep_ro) >> 32);
+}
+
 /*
  * Handle minimum values for seeds
  */
diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h
index f9c90b33285b..8c5a3fb6c6c5 100644
--- a/include/linux/reciprocal_div.h
+++ b/include/linux/reciprocal_div.h
@@ -4,29 +4,32 @@
 #include <linux/types.h>
 
 /*
- * This file describes reciprocical division.
+ * This algorithm is based on the paper "Division by Invariant
+ * Integers Using Multiplication" by Torbjörn Granlund and Peter
+ * L. Montgomery.
  *
- * This optimizes the (A/B) problem, when A and B are two u32
- * and B is a known value (but not known at compile time)
+ * The assembler implementation from Agner Fog, which this code is
+ * based on, can be found here:
+ * http://www.agner.org/optimize/asmlib.zip
  *
- * The math principle used is :
- *   Let RECIPROCAL_VALUE(B) be (((1LL << 32) + (B - 1))/ B)
- *   Then A / B = (u32)(((u64)(A) * (R)) >> 32)
- *
- * This replaces a divide by a multiply (and a shift), and
- * is generally less expensive in CPU cycles.
+ * This optimization for A/B is helpful if the divisor B is mostly
+ * runtime invariant. The reciprocal of B is calculated in the
+ * slow-path with reciprocal_value(). The fast-path can then just use
+ * a much faster multiplication operation with a variable dividend A
+ * to calculate the division A/B.
  */
 
-/*
- * Computes the reciprocal value (R) for the value B of the divisor.
- * Should not be called before each reciprocal_divide(),
- * or else the performance is slower than a normal divide.
- */
-extern u32 reciprocal_value(u32 B);
+struct reciprocal_value {
+	u32 m;
+	u8 sh1, sh2;
+};
 
+struct reciprocal_value reciprocal_value(u32 d);
 
-static inline u32 reciprocal_divide(u32 A, u32 R)
+static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
 {
-	return (u32)(((u64)A * R) >> 32);
+	u32 t = (u32)(((u64)a * R.m) >> 32);
+	return (t + ((a - t) >> R.sh1)) >> R.sh2;
 }
-#endif
+
+#endif /* _LINUX_RECIPROCAL_DIV_H */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 09bfffb08a56..96e8abae19a9 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_SLAB_DEF_H
 #define	_LINUX_SLAB_DEF_H
 
+#include <linux/reciprocal_div.h>
+
 /*
  * Definitions unique to the original Linux SLAB allocator.
  */
@@ -12,7 +14,7 @@ struct kmem_cache {
 	unsigned int shared;
 
 	unsigned int size;
-	u32 reciprocal_buffer_size;
+	struct reciprocal_value reciprocal_buffer_size;
 /* 2) touched by every alloc & free from the backend */
 
 	unsigned int flags;		/* constant flags */
author	David S. Miller <davem@davemloft.net>	2014-01-22 11:19:26 +0400
committer	David S. Miller <davem@davemloft.net>	2014-01-22 11:19:26 +0400
commit	374d1125237e94f16ffa3185cff62df03977a988 (patch)
tree	3bb15ec5b897df4ea197339478bb5d76049a2761 /include/linux
parent	6cd28f044b47aeeba91807d97d6f3ea5a048e88d (diff)
parent	809fa972fd90ff27225294b17a027e908b2d7b7a (diff)
download	linux-374d1125237e94f16ffa3185cff62df03977a988.tar.xz