ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics

After a bunch of benchmarking on the interaction between dmb and pldw, it turns out that issuing the pldw *after* the dmb instruction can give modest performance gains (~3% atomic_add_return improvement on a dual A15).

This patch adds prefetchw invocations to our barriered atomic operations including cmpxchg, test_and_xxx and futexes.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
author: Will Deacon <will.deacon@arm.com> 2014-02-21 20:01:48 +0400
committer: Russell King <rmk+kernel@arm.linux.org.uk> 2014-02-25 15:30:20 +0400
commit: c32ffce0f66e5d1d4856254516e24f5ef275cd00 (patch)
tree: 125229cdd38bfd6e7e62cff7eb8771a34cc999a7 /arch/arm/lib
parent: 6ea41c80115f49e7d8b80312ffc99973d283471f (diff)
download: linux-c32ffce0f66e5d1d4856254516e24f5ef275cd00.tar.xz
1 file changed, 5 insertions, 0 deletions
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
index 52886b89706c..9f12ed1eea86 100644
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
@@ -37,6 +37,11 @@ UNWIND(	.fnstart	)
 	add	r1, r1, r0, lsl #2	@ Get word offset
 	mov	r3, r2, lsl r3		@ create mask
 	smp_dmb
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
+	.arch_extension	mp
+	ALT_SMP(W(pldw)	[r1])
+	ALT_UP(W(nop))
+#endif
 1:	ldrex	r2, [r1]
 	ands	r0, r2, r3		@ save old value of bit
 	\instr	r2, r2, r3		@ toggle bit
author	Will Deacon <will.deacon@arm.com>	2014-02-21 20:01:48 +0400
committer	Russell King <rmk+kernel@arm.linux.org.uk>	2014-02-25 15:30:20 +0400
commit	c32ffce0f66e5d1d4856254516e24f5ef275cd00 (patch)
tree	125229cdd38bfd6e7e62cff7eb8771a34cc999a7 /arch/arm/lib
parent	6ea41c80115f49e7d8b80312ffc99973d283471f (diff)
download	linux-c32ffce0f66e5d1d4856254516e24f5ef275cd00.tar.xz