2 files changed, 209 insertions, 111 deletions
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index 9228f7386220..5ad39bfd3b6d 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -9,116 +9,7 @@
 #define __ASM_BARRIER_H
 
 #include <asm/addrspace.h>
-
-/*
- * Sync types defined by the MIPS architecture (document MD00087 table 6.5)
- * These values are used with the sync instruction to perform memory barriers.
- * Types of ordering guarantees available through the SYNC instruction:
- * - Completion Barriers
- * - Ordering Barriers
- * As compared to the completion barrier, the ordering barrier is a
- * lighter-weight operation as it does not require the specified instructions
- * before the SYNC to be already completed. Instead it only requires that those
- * specified instructions which are subsequent to the SYNC in the instruction
- * stream are never re-ordered for processing ahead of the specified
- * instructions which are before the SYNC in the instruction stream.
- * This potentially reduces how many cycles the barrier instruction must stall
- * before it completes.
- * Implementations that do not use any of the non-zero values of stype to define
- * different barriers, such as ordering barriers, must make those stype values
- * act the same as stype zero.
- */
-
-/*
- * Completion barriers:
- * - Every synchronizable specified memory instruction (loads or stores or both)
- *   that occurs in the instruction stream before the SYNC instruction must be
- *   already globally performed before any synchronizable specified memory
- *   instructions that occur after the SYNC are allowed to be performed, with
- *   respect to any other processor or coherent I/O module.
- *
- * - The barrier does not guarantee the order in which instruction fetches are
- *   performed.
- *
- * - A stype value of zero will always be defined such that it performs the most
- *   complete set of synchronization operations that are defined.This means
- *   stype zero always does a completion barrier that affects both loads and
- *   stores preceding the SYNC instruction and both loads and stores that are
- *   subsequent to the SYNC instruction. Non-zero values of stype may be defined
- *   by the architecture or specific implementations to perform synchronization
- *   behaviors that are less complete than that of stype zero. If an
- *   implementation does not use one of these non-zero values to define a
- *   different synchronization behavior, then that non-zero value of stype must
- *   act the same as stype zero completion barrier. This allows software written
- *   for an implementation with a lighter-weight barrier to work on another
- *   implementation which only implements the stype zero completion barrier.
- *
- * - A completion barrier is required, potentially in conjunction with SSNOP (in
- *   Release 1 of the Architecture) or EHB (in Release 2 of the Architecture),
- *   to guarantee that memory reference results are visible across operating
- *   mode changes. For example, a completion barrier is required on some
- *   implementations on entry to and exit from Debug Mode to guarantee that
- *   memory effects are handled correctly.
- */
-
-/*
- * stype 0 - A completion barrier that affects preceding loads and stores and
- * subsequent loads and stores.
- * Older instructions which must reach the load/store ordering point before the
- * SYNC instruction completes: Loads, Stores
- * Younger instructions which must reach the load/store ordering point only
- * after the SYNC instruction completes: Loads, Stores
- * Older instructions which must be globally performed when the SYNC instruction
- * completes: Loads, Stores
- */
-#define STYPE_SYNC 0x0
-
-/*
- * Ordering barriers:
- * - Every synchronizable specified memory instruction (loads or stores or both)
- *   that occurs in the instruction stream before the SYNC instruction must
- *   reach a stage in the load/store datapath after which no instruction
- *   re-ordering is possible before any synchronizable specified memory
- *   instruction which occurs after the SYNC instruction in the instruction
- *   stream reaches the same stage in the load/store datapath.
- *
- * - If any memory instruction before the SYNC instruction in program order,
- *   generates a memory request to the external memory and any memory
- *   instruction after the SYNC instruction in program order also generates a
- *   memory request to external memory, the memory request belonging to the
- *   older instruction must be globally performed before the time the memory
- *   request belonging to the younger instruction is globally performed.
- *
- * - The barrier does not guarantee the order in which instruction fetches are
- *   performed.
- */
-
-/*
- * stype 0x10 - An ordering barrier that affects preceding loads and stores and
- * subsequent loads and stores.
- * Older instructions which must reach the load/store ordering point before the
- * SYNC instruction completes: Loads, Stores
- * Younger instructions which must reach the load/store ordering point only
- * after the SYNC instruction completes: Loads, Stores
- * Older instructions which must be globally performed when the SYNC instruction
- * completes: N/A
- */
-#define STYPE_SYNC_MB 0x10
-
-/*
- * stype 0x14 - A completion barrier specific to global invalidations
- *
- * When a sync instruction of this type completes any preceding GINVI or GINVT
- * operation has been globalized & completed on all coherent CPUs. Anything
- * that the GINV* instruction should invalidate will have been invalidated on
- * all coherent CPUs when this instruction completes. It is implementation
- * specific whether the GINV* instructions themselves will ensure completion,
- * or this sync type will.
- *
- * In systems implementing global invalidates (ie. with Config5.GI == 2 or 3)
- * this sync type also requires that previous SYNCI operations have completed.
- */
-#define STYPE_GINV	0x14
+#include <asm/sync.h>
 
 #ifdef CONFIG_CPU_HAS_SYNC
 #define __sync()				\
@@ -286,7 +177,7 @@
 
 static inline void sync_ginv(void)
 {
-	asm volatile("sync\t%0" :: "i"(STYPE_GINV));
+	asm volatile("sync\t%0" :: "i"(__SYNC_ginv));
 }
 
 #include <asm-generic/barrier.h>
diff --git a/arch/mips/include/asm/sync.h b/arch/mips/include/asm/sync.h
new file mode 100644
index 000000000000..7c6a1095f556
--- /dev/null
+++ b/arch/mips/include/asm/sync.h
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __MIPS_ASM_SYNC_H__
+#define __MIPS_ASM_SYNC_H__
+
+/*
+ * sync types are defined by the MIPS64 Instruction Set documentation in Volume
+ * II-A of the MIPS Architecture Reference Manual, which can be found here:
+ *
+ *   https://www.mips.com/?do-download=the-mips64-instruction-set-v6-06
+ *
+ * Two types of barrier are provided:
+ *
+ *   1) Completion barriers, which ensure that a memory operation has actually
+ *      completed & often involve stalling the CPU pipeline to do so.
+ *
+ *   2) Ordering barriers, which only ensure that affected memory operations
+ *      won't be reordered in the CPU pipeline in a manner that violates the
+ *      restrictions imposed by the barrier.
+ *
+ * Ordering barriers can be more efficient than completion barriers, since:
+ *
+ *   a) Ordering barriers only require memory access instructions which preceed
+ *      them in program order (older instructions) to reach a point in the
+ *      load/store datapath beyond which reordering is not possible before
+ *      allowing memory access instructions which follow them (younger
+ *      instructions) to be performed.  That is, older instructions don't
+ *      actually need to complete - they just need to get far enough that all
+ *      other coherent CPUs will observe their completion before they observe
+ *      the effects of younger instructions.
+ *
+ *   b) Multiple variants of ordering barrier are provided which allow the
+ *      effects to be restricted to different combinations of older or younger
+ *      loads or stores. By way of example, if we only care that stores older
+ *      than a barrier are observed prior to stores that are younger than a
+ *      barrier & don't care about the ordering of loads then the 'wmb'
+ *      ordering barrier can be used. Limiting the barrier's effects to stores
+ *      allows loads to continue unaffected & potentially allows the CPU to
+ *      make progress faster than if younger loads had to wait for older stores
+ *      to complete.
+ */
+
+/*
+ * No sync instruction at all; used to allow code to nullify the effect of the
+ * __SYNC() macro without needing lots of #ifdefery.
+ */
+#define __SYNC_none	-1
+
+/*
+ * A full completion barrier; all memory accesses appearing prior to this sync
+ * instruction in program order must complete before any memory accesses
+ * appearing after this sync instruction in program order.
+ */
+#define __SYNC_full	0x00
+
+/*
+ * For now we use a full completion barrier to implement all sync types, until
+ * we're satisfied that lightweight ordering barriers defined by MIPSr6 are
+ * sufficient to uphold our desired memory model.
+ */
+#define __SYNC_aq	__SYNC_full
+#define __SYNC_rl	__SYNC_full
+#define __SYNC_mb	__SYNC_full
+
+/*
+ * ...except on Cavium Octeon CPUs, which have been using the 'wmb' ordering
+ * barrier since 2010 & omit 'rmb' barriers because the CPUs don't perform
+ * speculative reads.
+ */
+#ifdef CONFIG_CPU_CAVIUM_OCTEON
+# define __SYNC_rmb	__SYNC_none
+# define __SYNC_wmb	0x04
+#else
+# define __SYNC_rmb	__SYNC_full
+# define __SYNC_wmb	__SYNC_full
+#endif
+
+/*
+ * A GINV sync is a little different; it doesn't relate directly to loads or
+ * stores, but instead causes synchronization of an icache or TLB global
+ * invalidation operation triggered by the ginvi or ginvt instructions
+ * respectively. In cases where we need to know that a ginvi or ginvt operation
+ * has been performed by all coherent CPUs, we must issue a sync instruction of
+ * this type. Once this instruction graduates all coherent CPUs will have
+ * observed the invalidation.
+ */
+#define __SYNC_ginv	0x14
+
+/* Trivial; indicate that we always need this sync instruction. */
+#define __SYNC_always	(1 << 0)
+
+/*
+ * Indicate that we need this sync instruction only on systems with weakly
+ * ordered memory access. In general this is most MIPS systems, but there are
+ * exceptions which provide strongly ordered memory.
+ */
+#ifdef CONFIG_WEAK_ORDERING
+# define __SYNC_weak_ordering	(1 << 1)
+#else
+# define __SYNC_weak_ordering	0
+#endif
+
+/*
+ * Indicate that we need this sync instruction only on systems where LL/SC
+ * don't implicitly provide a memory barrier. In general this is most MIPS
+ * systems.
+ */
+#ifdef CONFIG_WEAK_REORDERING_BEYOND_LLSC
+# define __SYNC_weak_llsc	(1 << 2)
+#else
+# define __SYNC_weak_llsc	0
+#endif
+
+/*
+ * Some Loongson 3 CPUs have a bug wherein execution of a memory access (load,
+ * store or prefetch) in between an LL & SC can cause the SC instruction to
+ * erroneously succeed, breaking atomicity. Whilst it's unusual to write code
+ * containing such sequences, this bug bites harder than we might otherwise
+ * expect due to reordering & speculation:
+ *
+ * 1) A memory access appearing prior to the LL in program order may actually
+ *    be executed after the LL - this is the reordering case.
+ *
+ *    In order to avoid this we need to place a memory barrier (ie. a SYNC
+ *    instruction) prior to every LL instruction, in between it and any earlier
+ *    memory access instructions.
+ *
+ *    This reordering case is fixed by 3A R2 CPUs, ie. 3A2000 models and later.
+ *
+ * 2) If a conditional branch exists between an LL & SC with a target outside
+ *    of the LL-SC loop, for example an exit upon value mismatch in cmpxchg()
+ *    or similar, then misprediction of the branch may allow speculative
+ *    execution of memory accesses from outside of the LL-SC loop.
+ *
+ *    In order to avoid this we need a memory barrier (ie. a SYNC instruction)
+ *    at each affected branch target.
+ *
+ *    This case affects all current Loongson 3 CPUs.
+ *
+ * The above described cases cause an error in the cache coherence protocol;
+ * such that the Invalidate of a competing LL-SC goes 'missing' and SC
+ * erroneously observes its core still has Exclusive state and lets the SC
+ * proceed.
+ *
+ * Therefore the error only occurs on SMP systems.
+ */
+#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS
+# define __SYNC_loongson3_war	(1 << 31)
+#else
+# define __SYNC_loongson3_war	0
+#endif
+
+/*
+ * Some Cavium Octeon CPUs suffer from a bug that causes a single wmb ordering
+ * barrier to be ineffective, requiring the use of 2 in sequence to provide an
+ * effective barrier as noted by commit 6b07d38aaa52 ("MIPS: Octeon: Use
+ * optimized memory barrier primitives."). Here we specify that the affected
+ * sync instructions should be emitted twice.
+ */
+#ifdef CONFIG_CPU_CAVIUM_OCTEON
+# define __SYNC_rpt(type)	(1 + (type == __SYNC_wmb))
+#else
+# define __SYNC_rpt(type)	1
+#endif
+
+/*
+ * The main event. Here we actually emit a sync instruction of a given type, if
+ * reason is non-zero.
+ *
+ * In future we have the option of emitting entries in a fixups-style table
+ * here that would allow us to opportunistically remove some sync instructions
+ * when we detect at runtime that we're running on a CPU that doesn't need
+ * them.
+ */
+#ifdef CONFIG_CPU_HAS_SYNC
+# define ____SYNC(_type, _reason, _else)			\
+	.if	(( _type ) != -1) && ( _reason );		\
+	.set	push;						\
+	.set	MIPS_ISA_LEVEL_RAW;				\
+	.rept	__SYNC_rpt(_type);				\
+	sync	_type;						\
+	.endr;							\
+	.set	pop;						\
+	.else;							\
+	_else;							\
+	.endif
+#else
+# define ____SYNC(_type, _reason, _else)
+#endif
+
+/*
+ * Preprocessor magic to expand macros used as arguments before we insert them
+ * into assembly code.
+ */
+#ifdef __ASSEMBLY__
+# define ___SYNC(type, reason, else)				\
+	____SYNC(type, reason, else)
+#else
+# define ___SYNC(type, reason, else)				\
+	__stringify(____SYNC(type, reason, else))
+#endif
+
+#define __SYNC(type, reason)					\
+	___SYNC(__SYNC_##type, __SYNC_##reason, )
+#define __SYNC_ELSE(type, reason, else)				\
+	___SYNC(__SYNC_##type, __SYNC_##reason, else)
+
+#endif /* __MIPS_ASM_SYNC_H__ */