diff options
Diffstat (limited to 'arch/powerpc/include/asm/qspinlock.h')
-rw-r--r-- | arch/powerpc/include/asm/qspinlock.h | 192 |
1 files changed, 141 insertions, 51 deletions
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index b676c4fb90fd..28a53fb69b38 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -2,83 +2,173 @@ #ifndef _ASM_POWERPC_QSPINLOCK_H #define _ASM_POWERPC_QSPINLOCK_H -#include <asm-generic/qspinlock_types.h> +#include <linux/compiler.h> +#include <asm/qspinlock_types.h> #include <asm/paravirt.h> -#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */ +#ifdef CONFIG_PPC64 +/* + * Use the EH=1 hint for accesses that result in the lock being acquired. + * The hardware is supposed to optimise this pattern by holding the lock + * cacheline longer, and releasing when a store to the same memory (the + * unlock) is performed. + */ +#define _Q_SPIN_EH_HINT 1 +#else +#define _Q_SPIN_EH_HINT 0 +#endif -#ifdef CONFIG_PARAVIRT_SPINLOCKS -extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __pv_queued_spin_unlock(struct qspinlock *lock); +/* + * The trylock itself may steal. This makes trylocks slightly stronger, and + * makes locks slightly more efficient when stealing. + * + * This is compile-time, so if true then there may always be stealers, so the + * nosteal paths become unused. + */ +#define _Q_SPIN_TRY_LOCK_STEAL 1 -static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) -{ - if (!is_shared_processor()) - native_queued_spin_lock_slowpath(lock, val); - else - __pv_queued_spin_lock_slowpath(lock, val); -} +/* + * Put a speculation barrier after testing the lock/node and finding it + * busy. Try to prevent pointless speculation in slow paths. + * + * Slows down the lockstorm microbenchmark with no stealing, where locking + * is purely FIFO through the queue. May have more benefit in real workload + * where speculating into the wrong place could have a greater cost. + */ +#define _Q_SPIN_SPEC_BARRIER 0 -#define queued_spin_unlock queued_spin_unlock -static inline void queued_spin_unlock(struct qspinlock *lock) -{ - if (!is_shared_processor()) - smp_store_release(&lock->locked, 0); - else - __pv_queued_spin_unlock(lock); -} +#ifdef CONFIG_PPC64 +/* + * Execute a miso instruction after passing the MCS lock ownership to the + * queue head. Miso is intended to make stores visible to other CPUs sooner. + * + * This seems to make the lockstorm microbenchmark nospin test go slightly + * faster on POWER10, but disable for now. + */ +#define _Q_SPIN_MISO 0 +#else +#define _Q_SPIN_MISO 0 +#endif +#ifdef CONFIG_PPC64 +/* + * This executes miso after an unlock of the lock word, having ownership + * pass to the next CPU sooner. This will slow the uncontended path to some + * degree. Not evidence it helps yet. + */ +#define _Q_SPIN_MISO_UNLOCK 0 #else -extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +#define _Q_SPIN_MISO_UNLOCK 0 #endif -static __always_inline void queued_spin_lock(struct qspinlock *lock) +/* + * Seems to slow down lockstorm microbenchmark, suspect queue node just + * has to become shared again right afterwards when its waiter spins on + * the lock field. + */ +#define _Q_SPIN_PREFETCH_NEXT 0 + +static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { - u32 val = 0; + return READ_ONCE(lock->val); +} - if (likely(arch_atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL))) - return; +static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) +{ + return !lock.val; +} - queued_spin_lock_slowpath(lock, val); +static __always_inline int queued_spin_is_contended(struct qspinlock *lock) +{ + return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK); } -#define queued_spin_lock queued_spin_lock -#ifdef CONFIG_PARAVIRT_SPINLOCKS -#define SPIN_THRESHOLD (1<<15) /* not tuned */ +static __always_inline u32 queued_spin_encode_locked_val(void) +{ + /* XXX: make this use lock value in paca like simple spinlocks? */ + return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET); +} -static __always_inline void pv_wait(u8 *ptr, u8 val) +static __always_inline int __queued_spin_trylock_nosteal(struct qspinlock *lock) { - if (*ptr != val) - return; - yield_to_any(); - /* - * We could pass in a CPU here if waiting in the queue and yield to - * the previous CPU in the queue. - */ + u32 new = queued_spin_encode_locked_val(); + u32 prev; + + /* Trylock succeeds only when unlocked and no queued nodes */ + asm volatile( +"1: lwarx %0,0,%1,%3 # __queued_spin_trylock_nosteal \n" +" cmpwi 0,%0,0 \n" +" bne- 2f \n" +" stwcx. %2,0,%1 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"2: \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (new), + "i" (_Q_SPIN_EH_HINT) + : "cr0", "memory"); + + return likely(prev == 0); } -static __always_inline void pv_kick(int cpu) +static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock) { - prod_cpu(cpu); + u32 new = queued_spin_encode_locked_val(); + u32 prev, tmp; + + /* Trylock may get ahead of queued nodes if it finds unlocked */ + asm volatile( +"1: lwarx %0,0,%2,%5 # __queued_spin_trylock_steal \n" +" andc. %1,%0,%4 \n" +" bne- 2f \n" +" and %1,%0,%4 \n" +" or %1,%1,%3 \n" +" stwcx. %1,0,%2 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"2: \n" + : "=&r" (prev), "=&r" (tmp) + : "r" (&lock->val), "r" (new), "r" (_Q_TAIL_CPU_MASK), + "i" (_Q_SPIN_EH_HINT) + : "cr0", "memory"); + + return likely(!(prev & ~_Q_TAIL_CPU_MASK)); } -extern void __pv_init_lock_hash(void); +static __always_inline int queued_spin_trylock(struct qspinlock *lock) +{ + if (!_Q_SPIN_TRY_LOCK_STEAL) + return __queued_spin_trylock_nosteal(lock); + else + return __queued_spin_trylock_steal(lock); +} -static inline void pv_spinlocks_init(void) +void queued_spin_lock_slowpath(struct qspinlock *lock); + +static __always_inline void queued_spin_lock(struct qspinlock *lock) { - __pv_init_lock_hash(); + if (!queued_spin_trylock(lock)) + queued_spin_lock_slowpath(lock); } -#endif +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + smp_store_release(&lock->locked, 0); + if (_Q_SPIN_MISO_UNLOCK) + asm volatile("miso" ::: "memory"); +} -/* - * Queued spinlocks rely heavily on smp_cond_load_relaxed() to busy-wait, - * which was found to have performance problems if implemented with - * the preferred spin_begin()/spin_end() SMT priority pattern. Use the - * generic version instead. - */ +#define arch_spin_is_locked(l) queued_spin_is_locked(l) +#define arch_spin_is_contended(l) queued_spin_is_contended(l) +#define arch_spin_value_unlocked(l) queued_spin_value_unlocked(l) +#define arch_spin_lock(l) queued_spin_lock(l) +#define arch_spin_trylock(l) queued_spin_trylock(l) +#define arch_spin_unlock(l) queued_spin_unlock(l) -#include <asm-generic/qspinlock.h> +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void pv_spinlocks_init(void); +#else +static inline void pv_spinlocks_init(void) { } +#endif #endif /* _ASM_POWERPC_QSPINLOCK_H */ |