From b3ac891b67bd4b1fc728d1c784cad1212dea433d Mon Sep 17 00:00:00 2001 From: Luca Barbieri Date: Wed, 24 Feb 2010 10:54:22 +0100 Subject: x86: Add support for lock prefix in alternatives The current lock prefix UP/SMP alternative code doesn't allow LOCK_PREFIX to be used in alternatives code. This patch solves the problem by adding a new LOCK_PREFIX_ALTERNATIVE_PATCH macro that only records the lock prefix location but does not emit the prefix. The user of this macro can then start any alternative sequence with "lock" and have it UP/SMP patched. To make this work, the UP/SMP alternative code is changed to do the lock/DS prefix switching only if the byte actually contains a lock or DS prefix. Thus, if an alternative without the "lock" is selected, it will now do nothing instead of clobbering the code. Changes in v2: - Naming change - Change label to not conflict with alternatives Signed-off-by: Luca Barbieri LKML-Reference: <1267005265-27958-2-git-send-email-luca@luca-barbieri.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/include/asm/alternative.h') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 3b5b828767b6..55fee12cea6d 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -28,12 +28,14 @@ */ #ifdef CONFIG_SMP -#define LOCK_PREFIX \ +#define LOCK_PREFIX_HERE \ ".section .smp_locks,\"a\"\n" \ _ASM_ALIGN "\n" \ - _ASM_PTR "661f\n" /* address */ \ + _ASM_PTR "671f\n" /* address */ \ ".previous\n" \ - "661:\n\tlock; " + "671:" + +#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " #else /* ! CONFIG_SMP */ #define LOCK_PREFIX "" -- cgit v1.2.3 From d61931d89be506372d01a90d1755f6d0a9fafe2d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 5 Mar 2010 17:34:46 +0100 Subject: x86: Add optimized popcnt variants Add support for the hardware version of the Hamming weight function, popcnt, present in CPUs which advertize it under CPUID, Function 0x0000_0001_ECX[23]. On CPUs which don't support it, we fallback to the default lib/hweight.c sw versions. A synthetic benchmark comparing popcnt with __sw_hweight64 showed almost a 3x speedup on a F10h machine. Signed-off-by: Borislav Petkov LKML-Reference: <20100318112015.GC11152@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 5 +++ arch/x86/include/asm/alternative.h | 9 +++-- arch/x86/include/asm/arch_hweight.h | 59 +++++++++++++++++++++++++++++++ arch/x86/include/asm/bitops.h | 4 ++- include/asm-generic/bitops/arch_hweight.h | 22 +++++++++--- lib/Makefile | 3 ++ lib/hweight.c | 20 +++++------ scripts/Makefile.lib | 4 +++ 8 files changed, 108 insertions(+), 18 deletions(-) create mode 100644 arch/x86/include/asm/arch_hweight.h (limited to 'arch/x86/include/asm/alternative.h') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0eacb1ffb421..89d8c54cdd37 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -238,6 +238,11 @@ config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR +config ARCH_HWEIGHT_CFLAGS + string + default "-fcall-saved-ecx -fcall-saved-edx" if X86_32 + default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64 + config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index b09ec55650b3..67dae51e7fd0 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -39,9 +39,6 @@ #define LOCK_PREFIX "" #endif -/* This must be included *after* the definition of LOCK_PREFIX */ -#include - struct alt_instr { u8 *instr; /* original instruction */ u8 *replacement; @@ -95,6 +92,12 @@ static inline int alternatives_text_reserved(void *start, void *end) "663:\n\t" newinstr "\n664:\n" /* replacement */ \ ".previous" +/* + * This must be included *after* the definition of ALTERNATIVE due to + * + */ +#include + /* * Alternative instructions for different CPU types or capabilities. * diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h new file mode 100644 index 000000000000..d1fc3c219ae6 --- /dev/null +++ b/arch/x86/include/asm/arch_hweight.h @@ -0,0 +1,59 @@ +#ifndef _ASM_X86_HWEIGHT_H +#define _ASM_X86_HWEIGHT_H + +#ifdef CONFIG_64BIT +/* popcnt %rdi, %rax */ +#define POPCNT ".byte 0xf3,0x48,0x0f,0xb8,0xc7" +#define REG_IN "D" +#define REG_OUT "a" +#else +/* popcnt %eax, %eax */ +#define POPCNT ".byte 0xf3,0x0f,0xb8,0xc0" +#define REG_IN "a" +#define REG_OUT "a" +#endif + +/* + * __sw_hweightXX are called from within the alternatives below + * and callee-clobbered registers need to be taken care of. See + * ARCH_HWEIGHT_CFLAGS in for the respective + * compiler switches. + */ +static inline unsigned int __arch_hweight32(unsigned int w) +{ + unsigned int res = 0; + + asm (ALTERNATIVE("call __sw_hweight32", POPCNT, X86_FEATURE_POPCNT) + : "="REG_OUT (res) + : REG_IN (w)); + + return res; +} + +static inline unsigned int __arch_hweight16(unsigned int w) +{ + return __arch_hweight32(w & 0xffff); +} + +static inline unsigned int __arch_hweight8(unsigned int w) +{ + return __arch_hweight32(w & 0xff); +} + +static inline unsigned long __arch_hweight64(__u64 w) +{ + unsigned long res = 0; + +#ifdef CONFIG_X86_32 + return __arch_hweight32((u32)w) + + __arch_hweight32((u32)(w >> 32)); +#else + asm (ALTERNATIVE("call __sw_hweight64", POPCNT, X86_FEATURE_POPCNT) + : "="REG_OUT (res) + : REG_IN (w)); +#endif /* CONFIG_X86_32 */ + + return res; +} + +#endif diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 02b47a603fc8..545776efeb16 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -444,7 +444,9 @@ static inline int fls(int x) #define ARCH_HAS_FAST_MULTIPLIER 1 -#include +#include + +#include #endif /* __KERNEL__ */ diff --git a/include/asm-generic/bitops/arch_hweight.h b/include/asm-generic/bitops/arch_hweight.h index 3a7be842cdce..9a81c1e9436c 100644 --- a/include/asm-generic/bitops/arch_hweight.h +++ b/include/asm-generic/bitops/arch_hweight.h @@ -3,9 +3,23 @@ #include -extern unsigned int __arch_hweight32(unsigned int w); -extern unsigned int __arch_hweight16(unsigned int w); -extern unsigned int __arch_hweight8(unsigned int w); -extern unsigned long __arch_hweight64(__u64 w); +inline unsigned int __arch_hweight32(unsigned int w) +{ + return __sw_hweight32(w); +} +inline unsigned int __arch_hweight16(unsigned int w) +{ + return __sw_hweight16(w); +} + +inline unsigned int __arch_hweight8(unsigned int w) +{ + return __sw_hweight8(w); +} + +inline unsigned long __arch_hweight64(__u64 w) +{ + return __sw_hweight64(w); +} #endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */ diff --git a/lib/Makefile b/lib/Makefile index 2e152aed7198..abe63a8ad143 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -39,7 +39,10 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o + +CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o + obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o obj-$(CONFIG_BTREE) += btree.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o diff --git a/lib/hweight.c b/lib/hweight.c index a6927e76840f..3c79d50814cf 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,7 +9,7 @@ * The Hamming Weight of a number is the total number of bits set in it. */ -unsigned int __arch_hweight32(unsigned int w) +unsigned int __sw_hweight32(unsigned int w) { #ifdef ARCH_HAS_FAST_MULTIPLIER w -= (w >> 1) & 0x55555555; @@ -24,30 +24,30 @@ unsigned int __arch_hweight32(unsigned int w) return (res + (res >> 16)) & 0x000000FF; #endif } -EXPORT_SYMBOL(__arch_hweight32); +EXPORT_SYMBOL(__sw_hweight32); -unsigned int __arch_hweight16(unsigned int w) +unsigned int __sw_hweight16(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x5555); res = (res & 0x3333) + ((res >> 2) & 0x3333); res = (res + (res >> 4)) & 0x0F0F; return (res + (res >> 8)) & 0x00FF; } -EXPORT_SYMBOL(__arch_hweight16); +EXPORT_SYMBOL(__sw_hweight16); -unsigned int __arch_hweight8(unsigned int w) +unsigned int __sw_hweight8(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x55); res = (res & 0x33) + ((res >> 2) & 0x33); return (res + (res >> 4)) & 0x0F; } -EXPORT_SYMBOL(__arch_hweight8); +EXPORT_SYMBOL(__sw_hweight8); -unsigned long __arch_hweight64(__u64 w) +unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 - return __arch_hweight32((unsigned int)(w >> 32)) + - __arch_hweight32((unsigned int)w); + return __sw_hweight32((unsigned int)(w >> 32)) + + __sw_hweight32((unsigned int)w); #elif BITS_PER_LONG == 64 #ifdef ARCH_HAS_FAST_MULTIPLIER w -= (w >> 1) & 0x5555555555555555ul; @@ -64,4 +64,4 @@ unsigned long __arch_hweight64(__u64 w) #endif #endif } -EXPORT_SYMBOL(__arch_hweight64); +EXPORT_SYMBOL(__sw_hweight64); diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index f9bdf264473d..cbcd654215e6 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -245,3 +245,7 @@ quiet_cmd_lzo = LZO $@ cmd_lzo = (cat $(filter-out FORCE,$^) | \ lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) + +# misc stuff +# --------------------------------------------------------------------------- +quote:=" -- cgit v1.2.3 From 5967ed87ade85a421ef814296c3c7f182b08c225 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 21 Apr 2010 16:08:14 +0100 Subject: x86-64: Reduce SMP locks table size Reduce the SMP locks table size by using relative pointers instead of absolute ones, thus cutting the table size by half. Signed-off-by: Jan Beulich LKML-Reference: <4BCF30FE020000780003B3B6@vpn.id2.novell.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative-asm.h | 4 +-- arch/x86/include/asm/alternative.h | 4 +-- arch/x86/kernel/alternative.c | 45 +++++++++++++++++++--------------- 3 files changed, 29 insertions(+), 24 deletions(-) (limited to 'arch/x86/include/asm/alternative.h') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index b97f786a48d5..a63a68be1cce 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -6,8 +6,8 @@ .macro LOCK_PREFIX 1: lock .section .smp_locks,"a" - _ASM_ALIGN - _ASM_PTR 1b + .balign 4 + .long 1b - . .previous .endm #else diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index b09ec55650b3..714bf2417284 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -30,8 +30,8 @@ #ifdef CONFIG_SMP #define LOCK_PREFIX \ ".section .smp_locks,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661f\n" /* address */ \ + ".balign 4\n" \ + ".long 661f - .\n" /* offset */ \ ".previous\n" \ "661:\n\tlock; " diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1a160d5d44d0..936738427223 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) } extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -extern u8 *__smp_locks[], *__smp_locks_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; static void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. @@ -235,37 +235,39 @@ void __init_or_module apply_alternatives(struct alt_instr *start, #ifdef CONFIG_SMP -static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) +static void alternatives_smp_lock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) { - u8 **ptr; + const s32 *poff; mutex_lock(&text_mutex); - for (ptr = start; ptr < end; ptr++) { - if (*ptr < text) - continue; - if (*ptr > text_end) + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) continue; /* turn DS segment override prefix into lock prefix */ - text_poke(*ptr, ((unsigned char []){0xf0}), 1); + text_poke(ptr, ((unsigned char []){0xf0}), 1); }; mutex_unlock(&text_mutex); } -static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) +static void alternatives_smp_unlock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) { - u8 **ptr; + const s32 *poff; if (noreplace_smp) return; mutex_lock(&text_mutex); - for (ptr = start; ptr < end; ptr++) { - if (*ptr < text) - continue; - if (*ptr > text_end) + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) continue; /* turn lock prefix into DS segment override prefix */ - text_poke(*ptr, ((unsigned char []){0x3E}), 1); + text_poke(ptr, ((unsigned char []){0x3E}), 1); }; mutex_unlock(&text_mutex); } @@ -276,8 +278,8 @@ struct smp_alt_module { char *name; /* ptrs to lock prefixes */ - u8 **locks; - u8 **locks_end; + const s32 *locks; + const s32 *locks_end; /* .text segment, needed to avoid patching init code ;) */ u8 *text; @@ -398,16 +400,19 @@ void alternatives_smp_switch(int smp) int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; - u8 **ptr; + const s32 *poff; u8 *text_start = start; u8 *text_end = end; list_for_each_entry(mod, &smp_alt_modules, next) { if (mod->text > text_end || mod->text_end < text_start) continue; - for (ptr = mod->locks; ptr < mod->locks_end; ptr++) - if (text_start <= *ptr && text_end >= *ptr) + for (poff = mod->locks; poff < mod->locks_end; poff++) { + const u8 *ptr = (const u8 *)poff + *poff; + + if (text_start <= ptr && text_end > ptr) return 1; + } } return 0; -- cgit v1.2.3 From b701a47ba48b698976fb2fe05fb285b0edc1d26a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 29 Apr 2010 16:03:57 -0700 Subject: x86: Fix LOCK_PREFIX_HERE for uniprocessor build Checkin b3ac891b67bd4b1fc728d1c784cad1212dea433d: x86: Add support for lock prefix in alternatives ... did not define LOCK_PREFIX_HERE in the case of a uniprocessor build. As a result, it would cause any of the usages of this macro to fail on a uniprocessor build. Fix this by defining LOCK_PREFIX_HERE as a null string. Signed-off-by: H. Peter Anvin Cc: Luca Barbieri LKML-Reference: <1267005265-27958-2-git-send-email-luca@luca-barbieri.com> --- arch/x86/include/asm/alternative.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include/asm/alternative.h') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 55fee12cea6d..e29a6c9bba00 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -38,6 +38,7 @@ #define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " #else /* ! CONFIG_SMP */ +#define LOCK_PREFIX_HERE "" #define LOCK_PREFIX "" #endif -- cgit v1.2.3