diff options
Diffstat (limited to 'arch/x86/lib')
-rw-r--r-- | arch/x86/lib/clear_page_64.S | 66 | ||||
-rw-r--r-- | arch/x86/lib/copy_page_64.S | 37 | ||||
-rw-r--r-- | arch/x86/lib/copy_user_64.S | 46 | ||||
-rw-r--r-- | arch/x86/lib/memcpy_64.S | 68 | ||||
-rw-r--r-- | arch/x86/lib/memmove_64.S | 19 | ||||
-rw-r--r-- | arch/x86/lib/memset_64.S | 61 |
6 files changed, 96 insertions, 201 deletions
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index f2145cfa12a6..e67e579c93bd 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,31 +1,35 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/cpufeature.h> #include <asm/alternative-asm.h> /* - * Zero a page. - * rdi page - */ -ENTRY(clear_page_c) + * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is + * recommended to use this when possible and we do use them by default. + * If enhanced REP MOVSB/STOSB is not available, try to use fast string. + * Otherwise, use original. + */ + +/* + * Zero a page. + * %rdi - page + */ +ENTRY(clear_page) CFI_STARTPROC + + ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp clear_page_c_e", X86_FEATURE_ERMS + movl $4096/8,%ecx xorl %eax,%eax rep stosq ret CFI_ENDPROC -ENDPROC(clear_page_c) +ENDPROC(clear_page) -ENTRY(clear_page_c_e) +ENTRY(clear_page_orig) CFI_STARTPROC - movl $4096,%ecx - xorl %eax,%eax - rep stosb - ret - CFI_ENDPROC -ENDPROC(clear_page_c_e) -ENTRY(clear_page) - CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -45,29 +49,13 @@ ENTRY(clear_page) nop ret CFI_ENDPROC -.Lclear_page_end: -ENDPROC(clear_page) - - /* - * Some CPUs support enhanced REP MOVSB/STOSB instructions. - * It is recommended to use this when possible. - * If enhanced REP MOVSB/STOSB is not available, try to use fast string. - * Otherwise, use original function. - * - */ +ENDPROC(clear_page_orig) -#include <asm/cpufeature.h> - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: .byte 0xeb /* jmp <disp8> */ - .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ -3: - .previous - .section .altinstructions,"a" - altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ - .Lclear_page_end-clear_page, 2b-1b - altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ - .Lclear_page_end-clear_page,3b-2b - .previous +ENTRY(clear_page_c_e) + CFI_STARTPROC + movl $4096,%ecx + xorl %eax,%eax + rep stosb + ret + CFI_ENDPROC +ENDPROC(clear_page_c_e) diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 176cca67212b..8239dbcbf984 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -2,23 +2,26 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/cpufeature.h> #include <asm/alternative-asm.h> +/* + * Some CPUs run faster using the string copy instructions (sane microcode). + * It is also a lot simpler. Use this when possible. But, don't use streaming + * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the + * prefetch distance based on SMP/UP. + */ ALIGN -copy_page_rep: +ENTRY(copy_page) CFI_STARTPROC + ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret CFI_ENDPROC -ENDPROC(copy_page_rep) - -/* - * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. - * Could vary the prefetch distance based on SMP/UP. -*/ +ENDPROC(copy_page) -ENTRY(copy_page) +ENTRY(copy_page_regs) CFI_STARTPROC subq $2*8, %rsp CFI_ADJUST_CFA_OFFSET 2*8 @@ -90,21 +93,5 @@ ENTRY(copy_page) addq $2*8, %rsp CFI_ADJUST_CFA_OFFSET -2*8 ret -.Lcopy_page_end: CFI_ENDPROC -ENDPROC(copy_page) - - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - -#include <asm/cpufeature.h> - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ - .Lcopy_page_end-copy_page, 2b-1b - .previous +ENDPROC(copy_page_regs) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index dee945d55594..fa997dfaef24 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -8,9 +8,6 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> - -#define FIX_ALIGNMENT 1 - #include <asm/current.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -19,33 +16,7 @@ #include <asm/asm.h> #include <asm/smap.h> -/* - * By placing feature2 after feature1 in altinstructions section, we logically - * implement: - * If CPU has feature2, jmp to alt2 is used - * else if CPU has feature1, jmp to alt1 is used - * else jmp to orig is used. - */ - .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 -0: - .byte 0xe9 /* 32bit jump */ - .long \orig-1f /* by default jump to orig */ -1: - .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt1-1b /* offset */ /* or alternatively to alt1 */ -3: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt2-1b /* offset */ /* or alternatively to alt2 */ - .previous - - .section .altinstructions,"a" - altinstruction_entry 0b,2b,\feature1,5,5 - altinstruction_entry 0b,3b,\feature2,5,5 - .previous - .endm - .macro ALIGN_DESTINATION -#ifdef FIX_ALIGNMENT /* check for bad alignment of destination */ movl %edi,%ecx andl $7,%ecx @@ -67,7 +38,6 @@ _ASM_EXTABLE(100b,103b) _ASM_EXTABLE(101b,103b) -#endif .endm /* Standard copy_to_user with segment limit checking */ @@ -79,9 +49,11 @@ ENTRY(_copy_to_user) jc bad_to_user cmpq TI_addr_limit(%rax),%rcx ja bad_to_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ - copy_user_generic_unrolled,copy_user_generic_string, \ - copy_user_enhanced_fast_string + ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ + "jmp copy_user_generic_string", \ + X86_FEATURE_REP_GOOD, \ + "jmp copy_user_enhanced_fast_string", \ + X86_FEATURE_ERMS CFI_ENDPROC ENDPROC(_copy_to_user) @@ -94,9 +66,11 @@ ENTRY(_copy_from_user) jc bad_from_user cmpq TI_addr_limit(%rax),%rcx ja bad_from_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ - copy_user_generic_unrolled,copy_user_generic_string, \ - copy_user_enhanced_fast_string + ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ + "jmp copy_user_generic_string", \ + X86_FEATURE_REP_GOOD, \ + "jmp copy_user_enhanced_fast_string", \ + X86_FEATURE_ERMS CFI_ENDPROC ENDPROC(_copy_from_user) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 89b53c9968e7..b046664f5a1c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,12 +1,20 @@ /* Copyright 2002 Andi Kleen */ #include <linux/linkage.h> - #include <asm/cpufeature.h> #include <asm/dwarf2.h> #include <asm/alternative-asm.h> /* + * We build a jump to memcpy_orig by default which gets NOPped out on + * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which + * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs + * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. + */ + +.weak memcpy + +/* * memcpy - Copy a memory block. * * Input: @@ -17,15 +25,11 @@ * Output: * rax original destination */ +ENTRY(__memcpy) +ENTRY(memcpy) + ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp memcpy_erms", X86_FEATURE_ERMS -/* - * memcpy_c() - fast string ops (REP MOVSQ) based variant. - * - * This gets patched over the unrolled variant (below) via the - * alternative instructions framework: - */ - .section .altinstr_replacement, "ax", @progbits -.Lmemcpy_c: movq %rdi, %rax movq %rdx, %rcx shrq $3, %rcx @@ -34,29 +38,21 @@ movl %edx, %ecx rep movsb ret -.Lmemcpy_e: - .previous +ENDPROC(memcpy) +ENDPROC(__memcpy) /* - * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than - * memcpy_c. Use memcpy_c_e when possible. - * - * This gets patched over the unrolled variant (below) via the - * alternative instructions framework: + * memcpy_erms() - enhanced fast string memcpy. This is faster and + * simpler than memcpy. Use memcpy_erms when possible. */ - .section .altinstr_replacement, "ax", @progbits -.Lmemcpy_c_e: +ENTRY(memcpy_erms) movq %rdi, %rax movq %rdx, %rcx rep movsb ret -.Lmemcpy_e_e: - .previous - -.weak memcpy +ENDPROC(memcpy_erms) -ENTRY(__memcpy) -ENTRY(memcpy) +ENTRY(memcpy_orig) CFI_STARTPROC movq %rdi, %rax @@ -183,26 +179,4 @@ ENTRY(memcpy) .Lend: retq CFI_ENDPROC -ENDPROC(memcpy) -ENDPROC(__memcpy) - - /* - * Some CPUs are adding enhanced REP MOVSB/STOSB feature - * If the feature is supported, memcpy_c_e() is the first choice. - * If enhanced rep movsb copy is not available, use fast string copy - * memcpy_c() when possible. This is faster and code is simpler than - * original memcpy(). - * Otherwise, original memcpy() is used. - * In .altinstructions section, ERMS feature is placed after REG_GOOD - * feature to implement the right patch order. - * - * Replace only beginning, memcpy is used to apply alternatives, - * so it is silly to overwrite itself with nops - reboot is the - * only outcome... - */ - .section .altinstructions, "a" - altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ - .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c - altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ - .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e - .previous +ENDPROC(memcpy_orig) diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 9c4b530575da..0f8a0d0331b9 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -5,7 +5,6 @@ * This assembly file is re-written from memmove_64.c file. * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> */ -#define _STRING_C #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/cpufeature.h> @@ -44,6 +43,8 @@ ENTRY(__memmove) jg 2f .Lmemmove_begin_forward: + ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS + /* * movsq instruction have many startup latency * so we handle small size by general register. @@ -207,21 +208,5 @@ ENTRY(__memmove) 13: retq CFI_ENDPROC - - .section .altinstr_replacement,"ax" -.Lmemmove_begin_forward_efs: - /* Forward moving data. */ - movq %rdx, %rcx - rep movsb - retq -.Lmemmove_end_forward_efs: - .previous - - .section .altinstructions,"a" - altinstruction_entry .Lmemmove_begin_forward, \ - .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ - .Lmemmove_end_forward-.Lmemmove_begin_forward, \ - .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs - .previous ENDPROC(__memmove) ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 6f44935c6a60..93118fb23976 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -5,19 +5,30 @@ #include <asm/cpufeature.h> #include <asm/alternative-asm.h> +.weak memset + /* * ISO C memset - set a memory block to a byte value. This function uses fast * string to get better performance than the original function. The code is * simpler and shorter than the orignal function as well. - * + * * rdi destination - * rsi value (char) - * rdx count (bytes) - * + * rsi value (char) + * rdx count (bytes) + * * rax original destination - */ - .section .altinstr_replacement, "ax", @progbits -.Lmemset_c: + */ +ENTRY(memset) +ENTRY(__memset) + /* + * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended + * to use it when possible. If not available, use fast string instructions. + * + * Otherwise, use original memset function. + */ + ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp memset_erms", X86_FEATURE_ERMS + movq %rdi,%r9 movq %rdx,%rcx andl $7,%edx @@ -31,8 +42,8 @@ rep stosb movq %r9,%rax ret -.Lmemset_e: - .previous +ENDPROC(memset) +ENDPROC(__memset) /* * ISO C memset - set a memory block to a byte value. This function uses @@ -45,21 +56,16 @@ * * rax original destination */ - .section .altinstr_replacement, "ax", @progbits -.Lmemset_c_e: +ENTRY(memset_erms) movq %rdi,%r9 movb %sil,%al movq %rdx,%rcx rep stosb movq %r9,%rax ret -.Lmemset_e_e: - .previous - -.weak memset +ENDPROC(memset_erms) -ENTRY(memset) -ENTRY(__memset) +ENTRY(memset_orig) CFI_STARTPROC movq %rdi,%r10 @@ -134,23 +140,4 @@ ENTRY(__memset) jmp .Lafter_bad_alignment .Lfinal: CFI_ENDPROC -ENDPROC(memset) -ENDPROC(__memset) - - /* Some CPUs support enhanced REP MOVSB/STOSB feature. - * It is recommended to use this when possible. - * - * If enhanced REP MOVSB/STOSB feature is not available, use fast string - * instructions. - * - * Otherwise, use original memset function. - * - * In .altinstructions section, ERMS feature is placed after REG_GOOD - * feature to implement the right patch order. - */ - .section .altinstructions,"a" - altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ - .Lfinal-__memset,.Lmemset_e-.Lmemset_c - altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ - .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e - .previous +ENDPROC(memset_orig) |