diff options
author | Jan Beulich <JBeulich@suse.com> | 2012-01-26 19:55:32 +0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2012-01-27 00:19:20 +0400 |
commit | 9d8e22777e66f420e46490e9fc6f8cb7e0e2222b (patch) | |
tree | dd0ec6122dda1409206dda70f6ae4fd3c9a2cd35 /arch/x86 | |
parent | 2ab560911a427fdc73bfd3a7d2944d8ee0ca6db8 (diff) | |
download | linux-9d8e22777e66f420e46490e9fc6f8cb7e0e2222b.tar.xz |
x86-64: Handle byte-wise tail copying in memcpy() without a loop
While hard to measure, reducing the number of possibly/likely
mis-predicted branches can generally be expected to be slightly
better.
Other than apparent at the first glance, this also doesn't grow
the function size (the alignment gap to the next function just
gets smaller).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/lib/memcpy_64.S | 19 |
1 files changed, 10 insertions, 9 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 1235b04a9a60..1c273be7c97e 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -164,18 +164,19 @@ ENTRY(memcpy) retq .p2align 4 .Lless_3bytes: - cmpl $0, %edx - je .Lend + subl $1, %edx + jb .Lend /* * Move data from 1 bytes to 3 bytes. */ -.Lloop_1: - movb (%rsi), %r8b - movb %r8b, (%rdi) - incq %rdi - incq %rsi - decl %edx - jnz .Lloop_1 + movzbl (%rsi), %ecx + jz .Lstore_1byte + movzbq 1(%rsi), %r8 + movzbq (%rsi, %rdx), %r9 + movb %r8b, 1(%rdi) + movb %r9b, (%rdi, %rdx) +.Lstore_1byte: + movb %cl, (%rdi) .Lend: retq |