author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-16 05:31:34 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-19 03:05:28 +0300 |
commit | 577e6a7fd50d519c201d20968b6a027a6563dc4c (patch) | |
tree | ad8e4f8a75f78b627bf7ce28aabff48cac32bb54 /arch/x86/lib | |
parent | 3639a535587d7aac449cdce9710dfdc97a3c8c8e (diff) | |
download | linux-577e6a7fd50d519c201d20968b6a027a6563dc4c.tar.xz | |
x86: inline the 'rep movs' in user copies for the FSRM case
This does the same thing for the user copies as commit 0db7058e8e23
("x86/clear_user: Make it faster") did for clear_user(). In other
words, it inlines the "rep movs" case when X86_FEATURE_FSRM is set,
avoiding the function call entirely.
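As a sketch of the call-site idea (the header-side change is outside this diffstat, so the function name below is illustrative and SMAP handling via stac()/clac() is omitted): once the register conventions line up, the alternatives machinery can patch the out-of-line call into a bare 'rep movsb' on FSRM parts.

```c
#include <linux/compiler.h>	/* __always_inline */
#include <asm/alternative.h>	/* ALTERNATIVE() */
#include <asm/asm.h>		/* _ASM_EXTABLE_UA(), ASM_CALL_CONSTRAINT */
#include <asm/cpufeatures.h>	/* X86_FEATURE_FSRM */

/*
 * Sketch only: copy_user_fsrm_sketch() is not a symbol added by this
 * patch.  The call is rewritten to 'rep movsb' when the CPU has FSRM.
 */
static __always_inline unsigned long
copy_user_fsrm_sketch(void *to, const void *from, unsigned long len)
{
	asm volatile(
		"1:\n\t"
		ALTERNATIVE("call copy_user_generic_unrolled",	/* default   */
			    "rep movsb",			/* FSRM CPUs */
			    X86_FEATURE_FSRM)
		"2:\n"
		_ASM_EXTABLE_UA(1b, 2b)
		: "+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
		: : "memory", "rax", "rdx", "r8", "r9", "r10", "r11");

	return len;	/* %rcx on exit: bytes not copied, 0 on success */
}
```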
In order to do that, it makes the calling convention for the out-of-line
case ("copy_user_generic_unrolled") match the 'rep movs' calling
convention, although it does also end up clobbering a number of
additional registers.
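A minimal sketch of that 'rep movs' register contract, with the fault handling, SMAP and the unrolled fallback all left out, just the bare convention the out-of-line routine now follows:

```c
/*
 * in:  %rdi = destination, %rsi = source, %rcx = byte count
 * out: %rcx = bytes NOT copied, so 0 means success
 */
static inline unsigned long rep_movs_convention(void *dst, const void *src,
						unsigned long count)
{
	asm volatile("rep movsb"
		     : "+c" (count), "+D" (dst), "+S" (src)
		     : : "memory");

	return count;	/* whatever 'rep movsb' left in %rcx */
}
```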
Also, to simplify code sharing in the low-level assembly with the
__copy_user_nocache() function (that uses the normal C calling
convention), we end up with a kind of mixed return value for the
low-level asm code: it will return the result in both %rcx (to work as
an alternative for the 'rep movs' case), _and_ in %rax (for the nocache
case).
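A toy illustration of that mixed result (the helper name is made up and it compiles in user space): the tail publishes the same leftover count in %rax for the C-convention caller and in %rcx for the 'rep movs'-convention caller, which is what the added 'movl %edx,%ecx' and 'xor %ecx,%ecx' lines in the diff below do.

```c
/*
 * Hypothetical sketch: mirror one "bytes left" value into both %rax and
 * %rcx, like the 'movl %edx,%eax; movl %edx,%ecx' tail in the assembly.
 */
static inline void mixed_return_sketch(unsigned long left,
				       unsigned long *c_style,		/* %rax */
				       unsigned long *movs_style)	/* %rcx */
{
	unsigned long in_rax, in_rcx;

	asm("mov %[left], %%rax\n\t"	/* what a C caller would read     */
	    "mov %[left], %%rcx"	/* what the 'rep movs' site reads */
	    : "=&a" (in_rax), "=&c" (in_rcx)
	    : [left] "r" (left));

	*c_style = in_rax;
	*movs_style = in_rcx;
}
```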
We could avoid this by wrapping __copy_user_nocache() callers in an
inline asm, but since the cost is just an extra register copy, it's
probably not worth it.
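For reference, a hedged sketch of what that avoided wrapper could look like, assuming the three-argument convention described above and a hypothetical variant of __copy_user_nocache() that reported its result only in %rcx (deliberately not what the patch does):

```c
#include <asm/asm.h>	/* ASM_CALL_CONSTRAINT */

/*
 * Hypothetical shim: move a %rcx-only result back into the normal C
 * return path.  The extra register copy in the asm tail is cheaper
 * than adding one of these at every caller.
 */
static inline unsigned long
copy_user_nocache_shim(void *dst, const void *src, unsigned long size)
{
	unsigned long uncopied;

	asm volatile("call __copy_user_nocache"
		     : "=c" (uncopied), "+D" (dst), "+S" (src), "+d" (size),
		       ASM_CALL_CONSTRAINT
		     : : "rax", "r8", "r9", "r10", "r11", "memory");

	return uncopied;
}
```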
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86/lib')
-rw-r--r-- | arch/x86/lib/copy_user_64.S | 55 |
1 file changed, 21 insertions, 34 deletions
```diff
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 818f2f728294..16a743f11b11 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -45,13 +45,29 @@
  * Input:
  * rdi destination
  * rsi source
- * rdx count
+ * rcx count
  *
  * Output:
- * eax uncopied bytes or 0 if successful.
+ * rcx uncopied bytes or 0 if successful.
+ *
+ * NOTE! The calling convention is very intentionally the same as
+ * for 'rep movs', so that we can rewrite the function call with
+ * just a plain 'rep movs' on machines that have FSRM.
+ *
+ * HOWEVER! This function ends up having a lot of the code common
+ * with __copy_user_nocache(), which is a normal C function, and
+ * has a similar calling convention, but gets the 'count' in %rdx,
+ * and returns the result in %rax.
+ *
+ * To share as much code as possible, we end up returning the
+ * result in *both* %rcx/%rax, and we also move the initial count
+ * into %rdx.
+ *
+ * We can clobber rdx/rsi/rdi and r8-r11
  */
 SYM_FUNC_START(copy_user_generic_unrolled)
-	cmpl $8,%edx
+	movl %ecx,%edx
+	cmpl $8,%ecx
 	jb .Lcopy_user_short_string_bytes
 	ALIGN_DESTINATION
 	movl %edx,%ecx
@@ -104,37 +120,6 @@ SYM_FUNC_END(copy_user_generic_unrolled)
 EXPORT_SYMBOL(copy_user_generic_unrolled)
 
 /*
- * Some CPUs support FSRM for Fast Short REP MOVS.
- *
- * Only 4GB of copy is supported. This shouldn't be a problem
- * because the kernel normally only writes from/to page sized chunks
- * even if user space passed a longer buffer.
- * And more would be dangerous because both Intel and AMD have
- * errata with rep movsq > 4GB. If someone feels the need to fix
- * this please consider this.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_FUNC_START(copy_user_fast_string)
-	movl %edx,%ecx
-1:	rep movsb
-	xorl %eax,%eax
-	RET
-
-12:	movl %ecx,%eax		/* ecx is zerorest also */
-	RET
-
-	_ASM_EXTABLE_CPY(1b, 12b)
-SYM_FUNC_END(copy_user_fast_string)
-EXPORT_SYMBOL(copy_user_fast_string)
-
-/*
  * Try to copy last bytes and clear the rest if needed.
  * Since protection fault in copy_from/to_user is not a normal situation,
  * it is not necessary to optimize tail handling.
@@ -160,6 +145,7 @@ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
 
 3:	movl %edx,%eax
+	movl %edx,%ecx
 	RET
 
 	_ASM_EXTABLE_CPY(1b, 2b)
@@ -203,6 +189,7 @@ SYM_CODE_START_LOCAL(copy_user_short_string)
 	decl %ecx
 	jnz 21b
 23:	xor %eax,%eax
+	xor %ecx,%ecx
 	RET
 
 40:	leal (%rdx,%rcx,8),%edx
```