1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
|
/* Copyright 2002 Andi Kleen, SuSE Labs */
#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
.weak memset
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
* simpler and shorter than the orignal function as well.
*
* rdi destination
* rsi value (char)
* rdx count (bytes)
*
* rax original destination
*/
ENTRY(memset)
ENTRY(__memset)
/*
* Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
* to use it when possible. If not available, use fast string instructions.
*
* Otherwise, use original memset function.
*/
ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
"jmp memset_erms", X86_FEATURE_ERMS
movq %rdi,%r9
movq %rdx,%rcx
andl $7,%edx
shrq $3,%rcx
/* expand byte value */
movzbl %sil,%esi
movabs $0x0101010101010101,%rax
imulq %rsi,%rax
rep stosq
movl %edx,%ecx
rep stosb
movq %r9,%rax
ret
ENDPROC(memset)
ENDPROC(__memset)
/*
* ISO C memset - set a memory block to a byte value. This function uses
* enhanced rep stosb to override the fast string function.
* The code is simpler and shorter than the fast string function as well.
*
* rdi destination
* rsi value (char)
* rdx count (bytes)
*
* rax original destination
*/
ENTRY(memset_erms)
movq %rdi,%r9
movb %sil,%al
movq %rdx,%rcx
rep stosb
movq %r9,%rax
ret
ENDPROC(memset_erms)
ENTRY(memset_orig)
movq %rdi,%r10
/* expand byte value */
movzbl %sil,%ecx
movabs $0x0101010101010101,%rax
imulq %rcx,%rax
/* align dst */
movl %edi,%r9d
andl $7,%r9d
jnz .Lbad_alignment
.Lafter_bad_alignment:
movq %rdx,%rcx
shrq $6,%rcx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
decq %rcx
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
movq %rax,32(%rdi)
movq %rax,40(%rdi)
movq %rax,48(%rdi)
movq %rax,56(%rdi)
leaq 64(%rdi),%rdi
jnz .Lloop_64
/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
.Lhandle_tail:
movl %edx,%ecx
andl $63&(~7),%ecx
jz .Lhandle_7
shrl $3,%ecx
.p2align 4
.Lloop_8:
decl %ecx
movq %rax,(%rdi)
leaq 8(%rdi),%rdi
jnz .Lloop_8
.Lhandle_7:
andl $7,%edx
jz .Lende
.p2align 4
.Lloop_1:
decl %edx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
jnz .Lloop_1
.Lende:
movq %r10,%rax
ret
.Lbad_alignment:
cmpq $7,%rdx
jbe .Lhandle_7
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%rdx
jmp .Lafter_bad_alignment
.Lfinal:
ENDPROC(memset_orig)
|