summaryrefslogtreecommitdiff
path: root/arch/loongarch/vdso/vgetrandom-chacha.S
blob: 7e86a50f6e85c369d9390c55d661b680b1532604 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
 */

#include <asm/asm.h>
#include <asm/regdef.h>
#include <linux/linkage.h>

.text

/*
 * ChaCha quarter-round on four 32-bit state words.
 *
 * QR_STEP is a single add-xor-rotate step: acc += src, dst ^= acc, then dst
 * is rotated right by amt bits. A quarter-round chains four such steps with
 * right-rotations of 16, 20, 24 and 25 bits, which are the same as
 * left-rotations of 16, 12, 8 and 7 bits on a 32-bit word.
 */
.macro	QR_STEP	acc src dst amt
	add.w		\acc, \acc, \src
	xor		\dst, \dst, \acc
	rotri.w		\dst, \dst, \amt
.endm

.macro	QR	a b c d
	QR_STEP		\a, \b, \d, 16
	QR_STEP		\c, \d, \b, 20
	QR_STEP		\a, \b, \d, 24
	QR_STEP		\c, \d, \b, 25
.endm

/*
 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and
 * 8-byte counter. Importantly does not spill sensitive data to the stack
 * (only the caller's s0-s9 are saved there). Its arguments are:
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * The block count must be nonzero: it is decremented and tested only after
 * a block has been written. On return the counter at a2 has been advanced
 * by one per block produced.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* We don't need a frame pointer, so fp serves as a tenth saved register */
#define s9		fp

/*
 * Register allocation. The sixteen state words live in s0-s9, a5-a7 and
 * t0-t2; the 64-bit counter is kept as two 32-bit halves in t3/t4; the
 * first three constant words are cached in t5-t7.
 */
#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
#define i		a4
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		a5
#define state11		a6
#define state12		a7
#define state13		t0
#define state14		t1
#define state15		t2
#define cnt_lo		t3
#define cnt_hi		t4
#define copy0		t5
#define copy1		t6
#define copy2		t7

/* Reuse i as copy3: the two are never live at the same time */
#define copy3		i

	/*
	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack.
	 */
	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
	REG_S		s0, sp, 0
	REG_S		s1, sp, SZREG
	REG_S		s2, sp, SZREG * 2
	REG_S		s3, sp, SZREG * 3
	REG_S		s4, sp, SZREG * 4
	REG_S		s5, sp, SZREG * 5
	REG_S		s6, sp, SZREG * 6
	REG_S		s7, sp, SZREG * 7
	REG_S		s8, sp, SZREG * 8
	REG_S		s9, sp, SZREG * 9

	/* copy[0,1,2] = "expa", "nd 3", "2-by"; loaded once, reused per block */
	li.w		copy0, 0x61707865
	li.w		copy1, 0x3320646e
	li.w		copy2, 0x79622d32

	/* Load the 64-bit block counter as two 32-bit halves */
	ld.w		cnt_lo, counter, 0
	ld.w		cnt_hi, counter, 4

.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	move		state0, copy0
	move		state1, copy1
	move		state2, copy2
	li.w		state3, 0x6b206574

	/* state[4,5,..,11] = key */
	ld.w		state4, key, 0
	ld.w		state5, key, 4
	ld.w		state6, key, 8
	ld.w		state7, key, 12
	ld.w		state8, key, 16
	ld.w		state9, key, 20
	ld.w		state10, key, 24
	ld.w		state11, key, 28

	/* state[12,13] = counter */
	move		state12, cnt_lo
	move		state13, cnt_hi

	/* state[14,15] = 0 (the nonce) */
	move		state14, zero
	move		state15, zero

	/* 10 iterations of (odd round + even round) = 20 rounds */
	li.w		i, 10
.Lpermute:
	/* odd round: quarter-rounds over the four columns */
	QR		state0, state4, state8, state12
	QR		state1, state5, state9, state13
	QR		state2, state6, state10, state14
	QR		state3, state7, state11, state15

	/* even round: quarter-rounds over the four diagonals */
	QR		state0, state5, state10, state15
	QR		state1, state6, state11, state12
	QR		state2, state7, state8, state13
	QR		state3, state4, state9, state14

	addi.w		i, i, -1
	bnez		i, .Lpermute

	/*
	 * copy[3] = "te k", materialize it here because copy[3] shares the
	 * same register with i which just became dead.
	 */
	li.w		copy3, 0x6b206574

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	add.w		state0, state0, copy0
	add.w		state1, state1, copy1
	add.w		state2, state2, copy2
	add.w		state3, state3, copy3
	st.w		state0, output, 0
	st.w		state1, output, 4
	st.w		state2, output, 8
	st.w		state3, output, 12

	/* from now on state[0,1,2,3] are scratch registers */

	/* state[0,1,2,3] = key words 0-3 (first 16 bytes of the key) */
	ld.w		state0, key, 0
	ld.w		state1, key, 4
	ld.w		state2, key, 8
	ld.w		state3, key, 12

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	add.w		state4, state4, state0
	add.w		state5, state5, state1
	add.w		state6, state6, state2
	add.w		state7, state7, state3
	st.w		state4, output, 16
	st.w		state5, output, 20
	st.w		state6, output, 24
	st.w		state7, output, 28

	/* state[0,1,2,3] = key words 4-7 (last 16 bytes of the key) */
	ld.w		state0, key, 16
	ld.w		state1, key, 20
	ld.w		state2, key, 24
	ld.w		state3, key, 28

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	add.w		state8, state8, state0
	add.w		state9, state9, state1
	add.w		state10, state10, state2
	add.w		state11, state11, state3
	st.w		state8, output, 32
	st.w		state9, output, 36
	st.w		state10, output, 40
	st.w		state11, output, 44

	/*
	 * output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0];
	 * adding 0 is a no-op, so state[14,15] are stored as they are.
	 */
	add.w		state12, state12, cnt_lo
	add.w		state13, state13, cnt_hi
	st.w		state12, output, 48
	st.w		state13, output, 52
	st.w		state14, output, 56
	st.w		state15, output, 60

	/*
	 * ++counter: state0 (scratch) = 1 iff cnt_lo wrapped around to 0,
	 * i.e. the carry into cnt_hi.
	 */
	addi.w		cnt_lo, cnt_lo, 1
	sltui		state0, cnt_lo, 1
	add.w		cnt_hi, cnt_hi, state0

	/* output += 64 */
	PTR_ADDI	output, output, 64
	/* --nblocks; tested after the block is written, so it must start > 0 */
	PTR_ADDI	nblocks, nblocks, -1
	bnez		nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	st.w		cnt_lo, counter, 0
	st.w		cnt_hi, counter, 4

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. As of now copy[0,1,2,3] just contain "expand 32-byte k" and
	 * state[0,...,9] live in s0-s9, which are restored in the epilogue, so
	 * we only need to zero state[10,...,15].
	 *
	 * NOTE(review): cnt_lo/cnt_hi are not cleared; they hold the counter
	 * value that was just written back through `counter` - confirm this
	 * is acceptable.
	 */
	move		state10, zero
	move		state11, zero
	move		state12, zero
	move		state13, zero
	move		state14, zero
	move		state15, zero

	/* Restore the caller's s0-s9 and undo the prologue's sp adjustment */
	REG_L		s0, sp, 0
	REG_L		s1, sp, SZREG
	REG_L		s2, sp, SZREG * 2
	REG_L		s3, sp, SZREG * 3
	REG_L		s4, sp, SZREG * 4
	REG_L		s5, sp, SZREG * 5
	REG_L		s6, sp, SZREG * 6
	REG_L		s7, sp, SZREG * 7
	REG_L		s8, sp, SZREG * 8
	REG_L		s9, sp, SZREG * 9
	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)

	jr		ra
SYM_FUNC_END(__arch_chacha20_blocks_nostack)