// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2024 Xi Ruoyao . All Rights Reserved.
 */

/*
 * NOTE(review): the header names below were missing from the text under
 * review and have been restored from what this file uses: the PTR_ADDI,
 * REG_S, REG_L, SZREG and STACK_ALIGN helpers, the symbolic register names
 * (a0, s0, t0, fp, ...) and SYM_FUNC_START/SYM_FUNC_END.  Confirm against
 * the tree.
 */
#include <asm/asm.h>
#include <asm/regdef.h>
#include <linux/linkage.h>

.text

/*
 * ChaCha quarter-round on the four 32-bit words \a, \b, \c, \d.
 *
 * rotri.w rotates right, so the right-rotations by 16, 20, 24 and 25 used
 * here are the left-rotations by 16, 12, 8 and 7 of the ChaCha definition.
 * All four operands are updated in place; no other register is touched.
 */
.macro	QR	a b c d
	add.w		\a, \a, \b
	xor		\d, \d, \a
	rotri.w		\d, \d, 16

	add.w		\c, \c, \d
	xor		\b, \b, \c
	rotri.w		\b, \b, 20

	add.w		\a, \a, \b
	xor		\d, \d, \a
	rotri.w		\d, \d, 24

	add.w		\c, \c, \d
	xor		\b, \b, \c
	rotri.w		\b, \b, 25
.endm

/*
 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and
 * 8-byte counter. Importantly does not spill to the stack. Its arguments
 * are:
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * The block loop tests nblocks only after decrementing it, so a3 must be
 * non-zero on entry.  The counter is read once on entry as two 32-bit
 * words, advanced by one per block with carry from the low word into the
 * high word, and written back once after the last block.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* We don't need a frame pointer, so fp is used as a tenth saved register */
#define s9		fp

#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
#define i		a4
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		a5
#define state11		a6
#define state12		a7
#define state13		t0
#define state14		t1
#define state15		t2
#define cnt_lo		t3
#define cnt_hi		t4
#define copy0		t5
#define copy1		t6
#define copy2		t7

/* Reuse i as copy3 */
#define copy3		i

	/*
	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack.
	 */
	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
	REG_S		s0, sp, 0
	REG_S		s1, sp, SZREG
	REG_S		s2, sp, SZREG * 2
	REG_S		s3, sp, SZREG * 3
	REG_S		s4, sp, SZREG * 4
	REG_S		s5, sp, SZREG * 5
	REG_S		s6, sp, SZREG * 6
	REG_S		s7, sp, SZREG * 7
	REG_S		s8, sp, SZREG * 8
	REG_S		s9, sp, SZREG * 9

	/*
	 * copy[0,1,2] = "expa", "nd 3", "2-by" (little-endian).  These stay
	 * live across all blocks; the fourth constant word has no register
	 * of its own and is rematerialized when needed.
	 */
	li.w		copy0, 0x61707865
	li.w		copy1, 0x3320646e
	li.w		copy2, 0x79622d32

	ld.w		cnt_lo, counter, 0
	ld.w		cnt_hi, counter, 4

.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	move		state0, copy0
	move		state1, copy1
	move		state2, copy2
	li.w		state3, 0x6b206574

	/* state[4,5,..,11] = key */
	ld.w		state4, key, 0
	ld.w		state5, key, 4
	ld.w		state6, key, 8
	ld.w		state7, key, 12
	ld.w		state8, key, 16
	ld.w		state9, key, 20
	ld.w		state10, key, 24
	ld.w		state11, key, 28

	/* state[12,13] = counter */
	move		state12, cnt_lo
	move		state13, cnt_hi

	/* state[14,15] = 0 (the all-zero nonce) */
	move		state14, zero
	move		state15, zero

	/* 10 iterations of (odd round + even round) = 20 rounds */
	li.w		i, 10
.Lpermute:
	/* odd round: quarter-rounds down the columns */
	QR		state0, state4, state8, state12
	QR		state1, state5, state9, state13
	QR		state2, state6, state10, state14
	QR		state3, state7, state11, state15

	/* even round: quarter-rounds along the diagonals */
	QR		state0, state5, state10, state15
	QR		state1, state6, state11, state12
	QR		state2, state7, state8, state13
	QR		state3, state4, state9, state14

	addi.w		i, i, -1
	bnez		i, .Lpermute

	/*
	 * copy[3] = "te k", materialize it here because copy[3] shares the
	 * same register with i which just became dead.
	 */
	li.w		copy3, 0x6b206574

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	add.w		state0, state0, copy0
	add.w		state1, state1, copy1
	add.w		state2, state2, copy2
	add.w		state3, state3, copy3
	st.w		state0, output, 0
	st.w		state1, output, 4
	st.w		state2, output, 8
	st.w		state3, output, 12

	/* from now on state[0,1,2,3] are scratch registers */

	/* state[0,1,2,3] = key words 0-3 (first 16 bytes of the key) */
	ld.w		state0, key, 0
	ld.w		state1, key, 4
	ld.w		state2, key, 8
	ld.w		state3, key, 12

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	add.w		state4, state4, state0
	add.w		state5, state5, state1
	add.w		state6, state6, state2
	add.w		state7, state7, state3
	st.w		state4, output, 16
	st.w		state5, output, 20
	st.w		state6, output, 24
	st.w		state7, output, 28

	/* state[0,1,2,3] = key words 4-7 (last 16 bytes of the key) */
	ld.w		state0, key, 16
	ld.w		state1, key, 20
	ld.w		state2, key, 24
	ld.w		state3, key, 28

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	add.w		state8, state8, state0
	add.w		state9, state9, state1
	add.w		state10, state10, state2
	add.w		state11, state11, state3
	st.w		state8, output, 32
	st.w		state9, output, 36
	st.w		state10, output, 40
	st.w		state11, output, 44

	/*
	 * output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0]
	 * (adding the zero nonce words is a no-op, so state[14,15] are
	 * stored as they are)
	 */
	add.w		state12, state12, cnt_lo
	add.w		state13, state13, cnt_hi
	st.w		state12, output, 48
	st.w		state13, output, 52
	st.w		state14, output, 56
	st.w		state15, output, 60

	/*
	 * ++counter: state0 (scratch) becomes 1 exactly when cnt_lo wrapped
	 * around to 0, i.e. it is the carry into cnt_hi.
	 */
	addi.w		cnt_lo, cnt_lo, 1
	sltui		state0, cnt_lo, 1
	add.w		cnt_hi, cnt_hi, state0

	/* output += 64 */
	PTR_ADDI	output, output, 64

	/* --nblocks */
	PTR_ADDI	nblocks, nblocks, -1
	bnez		nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	st.w		cnt_lo, counter, 0
	st.w		cnt_hi, counter, 4

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. As of now copy[0,1,2,3] just contain "expand 32-byte k",
	 * cnt_lo/cnt_hi hold only the counter value that was just written
	 * back to memory, and state[0,...,9] are s0-s9 which we'll restore in
	 * the epilogue, so we only need to zero state[10,...,15].
	 */
	move		state10, zero
	move		state11, zero
	move		state12, zero
	move		state13, zero
	move		state14, zero
	move		state15, zero

	REG_L		s0, sp, 0
	REG_L		s1, sp, SZREG
	REG_L		s2, sp, SZREG * 2
	REG_L		s3, sp, SZREG * 3
	REG_L		s4, sp, SZREG * 4
	REG_L		s5, sp, SZREG * 5
	REG_L		s6, sp, SZREG * 6
	REG_L		s7, sp, SZREG * 7
	REG_L		s8, sp, SZREG * 8
	REG_L		s9, sp, SZREG * 9
	/* Undo the prologue's stack adjustment exactly */
	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)

	jr		ra
SYM_FUNC_END(__arch_chacha20_blocks_nostack)