/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2024 Christophe Leroy , CS GROUP France
 */

/*
 * Syntax: GNU as for PowerPC, preprocessed by cpp (hence the #define /
 * #ifdef usage and C-style comments only).
 */

#include <linux/linkage.h>	/* SYM_FUNC_START() / SYM_FUNC_END() */

#include <asm/ppc_asm.h>	/* NOTE(review): expected to supply the rN register names - confirm */

/*
 * Incoming arguments.
 *
 * Several argument registers are recycled once their original content has
 * been consumed (see the aliases below), so the order of the loads at
 * function entry matters.
 */
#define	dst_bytes	r3
#define	key		r4
#define	counter		r5
#define	nblocks		r6

/* Remaining-blocks count; copied out of nblocks before r6 becomes key1. */
#define	idx_r0		r0
/* Holds the constant 4 once the key pointer in r4 is no longer needed. */
#define	val4		r4

/* "expand 32-byte k" as four little-endian 32-bit words. */
#define	const0		0x61707865
#define	const1		0x3320646e
#define	const2		0x79622d32
#define	const3		0x6b206574

/* The eight 32-bit key words, kept in registers for the whole call. */
#define	key0		r5
#define	key1		r6
#define	key2		r7
#define	key3		r8
#define	key4		r9
#define	key5		r10
#define	key6		r11
#define	key7		r12

/* Block counter, low word / high word. */
#define	counter0	r14
#define	counter1	r15

/* The sixteen 32-bit words of the working state. */
#define	state0		r16
#define	state1		r17
#define	state2		r18
#define	state3		r19
#define	state4		r20
#define	state5		r21
#define	state6		r22
#define	state7		r23
#define	state8		r24
#define	state9		r25
#define	state10		r26
#define	state11		r27
#define	state12		r28
#define	state13		r29
#define	state14		r30
#define	state15		r31

/*
 * Four quarter-rounds on four independent (a, b, c, d) register tuples.
 * The four instances are interleaved one instruction at a time, so that
 * consecutive instructions never depend on each other.
 *
 * For each tuple:
 *	a += b; d ^= a; d = rotl32(d, 16);
 *	c += d; b ^= c; b = rotl32(b, 12);
 *	a += b; d ^= a; d = rotl32(d, 8);
 *	c += d; b ^= c; b = rotl32(b, 7);
 *
 * Modifies only the sixteen registers passed in; no condition register
 * field, CA or CTR is touched (no record or carrying forms are used).
 */
.macro quarterround4 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 a4 b4 c4 d4
	/* a += b */
	add	\a1, \a1, \b1
	add	\a2, \a2, \b2
	add	\a3, \a3, \b3
	add	\a4, \a4, \b4
	/* d ^= a */
	xor	\d1, \d1, \a1
	xor	\d2, \d2, \a2
	xor	\d3, \d3, \a3
	xor	\d4, \d4, \a4
	/* d = rotl32(d, 16) */
	rotlwi	\d1, \d1, 16
	rotlwi	\d2, \d2, 16
	rotlwi	\d3, \d3, 16
	rotlwi	\d4, \d4, 16
	/* c += d */
	add	\c1, \c1, \d1
	add	\c2, \c2, \d2
	add	\c3, \c3, \d3
	add	\c4, \c4, \d4
	/* b ^= c */
	xor	\b1, \b1, \c1
	xor	\b2, \b2, \c2
	xor	\b3, \b3, \c3
	xor	\b4, \b4, \c4
	/* b = rotl32(b, 12) */
	rotlwi	\b1, \b1, 12
	rotlwi	\b2, \b2, 12
	rotlwi	\b3, \b3, 12
	rotlwi	\b4, \b4, 12
	/* a += b */
	add	\a1, \a1, \b1
	add	\a2, \a2, \b2
	add	\a3, \a3, \b3
	add	\a4, \a4, \b4
	/* d ^= a */
	xor	\d1, \d1, \a1
	xor	\d2, \d2, \a2
	xor	\d3, \d3, \a3
	xor	\d4, \d4, \a4
	/* d = rotl32(d, 8) */
	rotlwi	\d1, \d1, 8
	rotlwi	\d2, \d2, 8
	rotlwi	\d3, \d3, 8
	rotlwi	\d4, \d4, 8
	/* c += d */
	add	\c1, \c1, \d1
	add	\c2, \c2, \d2
	add	\c3, \c3, \d3
	add	\c4, \c4, \d4
	/* b ^= c */
	xor	\b1, \b1, \c1
	xor	\b2, \b2, \c2
	xor	\b3, \b3, \c3
	xor	\b4, \b4, \c4
	/* b = rotl32(b, 7) */
	rotlwi	\b1, \b1, 7
	rotlwi	\b2, \b2, 7
	rotlwi	\b3, \b3, 7
	rotlwi	\b4, \b4, 7
.endm

/*
 * cpp front end for quarterround4: takes state word indices (0..15) and
 * pastes them onto "state" so that the stateN register aliases are used.
 */
#define QUARTERROUND4(a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,a4,b4,c4,d4) \
	quarterround4 state##a1 state##b1 state##c1 state##d1 \
		      state##a2 state##b2 state##c2 state##d2 \
		      state##a3 state##b3 state##c3 state##d3 \
		      state##a4 state##b4 state##c4 state##d4

/*
 * Very basic 32 bits implementation of ChaCha20. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and
 * 8-byte counter. Importantly, neither the key nor the cipher state is ever
 * spilled to the stack: the only things written there are the caller's
 * r14-r31 and the counter pointer.
 *
 * Arguments:
 *
 *	r3: output bytes (nblocks * 64 bytes are written, as little-endian words)
 *	r4: 32-byte key input
 *	r5: 8-byte counter input/output (the pointer is saved on stack)
 *	r6: number of 64-byte blocks to write to output; must be non-zero, the
 *	    loop tests the count only after a block has been produced
 *
 * Register usage inside the function:
 *
 *	r0: counter of blocks (initialised with r6)
 *	r4: Value '4' after key has been read.
 *	r5-r12: key
 *	r14-r15: counter
 *	r16-r31: state
 *
 * On return the counter at r5 has been advanced by nblocks, r14-r31 hold
 * their entry values again, r6-r12 are zero and r5 holds the counter pointer,
 * so no key word is left behind in a register. CR0, CTR and, on 32 bits, CA
 * are modified.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
#ifdef __powerpc64__
	/*
	 * 64 bits: no frame is created, everything is stored at negative
	 * offsets from r1.
	 * NOTE(review): relies on the ABI letting a leaf function use that
	 * area below the stack pointer - confirm for the targeted ABIs.
	 */
	std	counter, -216(r1)

	std	r14, -144(r1)
	std	r15, -136(r1)
	std	r16, -128(r1)
	std	r17, -120(r1)
	std	r18, -112(r1)
	std	r19, -104(r1)
	std	r20, -96(r1)
	std	r21, -88(r1)
	std	r22, -80(r1)
	std	r23, -72(r1)
	std	r24, -64(r1)
	std	r25, -56(r1)
	std	r26, -48(r1)
	std	r27, -40(r1)
	std	r28, -32(r1)
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)
#else
	/*
	 * 32 bits: 96-byte frame; counter pointer at 20(r1), r14-r31 at
	 * 24(r1)..92(r1).
	 */
	stwu	r1, -96(r1)
	stw	counter, 20(r1)
#ifdef __BIG_ENDIAN__
	stmw	r14, 24(r1)
#else
	/* Little-endian builds store the registers one at a time. */
	stw	r14, 24(r1)
	stw	r15, 28(r1)
	stw	r16, 32(r1)
	stw	r17, 36(r1)
	stw	r18, 40(r1)
	stw	r19, 44(r1)
	stw	r20, 48(r1)
	stw	r21, 52(r1)
	stw	r22, 56(r1)
	stw	r23, 60(r1)
	stw	r24, 64(r1)
	stw	r25, 68(r1)
	stw	r26, 72(r1)
	stw	r27, 76(r1)
	stw	r28, 80(r1)
	stw	r29, 84(r1)
	stw	r30, 88(r1)
	stw	r31, 92(r1)
#endif
#endif /* __powerpc64__ */

	/*
	 * Read the counter and the block count first: r5 (counter) and
	 * r6 (nblocks) are about to be overwritten by key0 and key1.
	 */
	lwz	counter0, 0(counter)
	lwz	counter1, 4(counter)
#ifdef __powerpc64__
	/* counter0 = counter1 << 32 | counter0: full 64-bit counter in one GPR */
	rldimi	counter0, counter1, 32, 0
#endif
	mr	idx_r0, nblocks
	/* Bias dst by -4 so that the stores below can use offsets 4..64. */
	subi	dst_bytes, dst_bytes, 4

	lwz	key0, 0(key)
	lwz	key1, 4(key)
	lwz	key2, 8(key)
	lwz	key3, 12(key)
	lwz	key4, 16(key)
	lwz	key5, 20(key)
	lwz	key6, 24(key)
	lwz	key7, 28(key)

	/* The key pointer is dead from here on; r4 becomes the constant 4. */
	li	val4, 4
.Lblock:
	/* r31 is state15; it is free to use as a temporary until it is set below. */
	li	r31, 10

	/* state[0..3] = constants, built as @ha << 16 plus sign-extended @l */
	lis	state0, const0@ha
	lis	state1, const1@ha
	lis	state2, const2@ha
	lis	state3, const3@ha
	addi	state0, state0, const0@l
	addi	state1, state1, const1@l
	addi	state2, state2, const2@l
	addi	state3, state3, const3@l

	/* CTR = 10 loop iterations of two rounds each */
	mtctr	r31

	/* state[4..11] = key */
	mr	state4, key0
	mr	state5, key1
	mr	state6, key2
	mr	state7, key3
	mr	state8, key4
	mr	state9, key5
	mr	state10, key6
	mr	state11, key7

	/* state[12..13] = counter */
	mr	state12, counter0
	mr	state13, counter1

	/* state[14..15] = nonce, which is always 0 */
	li	state14, 0
	li	state15, 0

.Lpermute:
	/* Column round, then diagonal round. */
	QUARTERROUND4( 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15)
	QUARTERROUND4( 0, 5,10,15, 1, 6,11,12, 2, 7, 8,13, 3, 4, 9,14)

	bdnz	.Lpermute

	/* Feed-forward: add the initial state back in (state[14..15] += 0). */
	addis	state0, state0, const0@ha
	addis	state1, state1, const1@ha
	addis	state2, state2, const2@ha
	addis	state3, state3, const3@ha
	addi	state0, state0, const0@l
	addi	state1, state1, const1@l
	addi	state2, state2, const2@l
	addi	state3, state3, const3@l

	add	state4, state4, key0
	add	state5, state5, key1
	add	state6, state6, key2
	add	state7, state7, key3
	add	state8, state8, key4
	add	state9, state9, key5
	add	state10, state10, key6
	add	state11, state11, key7

	add	state12, state12, counter0
	add	state13, state13, counter1

#ifdef __BIG_ENDIAN__
	/*
	 * Byte-reversed stores, so the output words are little-endian.
	 * stwbrx only has an indexed form: words are written in pairs at
	 * dst_bytes + val4 (= +4) and, after advancing dst_bytes by 8, at
	 * dst_bytes + 0 (an RA operand of 0 means the value zero, not r0).
	 * After the eight pairs dst_bytes has moved forward by 64.
	 */
	stwbrx	state0, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state1, 0, dst_bytes
	stwbrx	state2, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state3, 0, dst_bytes
	stwbrx	state4, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state5, 0, dst_bytes
	stwbrx	state6, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state7, 0, dst_bytes
	stwbrx	state8, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state9, 0, dst_bytes
	stwbrx	state10, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state11, 0, dst_bytes
	stwbrx	state12, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state13, 0, dst_bytes
	stwbrx	state14, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state15, 0, dst_bytes
#else
	/*
	 * Plain stores at offsets 4..64 from the biased pointer; the final
	 * store-with-update also advances dst_bytes by 64 for the next block.
	 */
	stw	state0, 4(dst_bytes)
	stw	state1, 8(dst_bytes)
	stw	state2, 12(dst_bytes)
	stw	state3, 16(dst_bytes)
	stw	state4, 20(dst_bytes)
	stw	state5, 24(dst_bytes)
	stw	state6, 28(dst_bytes)
	stw	state7, 32(dst_bytes)
	stw	state8, 36(dst_bytes)
	stw	state9, 40(dst_bytes)
	stw	state10, 44(dst_bytes)
	stw	state11, 48(dst_bytes)
	stw	state12, 52(dst_bytes)
	stw	state13, 56(dst_bytes)
	stw	state14, 60(dst_bytes)
	stwu	state15, 64(dst_bytes)
#endif

	/*
	 * One block done. CR0 set here is consumed by the bne below; none of
	 * the counter-update instructions in between writes CR0.
	 */
	subic.	idx_r0, idx_r0, 1	/* subi. can't use r0 as source */

#ifdef __powerpc64__
	/* 64-bit increment, then refresh the high-word copy. */
	addi	counter0, counter0, 1
	srdi	counter1, counter0, 32
#else
	/* 64-bit increment across two registers: carry goes through CA. */
	addic	counter0, counter0, 1
	addze	counter1, counter1
#endif

	bne	.Lblock

	/* Reload the counter pointer (this overwrites key0) and write back. */
#ifdef __powerpc64__
	ld	counter, -216(r1)
#else
	lwz	counter, 20(r1)
#endif
	stw	counter0, 0(counter)
	stw	counter1, 4(counter)

	/*
	 * Clear the registers that still hold key words; key0 (r5) has
	 * already been replaced by the counter pointer just above.
	 */
	li	r6, 0
	li	r7, 0
	li	r8, 0
	li	r9, 0
	li	r10, 0
	li	r11, 0
	li	r12, 0

	/* Restore the caller's r14-r31, which replaces counter and state. */
#ifdef __powerpc64__
	ld	r14, -144(r1)
	ld	r15, -136(r1)
	ld	r16, -128(r1)
	ld	r17, -120(r1)
	ld	r18, -112(r1)
	ld	r19, -104(r1)
	ld	r20, -96(r1)
	ld	r21, -88(r1)
	ld	r22, -80(r1)
	ld	r23, -72(r1)
	ld	r24, -64(r1)
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
#else
#ifdef __BIG_ENDIAN__
	lmw	r14, 24(r1)
#else
	lwz	r14, 24(r1)
	lwz	r15, 28(r1)
	lwz	r16, 32(r1)
	lwz	r17, 36(r1)
	lwz	r18, 40(r1)
	lwz	r19, 44(r1)
	lwz	r20, 48(r1)
	lwz	r21, 52(r1)
	lwz	r22, 56(r1)
	lwz	r23, 60(r1)
	lwz	r24, 64(r1)
	lwz	r25, 68(r1)
	lwz	r26, 72(r1)
	lwz	r27, 76(r1)
	lwz	r28, 80(r1)
	lwz	r29, 84(r1)
	lwz	r30, 88(r1)
	lwz	r31, 92(r1)
#endif
	/* Drop the 96-byte frame. */
	addi	r1, r1, 96
#endif /* __powerpc64__ */
	blr
SYM_FUNC_END(__arch_chacha20_blocks_nostack)