/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2024 Christophe Leroy <[email protected]>, CS GROUP France
*/
#include <linux/linkage.h>
#include <asm/ppc_asm.h>
#define dst_bytes r3
#define key r4
#define counter r5
#define nblocks r6
#define idx_r0 r0
#define val4 r4
#define const0 0x61707865
#define const1 0x3320646e
#define const2 0x79622d32
#define const3 0x6b206574
#define key0 r5
#define key1 r6
#define key2 r7
#define key3 r8
#define key4 r9
#define key5 r10
#define key6 r11
#define key7 r12
#define counter0 r14
#define counter1 r15
#define state0 r16
#define state1 r17
#define state2 r18
#define state3 r19
#define state4 r20
#define state5 r21
#define state6 r22
#define state7 r23
#define state8 r24
#define state9 r25
#define state10 r26
#define state11 r27
#define state12 r28
#define state13 r29
#define state14 r30
#define state15 r31
.macro quarterround4 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 a4 b4 c4 d4
add \a1, \a1, \b1
add \a2, \a2, \b2
add \a3, \a3, \b3
add \a4, \a4, \b4
xor \d1, \d1, \a1
xor \d2, \d2, \a2
xor \d3, \d3, \a3
xor \d4, \d4, \a4
rotlwi \d1, \d1, 16
rotlwi \d2, \d2, 16
rotlwi \d3, \d3, 16
rotlwi \d4, \d4, 16
add \c1, \c1, \d1
add \c2, \c2, \d2
add \c3, \c3, \d3
add \c4, \c4, \d4
xor \b1, \b1, \c1
xor \b2, \b2, \c2
xor \b3, \b3, \c3
xor \b4, \b4, \c4
rotlwi \b1, \b1, 12
rotlwi \b2, \b2, 12
rotlwi \b3, \b3, 12
rotlwi \b4, \b4, 12
add \a1, \a1, \b1
add \a2, \a2, \b2
add \a3, \a3, \b3
add \a4, \a4, \b4
xor \d1, \d1, \a1
xor \d2, \d2, \a2
xor \d3, \d3, \a3
xor \d4, \d4, \a4
rotlwi \d1, \d1, 8
rotlwi \d2, \d2, 8
rotlwi \d3, \d3, 8
rotlwi \d4, \d4, 8
add \c1, \c1, \d1
add \c2, \c2, \d2
add \c3, \c3, \d3
add \c4, \c4, \d4
xor \b1, \b1, \c1
xor \b2, \b2, \c2
xor \b3, \b3, \c3
xor \b4, \b4, \c4
rotlwi \b1, \b1, 7
rotlwi \b2, \b2, 7
rotlwi \b3, \b3, 7
rotlwi \b4, \b4, 7
.endm
#define QUARTERROUND4(a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,a4,b4,c4,d4) \
quarterround4 state##a1 state##b1 state##c1 state##d1 \
state##a2 state##b2 state##c2 state##d2 \
state##a3 state##b3 state##c3 state##d3 \
state##a4 state##b4 state##c4 state##d4
/*
* Very basic 32 bits implementation of ChaCha20. Produces a given positive number
* of blocks of output with a nonce of 0, taking an input key and 8-byte
* counter. Importantly does not spill to the stack. Its arguments are:
*
* r3: output bytes
* r4: 32-byte key input
* r5: 8-byte counter input/output (saved on stack)
* r6: number of 64-byte blocks to write to output
*
* r0: counter of blocks (initialised with r6)
* r4: Value '4' after key has been read.
* r5-r12: key
* r14-r15: counter
* r16-r31: state
*/
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
#ifdef __powerpc64__
std counter, -216(r1)
std r14, -144(r1)
std r15, -136(r1)
std r16, -128(r1)
std r17, -120(r1)
std r18, -112(r1)
std r19, -104(r1)
std r20, -96(r1)
std r21, -88(r1)
std r22, -80(r1)
std r23, -72(r1)
std r24, -64(r1)
std r25, -56(r1)
std r26, -48(r1)
std r27, -40(r1)
std r28, -32(r1)
std r29, -24(r1)
std r30, -16(r1)
std r31, -8(r1)
#else
stwu r1, -96(r1)
stw counter, 20(r1)
#ifdef __BIG_ENDIAN__
stmw r14, 24(r1)
#else
stw r14, 24(r1)
stw r15, 28(r1)
stw r16, 32(r1)
stw r17, 36(r1)
stw r18, 40(r1)
stw r19, 44(r1)
stw r20, 48(r1)
stw r21, 52(r1)
stw r22, 56(r1)
stw r23, 60(r1)
stw r24, 64(r1)
stw r25, 68(r1)
stw r26, 72(r1)
stw r27, 76(r1)
stw r28, 80(r1)
stw r29, 84(r1)
stw r30, 88(r1)
stw r31, 92(r1)
#endif
#endif /* __powerpc64__ */
lwz counter0, 0(counter)
lwz counter1, 4(counter)
#ifdef __powerpc64__
rldimi counter0, counter1, 32, 0
#endif
mr idx_r0, nblocks
subi dst_bytes, dst_bytes, 4
lwz key0, 0(key)
lwz key1, 4(key)
lwz key2, 8(key)
lwz key3, 12(key)
lwz key4, 16(key)
lwz key5, 20(key)
lwz key6, 24(key)
lwz key7, 28(key)
li val4, 4
.Lblock:
li r31, 10
lis state0, const0@ha
lis state1, const1@ha
lis state2, const2@ha
lis state3, const3@ha
addi state0, state0, const0@l
addi state1, state1, const1@l
addi state2, state2, const2@l
addi state3, state3, const3@l
mtctr r31
mr state4, key0
mr state5, key1
mr state6, key2
mr state7, key3
mr state8, key4
mr state9, key5
mr state10, key6
mr state11, key7
mr state12, counter0
mr state13, counter1
li state14, 0
li state15, 0
.Lpermute:
QUARTERROUND4( 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15)
QUARTERROUND4( 0, 5,10,15, 1, 6,11,12, 2, 7, 8,13, 3, 4, 9,14)
bdnz .Lpermute
addis state0, state0, const0@ha
addis state1, state1, const1@ha
addis state2, state2, const2@ha
addis state3, state3, const3@ha
addi state0, state0, const0@l
addi state1, state1, const1@l
addi state2, state2, const2@l
addi state3, state3, const3@l
add state4, state4, key0
add state5, state5, key1
add state6, state6, key2
add state7, state7, key3
add state8, state8, key4
add state9, state9, key5
add state10, state10, key6
add state11, state11, key7
add state12, state12, counter0
add state13, state13, counter1
#ifdef __BIG_ENDIAN__
stwbrx state0, val4, dst_bytes
addi dst_bytes, dst_bytes, 8
stwbrx state1, 0, dst_bytes
stwbrx state2, val4, dst_bytes
addi dst_bytes, dst_bytes, 8
stwbrx state3, 0, dst_bytes
stwbrx state4, val4, dst_bytes
addi dst_bytes, dst_bytes, 8
stwbrx state5, 0, dst_bytes
stwbrx state6, val4, dst_bytes
addi dst_bytes, dst_bytes, 8
stwbrx state7, 0, dst_bytes
stwbrx state8, val4, dst_bytes
addi dst_bytes, dst_bytes, 8
stwbrx state9, 0, dst_bytes
stwbrx state10, val4, dst_bytes
addi dst_bytes, dst_bytes, 8
stwbrx state11, 0, dst_bytes
stwbrx state12, val4, dst_bytes
addi dst_bytes, dst_bytes, 8
stwbrx state13, 0, dst_bytes
stwbrx state14, val4, dst_bytes
addi dst_bytes, dst_bytes, 8
stwbrx state15, 0, dst_bytes
#else
stw state0, 4(dst_bytes)
stw state1, 8(dst_bytes)
stw state2, 12(dst_bytes)
stw state3, 16(dst_bytes)
stw state4, 20(dst_bytes)
stw state5, 24(dst_bytes)
stw state6, 28(dst_bytes)
stw state7, 32(dst_bytes)
stw state8, 36(dst_bytes)
stw state9, 40(dst_bytes)
stw state10, 44(dst_bytes)
stw state11, 48(dst_bytes)
stw state12, 52(dst_bytes)
stw state13, 56(dst_bytes)
stw state14, 60(dst_bytes)
stwu state15, 64(dst_bytes)
#endif
subic. idx_r0, idx_r0, 1 /* subi. can't use r0 as source */
#ifdef __powerpc64__
addi counter0, counter0, 1
srdi counter1, counter0, 32
#else
addic counter0, counter0, 1
addze counter1, counter1
#endif
bne .Lblock
#ifdef __powerpc64__
ld counter, -216(r1)
#else
lwz counter, 20(r1)
#endif
stw counter0, 0(counter)
stw counter1, 4(counter)
li r6, 0
li r7, 0
li r8, 0
li r9, 0
li r10, 0
li r11, 0
li r12, 0
#ifdef __powerpc64__
ld r14, -144(r1)
ld r15, -136(r1)
ld r16, -128(r1)
ld r17, -120(r1)
ld r18, -112(r1)
ld r19, -104(r1)
ld r20, -96(r1)
ld r21, -88(r1)
ld r22, -80(r1)
ld r23, -72(r1)
ld r24, -64(r1)
ld r25, -56(r1)
ld r26, -48(r1)
ld r27, -40(r1)
ld r28, -32(r1)
ld r29, -24(r1)
ld r30, -16(r1)
ld r31, -8(r1)
#else
#ifdef __BIG_ENDIAN__
lmw r14, 24(r1)
#else
lwz r14, 24(r1)
lwz r15, 28(r1)
lwz r16, 32(r1)
lwz r17, 36(r1)
lwz r18, 40(r1)
lwz r19, 44(r1)
lwz r20, 48(r1)
lwz r21, 52(r1)
lwz r22, 56(r1)
lwz r23, 60(r1)
lwz r24, 64(r1)
lwz r25, 68(r1)
lwz r26, 72(r1)
lwz r27, 76(r1)
lwz r28, 80(r1)
lwz r29, 84(r1)
lwz r30, 88(r1)
lwz r31, 92(r1)
#endif
addi r1, r1, 96
#endif /* __powerpc64__ */
blr
SYM_FUNC_END(__arch_chacha20_blocks_nostack)