linux/arch/s390/crypto/chacha-s390.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Original implementation written by Andy Polyakov, @dot-asm.
 * This is an adaptation of the original code for kernel use.
 *
 * Copyright (C) 2006-2019 CRYPTOGAMS by <[email protected]>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/nospec-insn.h>
#include <asm/fpu-insn.h>

#define SP	%r15
#define FRAME	(16 * 8 + 4 * 8)

	.data
	.balign	32
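# Constant table, addressed relative to "sigma":
#   0x00       ChaCha constants ("expand 32-byte k")
#   0x10-0x3f  counter increments +1, +2, +3
#   0x40       VPERM mask producing little-endian words (byte swap)
#   0x50       per-lane counter offsets 0,1,2,3 (4x code path)
#   0x60-0x9f  ChaCha constants with each word replicated ("smashed")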

SYM_DATA_START_LOCAL(sigma)
	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# endian-neutral
	.long	1,0,0,0
	.long	2,0,0,0
	.long	3,0,0,0
	.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c	# byte swap

	.long	0,1,2,3
	.long	0x61707865,0x61707865,0x61707865,0x61707865	# smashed sigma
	.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e
	.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32
	.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574
SYM_DATA_END(sigma)

	.previous

	GEN_BR_THUNK %r14

	.text

#############################################################################
# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
#		      const u32 *key, const u32 *counter)
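#
# Generates up to four 64-byte ChaCha20 blocks in parallel.  The state is
# kept "smashed": XAn/XBn/XCn/XDn hold word n of rows a/b/c/d, and element
# i of every register belongs to block i.  The counter in memory is left
# unchanged; the caller is expected to advance it by the number of blocks
# consumed.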

#define	OUT		%r2
#define	INP		%r3
#define	LEN		%r4
#define	KEY		%r5
#define	COUNTER		%r6

#define BEPERM		%v31
#define CTR		%v26

#define K0		%v16
#define K1		%v17
#define K2		%v18
#define K3		%v19

#define XA0		%v0
#define XA1		%v1
#define XA2		%v2
#define XA3		%v3

#define XB0		%v4
#define XB1		%v5
#define XB2		%v6
#define XB3		%v7

#define XC0		%v8
#define XC1		%v9
#define XC2		%v10
#define XC3		%v11

#define XD0		%v12
#define XD1		%v13
#define XD2		%v14
#define XD3		%v15

#define XT0		%v27
#define XT1		%v28
#define XT2		%v29
#define XT3		%v30

SYM_FUNC_START(chacha20_vx_4x)
	stmg	%r6,%r7,6*8(SP)

	larl	%r7,sigma
	lhi	%r0,10
	lhi	%r1,0

	VL	K0,0,,%r7		# load sigma
	VL	K1,0,,KEY		# load key
	VL	K2,16,,KEY
	VL	K3,0,,COUNTER		# load counter

	VL	BEPERM,0x40,,%r7
	VL	CTR,0x50,,%r7

	VLM	XA0,XA3,0x60,%r7,4	# load [smashed] sigma

	VREPF	XB0,K1,0		# smash the key
	VREPF	XB1,K1,1
	VREPF	XB2,K1,2
	VREPF	XB3,K1,3

	VREPF	XD0,K3,0
	VREPF	XD1,K3,1
	VREPF	XD2,K3,2
	VREPF	XD3,K3,3
	VAF	XD0,XD0,CTR

	VREPF	XC0,K2,0
	VREPF	XC1,K2,1
	VREPF	XC2,K2,2
	VREPF	XC3,K2,3

.Loop_4x:
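	# %r0 = 10 iterations of the ChaCha double round (column + diagonal).
	# Each quarter-round is the usual a += b; d ^= a; d <<<= 16;
	# c += d; b ^= c; b <<<= 12; a += b; d ^= a; d <<<= 8;
	# c += d; b ^= c; b <<<= 7, applied to all four blocks at once.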
	VAF	XA0,XA0,XB0
	VX	XD0,XD0,XA0
	VERLLF	XD0,XD0,16

	VAF	XA1,XA1,XB1
	VX	XD1,XD1,XA1
	VERLLF	XD1,XD1,16

	VAF	XA2,XA2,XB2
	VX	XD2,XD2,XA2
	VERLLF	XD2,XD2,16

	VAF	XA3,XA3,XB3
	VX	XD3,XD3,XA3
	VERLLF	XD3,XD3,16

	VAF	XC0,XC0,XD0
	VX	XB0,XB0,XC0
	VERLLF	XB0,XB0,12

	VAF	XC1,XC1,XD1
	VX	XB1,XB1,XC1
	VERLLF	XB1,XB1,12

	VAF	XC2,XC2,XD2
	VX	XB2,XB2,XC2
	VERLLF	XB2,XB2,12

	VAF	XC3,XC3,XD3
	VX	XB3,XB3,XC3
	VERLLF	XB3,XB3,12

	VAF	XA0,XA0,XB0
	VX	XD0,XD0,XA0
	VERLLF	XD0,XD0,8

	VAF	XA1,XA1,XB1
	VX	XD1,XD1,XA1
	VERLLF	XD1,XD1,8

	VAF	XA2,XA2,XB2
	VX	XD2,XD2,XA2
	VERLLF	XD2,XD2,8

	VAF	XA3,XA3,XB3
	VX	XD3,XD3,XA3
	VERLLF	XD3,XD3,8

	VAF	XC0,XC0,XD0
	VX	XB0,XB0,XC0
	VERLLF	XB0,XB0,7

	VAF	XC1,XC1,XD1
	VX	XB1,XB1,XC1
	VERLLF	XB1,XB1,7

	VAF	XC2,XC2,XD2
	VX	XB2,XB2,XC2
	VERLLF	XB2,XB2,7

	VAF	XC3,XC3,XD3
	VX	XB3,XB3,XC3
	VERLLF	XB3,XB3,7
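
	# diagonal round: the same quarter-round, with the b, c and d
	# operands rotated by one, two and three positions (XA0 pairs
	# with XB1, XC2, XD3, and so on)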

	VAF	XA0,XA0,XB1
	VX	XD3,XD3,XA0
	VERLLF	XD3,XD3,16

	VAF	XA1,XA1,XB2
	VX	XD0,XD0,XA1
	VERLLF	XD0,XD0,16

	VAF	XA2,XA2,XB3
	VX	XD1,XD1,XA2
	VERLLF	XD1,XD1,16

	VAF	XA3,XA3,XB0
	VX	XD2,XD2,XA3
	VERLLF	XD2,XD2,16

	VAF	XC2,XC2,XD3
	VX	XB1,XB1,XC2
	VERLLF	XB1,XB1,12

	VAF	XC3,XC3,XD0
	VX	XB2,XB2,XC3
	VERLLF	XB2,XB2,12

	VAF	XC0,XC0,XD1
	VX	XB3,XB3,XC0
	VERLLF	XB3,XB3,12

	VAF	XC1,XC1,XD2
	VX	XB0,XB0,XC1
	VERLLF	XB0,XB0,12

	VAF	XA0,XA0,XB1
	VX	XD3,XD3,XA0
	VERLLF	XD3,XD3,8

	VAF	XA1,XA1,XB2
	VX	XD0,XD0,XA1
	VERLLF	XD0,XD0,8

	VAF	XA2,XA2,XB3
	VX	XD1,XD1,XA2
	VERLLF	XD1,XD1,8

	VAF	XA3,XA3,XB0
	VX	XD2,XD2,XA3
	VERLLF	XD2,XD2,8

	VAF	XC2,XC2,XD3
	VX	XB1,XB1,XC2
	VERLLF	XB1,XB1,7

	VAF	XC3,XC3,XD0
	VX	XB2,XB2,XC3
	VERLLF	XB2,XB2,7

	VAF	XC0,XC0,XD1
	VX	XB3,XB3,XC0
	VERLLF	XB3,XB3,7

	VAF	XC1,XC1,XD2
	VX	XB0,XB0,XC1
	VERLLF	XB0,XB0,7
	brct	%r0,.Loop_4x

	VAF	XD0,XD0,CTR

	VMRHF	XT0,XA0,XA1		# transpose data
	VMRHF	XT1,XA2,XA3
	VMRLF	XT2,XA0,XA1
	VMRLF	XT3,XA2,XA3
	VPDI	XA0,XT0,XT1,0b0000
	VPDI	XA1,XT0,XT1,0b0101
	VPDI	XA2,XT2,XT3,0b0000
	VPDI	XA3,XT2,XT3,0b0101

	VMRHF	XT0,XB0,XB1
	VMRHF	XT1,XB2,XB3
	VMRLF	XT2,XB0,XB1
	VMRLF	XT3,XB2,XB3
	VPDI	XB0,XT0,XT1,0b0000
	VPDI	XB1,XT0,XT1,0b0101
	VPDI	XB2,XT2,XT3,0b0000
	VPDI	XB3,XT2,XT3,0b0101

	VMRHF	XT0,XC0,XC1
	VMRHF	XT1,XC2,XC3
	VMRLF	XT2,XC0,XC1
	VMRLF	XT3,XC2,XC3
	VPDI	XC0,XT0,XT1,0b0000
	VPDI	XC1,XT0,XT1,0b0101
	VPDI	XC2,XT2,XT3,0b0000
	VPDI	XC3,XT2,XT3,0b0101

	VMRHF	XT0,XD0,XD1
	VMRHF	XT1,XD2,XD3
	VMRLF	XT2,XD0,XD1
	VMRLF	XT3,XD2,XD3
	VPDI	XD0,XT0,XT1,0b0000
	VPDI	XD1,XT0,XT1,0b0101
	VPDI	XD2,XT2,XT3,0b0000
	VPDI	XD3,XT2,XT3,0b0101

	VAF	XA0,XA0,K0
	VAF	XB0,XB0,K1
	VAF	XC0,XC0,K2
	VAF	XD0,XD0,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40

	VAF	XA0,XA1,K0
	VAF	XB0,XB1,K1
	VAF	XC0,XC1,K2
	VAF	XD0,XD1,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_4x

	VAF	XA0,XA2,K0
	VAF	XB0,XB2,K1
	VAF	XC0,XC2,K2
	VAF	XD0,XD2,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_4x

	VAF	XA0,XA3,K0
	VAF	XB0,XB3,K1
	VAF	XC0,XC3,K2
	VAF	XD0,XD3,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

.Ldone_4x:
	lmg	%r6,%r7,6*8(SP)
	BR_EX	%r14

.Ltail_4x:
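	# fewer than 64 bytes remain: store the current keystream block
	# on the stack at 8*8(SP) and XOR it into the output byte by byte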
	VLR	XT0,XC0
	VLR	XT1,XD0

	VST	XA0,8*8+0x00,,SP
	VST	XB0,8*8+0x10,,SP
	VST	XT0,8*8+0x20,,SP
	VST	XT1,8*8+0x30,,SP

	lghi	%r1,0

.Loop_tail_4x:
	llgc	%r5,0(%r1,INP)
	llgc	%r6,8*8(%r1,SP)
	xr	%r6,%r5
	stc	%r6,0(%r1,OUT)
	la	%r1,1(%r1)
	brct	LEN,.Loop_tail_4x

	lmg	%r6,%r7,6*8(SP)
	BR_EX	%r14
SYM_FUNC_END(chacha20_vx_4x)

#undef	OUT
#undef	INP
#undef	LEN
#undef	KEY
#undef	COUNTER

#undef BEPERM

#undef K0
#undef K1
#undef K2
#undef K3


#############################################################################
# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
#		   const u32 *key, const u32 *counter)
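#
# Main entry point.  Requests of up to 256 bytes are handed off to
# chacha20_vx_4x; anything longer is processed here in batches of six
# 64-byte blocks per outer iteration, each block held in its own set of
# vector registers (An/Bn/Cn/Dn).  As with the 4x variant, the counter in
# memory is not written back.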

#define	OUT		%r2
#define	INP		%r3
#define	LEN		%r4
#define	KEY		%r5
#define	COUNTER		%r6

#define BEPERM		%v31

#define K0		%v27
#define K1		%v24
#define K2		%v25
#define K3		%v26

#define A0		%v0
#define B0		%v1
#define C0		%v2
#define D0		%v3

#define A1		%v4
#define B1		%v5
#define C1		%v6
#define D1		%v7

#define A2		%v8
#define B2		%v9
#define C2		%v10
#define D2		%v11

#define A3		%v12
#define B3		%v13
#define C3		%v14
#define D3		%v15

#define A4		%v16
#define B4		%v17
#define C4		%v18
#define D4		%v19

#define A5		%v20
#define B5		%v21
#define C5		%v22
#define D5		%v23

#define T0		%v27
#define T1		%v28
#define T2		%v29
#define T3		%v30

SYM_FUNC_START(chacha20_vx)
	clgfi	LEN,256
	jle	chacha20_vx_4x
	stmg	%r6,%r7,6*8(SP)

	lghi	%r1,-FRAME
	lgr	%r0,SP
	la	SP,0(%r1,SP)
	stg	%r0,0(SP)		# back-chain

	larl	%r7,sigma
	lhi	%r0,10

	VLM	K1,K2,0,KEY,0		# load key
	VL	K3,0,,COUNTER		# load counter

	VLM	K0,BEPERM,0,%r7,4	# load sigma, increments, ...

.Loop_outer_vx:
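	# set up six copies of the initial state; the counter row of
	# copy n is pre-incremented by n (D0 = K[3], ..., D5 = K[3]+5)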
	VLR	A0,K0
	VLR	B0,K1
	VLR	A1,K0
	VLR	B1,K1
	VLR	A2,K0
	VLR	B2,K1
	VLR	A3,K0
	VLR	B3,K1
	VLR	A4,K0
	VLR	B4,K1
	VLR	A5,K0
	VLR	B5,K1

	VLR	D0,K3
	VAF	D1,K3,T1		# K[3]+1
	VAF	D2,K3,T2		# K[3]+2
	VAF	D3,K3,T3		# K[3]+3
	VAF	D4,D2,T2		# K[3]+4
	VAF	D5,D2,T3		# K[3]+5

	VLR	C0,K2
	VLR	C1,K2
	VLR	C2,K2
	VLR	C3,K2
	VLR	C4,K2
	VLR	C5,K2

	VLR	T1,D1
	VLR	T2,D2
	VLR	T3,D3

.Loop_vx:
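	# %r0 = 10 double rounds over six whole-block states; the
	# quarter-rounds run down the columns, then VSLDB rotates the
	# rows so that the same sequence covers the diagonals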
	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,16
	VERLLF	D1,D1,16
	VERLLF	D2,D2,16
	VERLLF	D3,D3,16
	VERLLF	D4,D4,16
	VERLLF	D5,D5,16

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,12
	VERLLF	B1,B1,12
	VERLLF	B2,B2,12
	VERLLF	B3,B3,12
	VERLLF	B4,B4,12
	VERLLF	B5,B5,12

	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,8
	VERLLF	D1,D1,8
	VERLLF	D2,D2,8
	VERLLF	D3,D3,8
	VERLLF	D4,D4,8
	VERLLF	D5,D5,8

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,7
	VERLLF	B1,B1,7
	VERLLF	B2,B2,7
	VERLLF	B3,B3,7
	VERLLF	B4,B4,7
	VERLLF	B5,B5,7
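
	# rotate rows b, c and d left by one, two and three words so the
	# next quarter-rounds operate on the diagonals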

	VSLDB	C0,C0,C0,8
	VSLDB	C1,C1,C1,8
	VSLDB	C2,C2,C2,8
	VSLDB	C3,C3,C3,8
	VSLDB	C4,C4,C4,8
	VSLDB	C5,C5,C5,8
	VSLDB	B0,B0,B0,4
	VSLDB	B1,B1,B1,4
	VSLDB	B2,B2,B2,4
	VSLDB	B3,B3,B3,4
	VSLDB	B4,B4,B4,4
	VSLDB	B5,B5,B5,4
	VSLDB	D0,D0,D0,12
	VSLDB	D1,D1,D1,12
	VSLDB	D2,D2,D2,12
	VSLDB	D3,D3,D3,12
	VSLDB	D4,D4,D4,12
	VSLDB	D5,D5,D5,12

	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,16
	VERLLF	D1,D1,16
	VERLLF	D2,D2,16
	VERLLF	D3,D3,16
	VERLLF	D4,D4,16
	VERLLF	D5,D5,16

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,12
	VERLLF	B1,B1,12
	VERLLF	B2,B2,12
	VERLLF	B3,B3,12
	VERLLF	B4,B4,12
	VERLLF	B5,B5,12

	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,8
	VERLLF	D1,D1,8
	VERLLF	D2,D2,8
	VERLLF	D3,D3,8
	VERLLF	D4,D4,8
	VERLLF	D5,D5,8

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,7
	VERLLF	B1,B1,7
	VERLLF	B2,B2,7
	VERLLF	B3,B3,7
	VERLLF	B4,B4,7
	VERLLF	B5,B5,7
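
	# rotate rows b, c and d back into column order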

	VSLDB	C0,C0,C0,8
	VSLDB	C1,C1,C1,8
	VSLDB	C2,C2,C2,8
	VSLDB	C3,C3,C3,8
	VSLDB	C4,C4,C4,8
	VSLDB	C5,C5,C5,8
	VSLDB	B0,B0,B0,12
	VSLDB	B1,B1,B1,12
	VSLDB	B2,B2,B2,12
	VSLDB	B3,B3,B3,12
	VSLDB	B4,B4,B4,12
	VSLDB	B5,B5,B5,12
	VSLDB	D0,D0,D0,4
	VSLDB	D1,D1,D1,4
	VSLDB	D2,D2,D2,4
	VSLDB	D3,D3,D3,4
	VSLDB	D4,D4,D4,4
	VSLDB	D5,D5,D5,4
	brct	%r0,.Loop_vx

	VAF	A0,A0,K0
	VAF	B0,B0,K1
	VAF	C0,C0,K2
	VAF	D0,D0,K3
	VAF	A1,A1,K0
	VAF	D1,D1,T1		# +K[3]+1

	VPERM	A0,A0,A0,BEPERM
	VPERM	B0,B0,B0,BEPERM
	VPERM	C0,C0,C0,BEPERM
	VPERM	D0,D0,D0,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_vx

	VAF	D2,D2,T2		# +K[3]+2
	VAF	D3,D3,T3		# +K[3]+3
	VLM	T0,T3,0,INP,0

	VX	A0,A0,T0
	VX	B0,B0,T1
	VX	C0,C0,T2
	VX	D0,D0,T3

	VLM	K0,T3,0,%r7,4		# re-load sigma and increments

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	VAF	B1,B1,K1
	VAF	C1,C1,K2

	VPERM	A0,A1,A1,BEPERM
	VPERM	B0,B1,B1,BEPERM
	VPERM	C0,C1,C1,BEPERM
	VPERM	D0,D1,D1,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	VAF	A2,A2,K0
	VAF	B2,B2,K1
	VAF	C2,C2,K2

	VPERM	A0,A2,A2,BEPERM
	VPERM	B0,B2,B2,BEPERM
	VPERM	C0,C2,C2,BEPERM
	VPERM	D0,D2,D2,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	VAF	A3,A3,K0
	VAF	B3,B3,K1
	VAF	C3,C3,K2
	VAF	D2,K3,T3		# K[3]+3

	VPERM	A0,A3,A3,BEPERM
	VPERM	B0,B3,B3,BEPERM
	VPERM	C0,C3,C3,BEPERM
	VPERM	D0,D3,D3,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_vx

	VAF	D3,D2,T1		# K[3]+4
	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	VAF	A4,A4,K0
	VAF	B4,B4,K1
	VAF	C4,C4,K2
	VAF	D4,D4,D3		# +K[3]+4
	VAF	D3,D3,T1		# K[3]+5
	VAF	K3,D2,T3		# K[3]+=6

	VPERM	A0,A4,A4,BEPERM
	VPERM	B0,B4,B4,BEPERM
	VPERM	C0,C4,C4,BEPERM
	VPERM	D0,D4,D4,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	VAF	A5,A5,K0
	VAF	B5,B5,K1
	VAF	C5,C5,K2
	VAF	D5,D5,D3		# +K[3]+5

	VPERM	A0,A5,A5,BEPERM
	VPERM	B0,B5,B5,BEPERM
	VPERM	C0,C5,C5,BEPERM
	VPERM	D0,D5,D5,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	lhi	%r0,10
	aghi	LEN,-0x40
	jne	.Loop_outer_vx

.Ldone_vx:
	lmg	%r6,%r7,FRAME+6*8(SP)
	la	SP,FRAME(SP)
	BR_EX	%r14

.Ltail_vx:
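	# fewer than 64 bytes remain: store the keystream block into the
	# stack frame and XOR it into the output byte by byte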
	VSTM	A0,D0,8*8,SP,3
	lghi	%r1,0

.Loop_tail_vx:
	llgc	%r5,0(%r1,INP)
	llgc	%r6,8*8(%r1,SP)
	xr	%r6,%r5
	stc	%r6,0(%r1,OUT)
	la	%r1,1(%r1)
	brct	LEN,.Loop_tail_vx

	lmg	%r6,%r7,FRAME+6*8(SP)
	la	SP,FRAME(SP)
	BR_EX	%r14
SYM_FUNC_END(chacha20_vx)

.previous