#include <linux/linkage.h>
#include <asm/frame.h>
/*
 * Read-only 128-bit constant used as the first row of the cipher state.
 * Stored little-endian, its bytes are the ASCII string "expand 32-byte k".
 * It is 16-byte aligned because the code below loads it with movaps, which
 * requires an aligned memory operand.
 */
.section .rodata, "a"
.align 16
CONSTANTS: .octa 0x6b20657479622d323320646e61707865
/* Everything after this point is emitted into the code section. */
.text
/*
 * Helper macros for __arch_chacha20_blocks_nostack. They only exist to avoid
 * repeating instruction sequences; they expand in place and are purged again
 * after the function so they do not leak into the rest of the file.
 */

/*
 * sum += addend; target = rotl32(target ^ sum, bits), applied to all four
 * 32-bit lanes at once. The rotate is composed from a left shift, a right
 * shift by the complementary amount and an OR, with scratch holding the
 * left-shifted half.
 */
.macro chacha_mix sum, addend, target, bits
	paddd		\addend, \sum
	pxor		\sum, \target
	movdqa		\target, scratch
	pslld		$\bits, scratch
	psrld		$(32-\bits), \target
	por		scratch, \target
.endm

/* The four add/xor/rotate steps (16, 12, 8, 7) that form one half of a double round. */
.macro chacha_four_mixes
	chacha_mix	state0, state1, state3, 16
	chacha_mix	state2, state3, state1, 12
	chacha_mix	state0, state1, state3, 8
	chacha_mix	state2, state3, state1, 7
.endm

/*
 * Permute the 32-bit lanes of rows 1..3. Row 2 always swaps its two halves
 * (0x4e: lanes [0,1,2,3] = [2,3,0,1]); rows 1 and 3 take the immediates given:
 *	0x39: lanes [0,1,2,3] = [1,2,3,0]
 *	0x93: lanes [0,1,2,3] = [3,0,1,2]
 */
.macro chacha_shuffle row1, row3
	pshufd		$\row1, state1, state1
	pshufd		$0x4e, state2, state2
	pshufd		$\row3, state3, state3
.endm

/* Add the saved input row back into the permuted row and store 16 output bytes. */
.macro chacha_emit row, saved, offset
	paddd		\saved, \row
	movups		\row, \offset(output)
.endm

/*
 * __arch_chacha20_blocks_nostack - generate ChaCha20 keystream blocks using
 * only registers (no stack spills).
 *
 * In:
 *	rdi: output buffer; 64 bytes are written per block (unaligned stores)
 *	rsi: 32-byte key (unaligned loads)
 *	rdx: pointer to an 8-byte block counter; read on entry, and the
 *	     advanced value is written back on exit
 *	rcx: number of 64-byte blocks to produce; the count is tested only
 *	     after a block has been written, so it must be at least 1
 *
 * The upper 8 bytes of the fourth state row (next to the counter) are zero.
 *
 * Clobbers: rax, rcx, rdi, xmm0-xmm9, flags.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* Argument registers. */
.set	output,		%rdi
.set	key,		%rsi
.set	counter,	%rdx
.set	nblocks,	%rcx
/* Double rounds still to run for the current block. */
.set	rounds_left,	%al
/* Scratch for the rotate sequence. */
.set	scratch,	%xmm0
/* Working state, one 4x32-bit row per register. */
.set	state0,		%xmm1
.set	state1,		%xmm2
.set	state2,		%xmm3
.set	state3,		%xmm4
/* Input state for the current block, kept for the final feed-forward add. */
.set	copy0,		%xmm5
.set	copy1,		%xmm6
.set	copy2,		%xmm7
.set	copy3,		%xmm8
/* 64-bit value 1 in the low half, 0 in the high half. */
.set	one,		%xmm9

	/* copy0 = "expand 32-byte k" */
	movaps		CONSTANTS(%rip), copy0
	/* copy1:copy2 = key */
	movups		0x00(key), copy1
	movups		0x10(key), copy2
	/* copy3 = counter in the low 64 bits; movq clears the high 64 bits */
	movq		0x00(counter), copy3
	/* one = 1 (goes through rax because movq to xmm takes no immediate) */
	movq		$1, %rax
	movq		%rax, one

.Lnext_block:
	/* Start every block from the saved input rows. */
	movdqa		copy0, state0
	movdqa		copy1, state1
	movdqa		copy2, state2
	movdqa		copy3, state3

	/* 10 double rounds = 20 rounds. */
	movb		$10, rounds_left
.Ldouble_round:
	/* First half, then rotate rows 1..3 by 1, 2 and 3 lanes. */
	chacha_four_mixes
	chacha_shuffle	0x39, 0x93

	/* Second half, then undo the lane rotation. */
	chacha_four_mixes
	chacha_shuffle	0x93, 0x39

	decb		rounds_left
	jnz		.Ldouble_round

	/* output[0..63] = state + copy, row by row */
	chacha_emit	state0, copy0, 0x00
	chacha_emit	state1, copy1, 0x10
	chacha_emit	state2, copy2, 0x20
	chacha_emit	state3, copy3, 0x30

	/* Advance the 64-bit block counter held in the low half of copy3. */
	paddq		one, copy3

	/* Move on to the next 64-byte slot; loop until nblocks reaches zero. */
	addq		$64, output
	decq		nblocks
	jnz		.Lnext_block

	/* Hand the advanced counter back to the caller. */
	movq		copy3, 0x00(counter)

	/* Wipe the registers that held key material or keystream. */
.irp	reg, state0, state1, state2, state3, copy1, copy2, scratch
	pxor		\reg, \reg
.endr

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)

.purgem	chacha_mix
.purgem	chacha_four_mixes
.purgem	chacha_shuffle
.purgem	chacha_emit