#include <linux/linkage.h>
.section .rodata
.align 64
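# The vector code below addresses these constants as fixed offsets from
# .Lconst (%rcx):
#   +0   .Lmask24        mask for the 24 bits left in the top input limb
#   +32  .L129           2^24, i.e. the padbit (bit 128 of a block) in limb 4
#   +64  .Lmask26        26-bit limb mask
#   +96  .Lpermd_avx2    vpermd indices used to expand the key power table
#   +128 .Lpermd_avx512  vpermd indices used by the AVX-512 path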
.Lconst:
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long 16777216,0,16777216,0,16777216,0,16777216,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long 2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
.L2_44_inp_permd:
.long 0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad 0,12,24,64
.L2_44_mask:
.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad 44,44,42,64
.L2_44_shift_lft:
.quad 8,8,10,64
.align 64
.Lx_mask44:
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.text
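################################################################
# Calling convention and context layout, as used by the code below:
#   poly1305_init_x86_64(ctx=%rdi, key=%rsi)
#   poly1305_blocks_*(ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx)
#   poly1305_emit_*(ctx=%rdi, mac=%rsi, nonce=%rdx)
# ctx layout: 0..23  hash h0,h1,h2 (the dword at offset 20 doubles as the
#                    is_base2_26 flag),
#             24,32  clamped key limbs r0,r1,
#             48...  base 2^26 powers of r filled in by __poly1305_init_avx.
################################################################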
SYM_FUNC_START(poly1305_init_x86_64)
.Lpoly1305_init_x86_64:
xor %eax,%eax
mov %rax,0(%rdi) # initialize hash value
mov %rax,8(%rdi)
mov %rax,16(%rdi)
test %rsi,%rsi
je .Lno_key
mov $0x0ffffffc0fffffff,%rax
mov $0x0ffffffc0ffffffc,%rcx
and 0(%rsi),%rax
and 8(%rsi),%rcx
mov %rax,24(%rdi)
mov %rcx,32(%rdi)
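# r is "clamped": the two masks used above clear the top four bits of
# every 32-bit word of r and the bottom two bits of its upper three
# words, i.e. r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, the standard
# Poly1305 key restriction that keeps the multiply/reduce below from
# overflowing.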
mov $1,%eax
.Lno_key:
RET
SYM_FUNC_END(poly1305_init_x86_64)
SYM_FUNC_START(poly1305_blocks_x86_64)
.Lpoly1305_blocks_x86_64:
.Lblocks:
shr $4,%rdx
jz .Lno_data # too short
push %rbx
push %r12
push %r13
push %r14
push %r15
push %rdi
.Lblocks_body:
mov %rdx,%r15 # reassign %rdx
mov 24(%rdi),%r11 # load r
mov 32(%rdi),%r13
mov 0(%rdi),%r14 # load hash value
mov 8(%rdi),%rbx
mov 16(%rdi),%r10
mov %r13,%r12
shr $2,%r13
mov %r12,%rax
add %r12,%r13 # s1 = r1 + (r1 >> 2)
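# s1 = 5*r1/4; the sum is exact because clamping forces r1's low two
# bits to zero.  Multiplying the upper limbs by s1 instead of r1 folds
# the reduction 2^130 = 5 (mod 2^130-5) straight into the multiplication.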
jmp .Loop
.align 32
.Loop:
add 0(%rsi),%r14 # accumulate input
adc 8(%rsi),%rbx
lea 16(%rsi),%rsi
adc %rcx,%r10
mulq %r14 # h0*r1
mov %rax,%r9
mov %r11,%rax
mov %rdx,%rdi
mulq %r14 # h0*r0
mov %rax,%r14 # future %r14
mov %r11,%rax
mov %rdx,%r8
mulq %rbx # h1*r0
add %rax,%r9
mov %r13,%rax
adc %rdx,%rdi
mulq %rbx # h1*s1
mov %r10,%rbx # borrow %rbx
add %rax,%r14
adc %rdx,%r8
imulq %r13,%rbx # h2*s1
add %rbx,%r9
mov %r8,%rbx
adc $0,%rdi
imulq %r11,%r10 # h2*r0
add %r9,%rbx
mov $-4,%rax # mask value
adc %r10,%rdi
and %rdi,%rax # last reduction step
mov %rdi,%r10
shr $2,%rdi
and $3,%r10
add %rdi,%rax
add %rax,%r14
adc $0,%rbx
adc $0,%r10
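# Partial reduction: %rdi held bits 128 and up of h*r; its low two bits
# stay in h2, while the rest is multiplied by 5 (computed above as
# (%rdi & ~3) + (%rdi >> 2)) and added back into h0, since 2^130 = 5
# (mod 2^130-5).  h remains only partially reduced until poly1305_emit.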
mov %r12,%rax
dec %r15 # len-=16
jnz .Loop
mov 0(%rsp),%rdi
mov %r14,0(%rdi) # store hash value
mov %rbx,8(%rdi)
mov %r10,16(%rdi)
mov 8(%rsp),%r15
mov 16(%rsp),%r14
mov 24(%rsp),%r13
mov 32(%rsp),%r12
mov 40(%rsp),%rbx
lea 48(%rsp),%rsp
.Lno_data:
.Lblocks_epilogue:
RET
SYM_FUNC_END(poly1305_blocks_x86_64)
SYM_FUNC_START(poly1305_emit_x86_64)
.Lpoly1305_emit_x86_64:
.Lemit:
mov 0(%rdi),%r8 # load hash value
mov 8(%rdi),%r9
mov 16(%rdi),%r10
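# Final reduction: h is partially reduced but below 2*(2^130-5), so if
# h + 5 carries out of bit 130 then h >= 2^130-5 and the low 128 bits of
# h + 5 are exactly those of h - (2^130-5); otherwise h is already the
# residue.  Only 128 bits are output, with the nonce added modulo 2^128.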
mov %r8,%rax
add $5,%r8 # compare to modulus
mov %r9,%rcx
adc $0,%r9
adc $0,%r10
shr $2,%r10 # did 130-bit value overflow?
cmovnz %r8,%rax
cmovnz %r9,%rcx
add 0(%rdx),%rax # accumulate nonce
adc 8(%rdx),%rcx
mov %rax,0(%rsi) # write result
mov %rcx,8(%rsi)
RET
SYM_FUNC_END(poly1305_emit_x86_64)
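################################################################
# __poly1305_block: one radix 2^64 iteration, h = h * r (mod 2^130-5),
# partially reduced.  On entry h is in %r14,%rbx,%r10, r0 in %r11,
# r1 in %rax and s1 = r1 + (r1 >> 2) in %r13; the caller has already
# added the 16-byte block and the padbit to h.
################################################################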
.type __poly1305_block,@function
.align 32
__poly1305_block:
push %rdi
mulq %r14 # h0*r1
mov %rax,%r9
mov %r11,%rax
mov %rdx,%rdi
mulq %r14 # h0*r0
mov %rax,%r14 # future %r14
mov %r11,%rax
mov %rdx,%r8
mulq %rbx # h1*r0
add %rax,%r9
mov %r13,%rax
adc %rdx,%rdi
mulq %rbx # h1*s1
mov %r10,%rbx # borrow %rbx
add %rax,%r14
adc %rdx,%r8
imulq %r13,%rbx # h2*s1
add %rbx,%r9
mov %r8,%rbx
adc $0,%rdi
imulq %r11,%r10 # h2*r0
add %r9,%rbx
mov $-4,%rax # mask value
adc %r10,%rdi
and %rdi,%rax # last reduction step
mov %rdi,%r10
shr $2,%rdi
and $3,%r10
add %rdi,%rax
add %rax,%r14
adc $0,%rbx
adc $0,%r10
pop %rdi
RET
.size __poly1305_block,.-__poly1305_block
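################################################################
# __poly1305_init_avx: compute r^2, r^3 and r^4 via __poly1305_block and
# store r..r^4 at 48(%rdi), split into 26-bit limbs (plus the 5x
# multiples of limbs 1..4) and interleaved power by power within each
# limb group, for the AVX/AVX2/AVX-512 block routines below.
################################################################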
.type __poly1305_init_avx,@function
.align 32
__poly1305_init_avx:
push %rbp
mov %rsp,%rbp
mov %r11,%r14
mov %r12,%rbx
xor %r10,%r10
lea 48+64(%rdi),%rdi # size optimization
mov %r12,%rax
call __poly1305_block # r^2
mov $0x3ffffff,%eax # save interleaved r^2 and r base 2^26
mov $0x3ffffff,%edx
mov %r14,%r8
and %r14d,%eax
mov %r11,%r9
and %r11d,%edx
mov %eax,-64(%rdi)
shr $26,%r8
mov %edx,-60(%rdi)
shr $26,%r9
mov $0x3ffffff,%eax
mov $0x3ffffff,%edx
and %r8d,%eax
and %r9d,%edx
mov %eax,-48(%rdi)
lea (%rax,%rax,4),%eax # *5
mov %edx,-44(%rdi)
lea (%rdx,%rdx,4),%edx # *5
mov %eax,-32(%rdi)
shr $26,%r8
mov %edx,-28(%rdi)
shr $26,%r9
mov %rbx,%rax
mov %r12,%rdx
shl $12,%rax
shl $12,%rdx
or %r8,%rax
or %r9,%rdx
and $0x3ffffff,%eax
and $0x3ffffff,%edx
mov %eax,-16(%rdi)
lea (%rax,%rax,4),%eax # *5
mov %edx,-12(%rdi)
lea (%rdx,%rdx,4),%edx # *5
mov %eax,0(%rdi)
mov %rbx,%r8
mov %edx,4(%rdi)
mov %r12,%r9
mov $0x3ffffff,%eax
mov $0x3ffffff,%edx
shr $14,%r8
shr $14,%r9
and %r8d,%eax
and %r9d,%edx
mov %eax,16(%rdi)
lea (%rax,%rax,4),%eax # *5
mov %edx,20(%rdi)
lea (%rdx,%rdx,4),%edx # *5
mov %eax,32(%rdi)
shr $26,%r8
mov %edx,36(%rdi)
shr $26,%r9
mov %r10,%rax
shl $24,%rax
or %rax,%r8
mov %r8d,48(%rdi)
lea (%r8,%r8,4),%r8 # *5
mov %r9d,52(%rdi)
lea (%r9,%r9,4),%r9 # *5
mov %r8d,64(%rdi)
mov %r9d,68(%rdi)
mov %r12,%rax
call __poly1305_block # r^3
mov $0x3ffffff,%eax # save r^3 base 2^26
mov %r14,%r8
and %r14d,%eax
shr $26,%r8
mov %eax,-52(%rdi)
mov $0x3ffffff,%edx
and %r8d,%edx
mov %edx,-36(%rdi)
lea (%rdx,%rdx,4),%edx # *5
shr $26,%r8
mov %edx,-20(%rdi)
mov %rbx,%rax
shl $12,%rax
or %r8,%rax
and $0x3ffffff,%eax
mov %eax,-4(%rdi)
lea (%rax,%rax,4),%eax # *5
mov %rbx,%r8
mov %eax,12(%rdi)
mov $0x3ffffff,%edx
shr $14,%r8
and %r8d,%edx
mov %edx,28(%rdi)
lea (%rdx,%rdx,4),%edx # *5
shr $26,%r8
mov %edx,44(%rdi)
mov %r10,%rax
shl $24,%rax
or %rax,%r8
mov %r8d,60(%rdi)
lea (%r8,%r8,4),%r8 # *5
mov %r8d,76(%rdi)
mov %r12,%rax
call __poly1305_block # r^4
mov $0x3ffffff,%eax # save r^4 base 2^26
mov %r14,%r8
and %r14d,%eax
shr $26,%r8
mov %eax,-56(%rdi)
mov $0x3ffffff,%edx
and %r8d,%edx
mov %edx,-40(%rdi)
lea (%rdx,%rdx,4),%edx # *5
shr $26,%r8
mov %edx,-24(%rdi)
mov %rbx,%rax
shl $12,%rax
or %r8,%rax
and $0x3ffffff,%eax
mov %eax,-8(%rdi)
lea (%rax,%rax,4),%eax # *5
mov %rbx,%r8
mov %eax,8(%rdi)
mov $0x3ffffff,%edx
shr $14,%r8
and %r8d,%edx
mov %edx,24(%rdi)
lea (%rdx,%rdx,4),%edx # *5
shr $26,%r8
mov %edx,40(%rdi)
mov %r10,%rax
shl $24,%rax
or %rax,%r8
mov %r8d,56(%rdi)
lea (%r8,%r8,4),%r8 # *5
mov %r8d,72(%rdi)
lea -48-64(%rdi),%rdi # size [de-]optimization
pop %rbp
RET
.size __poly1305_init_avx,.-__poly1305_init_avx
SYM_FUNC_START(poly1305_blocks_avx)
.Lpoly1305_blocks_avx:
mov 20(%rdi),%r8d # is_base2_26
cmp $128,%rdx
jae .Lblocks_avx
test %r8d,%r8d
jz .Lblocks
.Lblocks_avx:
and $-16,%rdx
jz .Lno_data_avx
vzeroupper
test %r8d,%r8d
jz .Lbase2_64_avx
test $31,%rdx
jz .Leven_avx
push %rbp
mov %rsp,%rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
.Lblocks_avx_body:
mov %rdx,%r15 # reassign %rdx
mov 0(%rdi),%r8 # load hash value
mov 8(%rdi),%r9
mov 16(%rdi),%r10d
mov 24(%rdi),%r11 # load r
mov 32(%rdi),%r13
################################# base 2^26 -> base 2^64
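# h was stored as five 26-bit limbs h[0..4], one per 32-bit word; rebuild
# h = h[0] + h[1]*2^26 + h[2]*2^52 + h[3]*2^78 + h[4]*2^104 as
#   %r14 = h[0] | h[1]<<26 | h[2]<<52          (bits 0..63)
#   %rbx = h[2]>>12 | h[3]<<14 | h[4]<<40      (bits 64..127)
#   %r10 = h[4]>>24                            (bits 128+)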
mov %r8d,%r14d
and $-2147483648,%r8
mov %r9,%r12 # borrow %r12
mov %r9d,%ebx
and $-2147483648,%r9
shr $6,%r8
shl $52,%r12
add %r8,%r14
shr $12,%rbx
shr $18,%r9
add %r12,%r14
adc %r9,%rbx
mov %r10,%r8
shl $40,%r8
shr $24,%r10
add %r8,%rbx
adc $0,%r10 # can be partially reduced...
mov $-4,%r9 # ... so reduce
mov %r10,%r8
and %r10,%r9
shr $2,%r8
and $3,%r10
add %r9,%r8 # =*5
add %r8,%r14
adc $0,%rbx
adc $0,%r10
mov %r13,%r12
mov %r13,%rax
shr $2,%r13
add %r12,%r13 # s1 = r1 + (r1 >> 2)
add 0(%rsi),%r14 # accumulate input
adc 8(%rsi),%rbx
lea 16(%rsi),%rsi
adc %rcx,%r10
call __poly1305_block
test %rcx,%rcx # if %rcx is zero,
jz .Lstore_base2_64_avx # store hash in base 2^64 format
################################# base 2^64 -> base 2^26
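# split h (%r14,%rbx,%r10) back into five 26-bit limbs:
#   h[0] =  %r14        & 0x3ffffff    h[3] = (%rbx >> 14) & 0x3ffffff
#   h[1] = (%r14 >> 26) & 0x3ffffff    h[4] = (%rbx >> 40) | %r10 << 24
#   h[2] = (%r14 >> 52 | %rbx << 12) & 0x3ffffff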
mov %r14,%rax
mov %r14,%rdx
shr $52,%r14
mov %rbx,%r11
mov %rbx,%r12
shr $26,%rdx
and $0x3ffffff,%rax # h[0]
shl $12,%r11
and $0x3ffffff,%rdx # h[1]
shr $14,%rbx
or %r11,%r14
shl $24,%r10
and $0x3ffffff,%r14 # h[2]
shr $40,%r12
and $0x3ffffff,%rbx # h[3]
or %r12,%r10 # h[4]
sub $16,%r15
jz .Lstore_base2_26_avx
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %r10d,%xmm4
jmp .Lproceed_avx
.align 32
.Lstore_base2_64_avx:
mov %r14,0(%rdi)
mov %rbx,8(%rdi)
mov %r10,16(%rdi) # note that is_base2_26 is zeroed
jmp .Ldone_avx
.align 16
.Lstore_base2_26_avx:
mov %eax,0(%rdi) # store hash value base 2^26
mov %edx,4(%rdi)
mov %r14d,8(%rdi)
mov %ebx,12(%rdi)
mov %r10d,16(%rdi)
.align 16
.Ldone_avx:
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
.Lno_data_avx:
.Lblocks_avx_epilogue:
RET
.align 32
.Lbase2_64_avx:
push %rbp
mov %rsp,%rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
.Lbase2_64_avx_body:
mov %rdx,%r15 # reassign %rdx
mov 24(%rdi),%r11 # load r
mov 32(%rdi),%r13
mov 0(%rdi),%r14 # load hash value
mov 8(%rdi),%rbx
mov 16(%rdi),%r10d
mov %r13,%r12
mov %r13,%rax
shr $2,%r13
add %r12,%r13 # s1 = r1 + (r1 >> 2)
test $31,%rdx
jz .Linit_avx
add 0(%rsi),%r14 # accumulate input
adc 8(%rsi),%rbx
lea 16(%rsi),%rsi
adc %rcx,%r10
sub $16,%r15
call __poly1305_block
.Linit_avx:
################################# base 2^64 -> base 2^26
mov %r14,%rax
mov %r14,%rdx
shr $52,%r14
mov %rbx,%r8
mov %rbx,%r9
shr $26,%rdx
and $0x3ffffff,%rax # h[0]
shl $12,%r8
and $0x3ffffff,%rdx # h[1]
shr $14,%rbx
or %r8,%r14
shl $24,%r10
and $0x3ffffff,%r14 # h[2]
shr $40,%r9
and $0x3ffffff,%rbx # h[3]
or %r9,%r10 # h[4]
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %r10d,%xmm4
movl $1,20(%rdi) # set is_base2_26
call __poly1305_init_avx
.Lproceed_avx:
mov %r15,%rdx # restore %rdx
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
.Lbase2_64_avx_epilogue:
jmp .Ldo_avx
.align 32
.Leven_avx:
vmovd 4*0(%rdi),%xmm0 # load hash value
vmovd 4*1(%rdi),%xmm1
vmovd 4*2(%rdi),%xmm2
vmovd 4*3(%rdi),%xmm3
vmovd 4*4(%rdi),%xmm4
.Ldo_avx:
lea 8(%rsp),%r10
and $-32,%rsp
sub $-8,%rsp
lea -0x58(%rsp),%r11
sub $0x178,%rsp
sub $64,%rdx
lea -32(%rsi),%rax
cmovc %rax,%rsi
vmovdqu 48(%rdi),%xmm14 # preload r0^2
lea 112(%rdi),%rdi # size optimization
lea .Lconst(%rip),%rcx
################################################################
# load input
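# two 16-byte blocks are split into five 26-bit limbs each; limb i of
# both blocks lands in the two 64-bit lanes of %xmm(5+i), and the padbit
# (bit 128 of the padded block) is OR'ed into limb 4 via .L129.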
vmovdqu 16*2(%rsi),%xmm5
vmovdqu 16*3(%rsi),%xmm6
vmovdqa 64(%rcx),%xmm15 # .Lmask26
vpsrldq $6,%xmm5,%xmm7 # splat input
vpsrldq $6,%xmm6,%xmm8
vpunpckhqdq %xmm6,%xmm5,%xmm9 # 4
vpunpcklqdq %xmm6,%xmm5,%xmm5 # 0:1
vpunpcklqdq %xmm8,%xmm7,%xmm8 # 2:3
vpsrlq $40,%xmm9,%xmm9 # 4
vpsrlq $26,%xmm5,%xmm6
vpand %xmm15,%xmm5,%xmm5 # 0
vpsrlq $4,%xmm8,%xmm7
vpand %xmm15,%xmm6,%xmm6 # 1
vpsrlq $30,%xmm8,%xmm8
vpand %xmm15,%xmm7,%xmm7 # 2
vpand %xmm15,%xmm8,%xmm8 # 3
vpor 32(%rcx),%xmm9,%xmm9 # padbit, yes, always
jbe .Lskip_loop_avx
# expand and copy pre-calculated table to stack
vmovdqu -48(%rdi),%xmm11
vmovdqu -32(%rdi),%xmm12
vpshufd $0xEE,%xmm14,%xmm13 # 34xx -> 3434
vpshufd $0x44,%xmm14,%xmm10 # xx12 -> 1212
vmovdqa %xmm13,-0x90(%r11)
vmovdqa %xmm10,0x00(%rsp)
vpshufd $0xEE,%xmm11,%xmm14
vmovdqu -16(%rdi),%xmm10
vpshufd $0x44,%xmm11,%xmm11
vmovdqa %xmm14,-0x80(%r11)
vmovdqa %xmm11,0x10(%rsp)
vpshufd $0xEE,%xmm12,%xmm13
vmovdqu 0(%rdi),%xmm11
vpshufd $0x44,%xmm12,%xmm12
vmovdqa %xmm13,-0x70(%r11)
vmovdqa %xmm12,0x20(%rsp)
vpshufd $0xEE,%xmm10,%xmm14
vmovdqu 16(%rdi),%xmm12
vpshufd $0x44,%xmm10,%xmm10
vmovdqa %xmm14,-0x60(%r11)
vmovdqa %xmm10,0x30(%rsp)
vpshufd $0xEE,%xmm11,%xmm13
vmovdqu 32(%rdi),%xmm10
vpshufd $0x44,%xmm11,%xmm11
vmovdqa %xmm13,-0x50(%r11)
vmovdqa %xmm11,0x40(%rsp)
vpshufd $0xEE,%xmm12,%xmm14
vmovdqu 48(%rdi),%xmm11
vpshufd $0x44,%xmm12,%xmm12
vmovdqa %xmm14,-0x40(%r11)
vmovdqa %xmm12,0x50(%rsp)
vpshufd $0xEE,%xmm10,%xmm13
vmovdqu 64(%rdi),%xmm12
vpshufd $0x44,%xmm10,%xmm10
vmovdqa %xmm13,-0x30(%r11)
vmovdqa %xmm10,0x60(%rsp)
vpshufd $0xEE,%xmm11,%xmm14
vpshufd $0x44,%xmm11,%xmm11
vmovdqa %xmm14,-0x20(%r11)
vmovdqa %xmm11,0x70(%rsp)
vpshufd $0xEE,%xmm12,%xmm13
vmovdqa 0x00(%rsp),%xmm14 # preload r0^2
vpshufd $0x44,%xmm12,%xmm12
vmovdqa %xmm13,-0x10(%r11)
vmovdqa %xmm12,0x80(%rsp)
jmp .Loop_avx
.align 32
.Loop_avx:
################################################################
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
# \___________________/
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
# \___________________/ \____________________/
#
# Note that we start with inp[2:3]*r^2. This is because it
# doesn't depend on reduction in previous iteration.
################################################################
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
#
# though note that the input and hash registers are "reversed" in this section,
# and %xmm14 is preloaded with r0^2...
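# (s_i denotes 5*r_i: limb products that would land at or above 2^130
# are folded back to the bottom multiplied by 5, since 2^130 = 5 mod p)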
vpmuludq %xmm5,%xmm14,%xmm10 # d0 = h0*r0
vpmuludq %xmm6,%xmm14,%xmm11 # d1 = h1*r0
vmovdqa %xmm2,0x20(%r11) # offload hash
vpmuludq %xmm7,%xmm14,%xmm12 # d2 = h2*r0
vmovdqa 0x10(%rsp),%xmm2 # r1^2
vpmuludq %xmm8,%xmm14,%xmm13 # d3 = h3*r0
vpmuludq %xmm9,%xmm14,%xmm14 # d4 = h4*r0
vmovdqa %xmm0,0x00(%r11) #
vpmuludq 0x20(%rsp),%xmm9,%xmm0 # h4*s1
vmovdqa %xmm1,0x10(%r11) #
vpmuludq %xmm8,%xmm2,%xmm1 # h3*r1
vpaddq %xmm0,%xmm10,%xmm10 # d0 += h4*s1
vpaddq %xmm1,%xmm14,%xmm14 # d4 += h3*r1
vmovdqa %xmm3,0x30(%r11) #
vpmuludq %xmm7,%xmm2,%xmm0 # h2*r1
vpmuludq %xmm6,%xmm2,%xmm1 # h1*r1
vpaddq %xmm0,%xmm13,%xmm13 # d3 += h2*r1
vmovdqa 0x30(%rsp),%xmm3 # r2^2
vpaddq %xmm1,%xmm12,%xmm12 # d2 += h1*r1
vmovdqa %xmm4,0x40(%r11) #
vpmuludq %xmm5,%xmm2,%xmm2 # h0*r1
vpmuludq %xmm7,%xmm3,%xmm0 # h2*r2
vpaddq %xmm2,%xmm11,%xmm11 # d1 += h0*r1
vmovdqa 0x40(%rsp),%xmm4 # s2^2
vpaddq %xmm0,%xmm14,%xmm14 # d4 += h2*r2
vpmuludq %xmm6,%xmm3,%xmm1 # h1*r2
vpmuludq %xmm5,%xmm3,%xmm3 # h0*r2
vpaddq %xmm1,%xmm13,%xmm13 # d3 += h1*r2
vmovdqa 0x50(%rsp),%xmm2 # r3^2
vpaddq %xmm3,%xmm12,%xmm12 # d2 += h0*r2
vpmuludq %xmm9,%xmm4,%xmm0 # h4*s2
vpmuludq %xmm8,%xmm4,%xmm4 # h3*s2
vpaddq %xmm0,%xmm11,%xmm11 # d1 += h4*s2
vmovdqa 0x60(%rsp),%xmm3 # s3^2
vpaddq %xmm4,%xmm10,%xmm10 # d0 += h3*s2
vmovdqa 0x80(%rsp),%xmm4 # s4^2
vpmuludq %xmm6,%xmm2,%xmm1 # h1*r3
vpmuludq %xmm5,%xmm2,%xmm2 # h0*r3
vpaddq %xmm1,%xmm14,%xmm14 # d4 += h1*r3
vpaddq %xmm2,%xmm13,%xmm13 # d3 += h0*r3
vpmuludq %xmm9,%xmm3,%xmm0 # h4*s3
vpmuludq %xmm8,%xmm3,%xmm1 # h3*s3
vpaddq %xmm0,%xmm12,%xmm12 # d2 += h4*s3
vmovdqu 16*0(%rsi),%xmm0 # load input
vpaddq %xmm1,%xmm11,%xmm11 # d1 += h3*s3
vpmuludq %xmm7,%xmm3,%xmm3 # h2*s3
vpmuludq %xmm7,%xmm4,%xmm7 # h2*s4
vpaddq %xmm3,%xmm10,%xmm10 # d0 += h2*s3
vmovdqu 16*1(%rsi),%xmm1 #
vpaddq %xmm7,%xmm11,%xmm11 # d1 += h2*s4
vpmuludq %xmm8,%xmm4,%xmm8 # h3*s4
vpmuludq %xmm9,%xmm4,%xmm9 # h4*s4
vpsrldq $6,%xmm0,%xmm2 # splat input
vpaddq %xmm8,%xmm12,%xmm12 # d2 += h3*s4
vpaddq %xmm9,%xmm13,%xmm13 # d3 += h4*s4
vpsrldq $6,%xmm1,%xmm3 #
vpmuludq 0x70(%rsp),%xmm5,%xmm9 # h0*r4
vpmuludq %xmm6,%xmm4,%xmm5 # h1*s4
vpunpckhqdq %xmm1,%xmm0,%xmm4 # 4
vpaddq %xmm9,%xmm14,%xmm14 # d4 += h0*r4
vmovdqa -0x90(%r11),%xmm9 # r0^4
vpaddq %xmm5,%xmm10,%xmm10 # d0 += h1*s4
vpunpcklqdq %xmm1,%xmm0,%xmm0 # 0:1
vpunpcklqdq %xmm3,%xmm2,%xmm3 # 2:3
#vpsrlq $40,%xmm4,%xmm4 # 4
vpsrldq $5,%xmm4,%xmm4 # 4
vpsrlq $26,%xmm0,%xmm1
vpand %xmm15,%xmm0,%xmm0 # 0
vpsrlq $4,%xmm3,%xmm2
vpand %xmm15,%xmm1,%xmm1 # 1
vpand 0(%rcx),%xmm4,%xmm4 # .Lmask24
vpsrlq $30,%xmm3,%xmm3
vpand %xmm15,%xmm2,%xmm2 # 2
vpand %xmm15,%xmm3,%xmm3 # 3
vpor 32(%rcx),%xmm4,%xmm4 # padbit, yes, always
vpaddq 0x00(%r11),%xmm0,%xmm0 # add hash value
vpaddq 0x10(%r11),%xmm1,%xmm1
vpaddq 0x20(%r11),%xmm2,%xmm2
vpaddq 0x30(%r11),%xmm3,%xmm3
vpaddq 0x40(%r11),%xmm4,%xmm4
lea 16*2(%rsi),%rax
lea 16*4(%rsi),%rsi
sub $64,%rdx
cmovc %rax,%rsi
################################################################
# Now we accumulate (inp[0:1]+hash)*r^4
################################################################
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
vpmuludq %xmm0,%xmm9,%xmm5 # h0*r0
vpmuludq %xmm1,%xmm9,%xmm6 # h1*r0
vpaddq %xmm5,%xmm10,%xmm10
vpaddq %xmm6,%xmm11,%xmm11
vmovdqa -0x80(%r11),%xmm7 # r1^4
vpmuludq %xmm2,%xmm9,%xmm5 # h2*r0
vpmuludq %xmm3,%xmm9,%xmm6 # h3*r0
vpaddq %xmm5,%xmm12,%xmm12
vpaddq %xmm6,%xmm13,%xmm13
vpmuludq %xmm4,%xmm9,%xmm9 # h4*r0
vpmuludq -0x70(%r11),%xmm4,%xmm5 # h4*s1
vpaddq %xmm9,%xmm14,%xmm14
vpaddq %xmm5,%xmm10,%xmm10 # d0 += h4*s1
vpmuludq %xmm2,%xmm7,%xmm6 # h2*r1
vpmuludq %xmm3,%xmm7,%xmm5 # h3*r1
vpaddq %xmm6,%xmm13,%xmm13 # d3 += h2*r1
vmovdqa -0x60(%r11),%xmm8 # r2^4
vpaddq %xmm5,%xmm14,%xmm14 # d4 += h3*r1
vpmuludq %xmm1,%xmm7,%xmm6 # h1*r1
vpmuludq %xmm0,%xmm7,%xmm7 # h0*r1
vpaddq %xmm6,%xmm12,%xmm12 # d2 += h1*r1
vpaddq %xmm7,%xmm11,%xmm11 # d1 += h0*r1
vmovdqa -0x50(%r11),%xmm9 # s2^4
vpmuludq %xmm2,%xmm8,%xmm5 # h2*r2
vpmuludq %xmm1,%xmm8,%xmm6 # h1*r2
vpaddq %xmm5,%xmm14,%xmm14 # d4 += h2*r2
vpaddq %xmm6,%xmm13,%xmm13 # d3 += h1*r2
vmovdqa -0x40(%r11),%xmm7 # r3^4
vpmuludq %xmm0,%xmm8,%xmm8 # h0*r2
vpmuludq %xmm4,%xmm9,%xmm5 # h4*s2
vpaddq %xmm8,%xmm12,%xmm12 # d2 += h0*r2
vpaddq %xmm5,%xmm11,%xmm11 # d1 += h4*s2
vmovdqa -0x30(%r11),%xmm8 # s3^4
vpmuludq %xmm3,%xmm9,%xmm9 # h3*s2
vpmuludq %xmm1,%xmm7,%xmm6 # h1*r3
vpaddq %xmm9,%xmm10,%xmm10 # d0 += h3*s2
vmovdqa -0x10(%r11),%xmm9 # s4^4
vpaddq %xmm6,%xmm14,%xmm14 # d4 += h1*r3
vpmuludq %xmm0,%xmm7,%xmm7 # h0*r3
vpmuludq %xmm4,%xmm8,%xmm5 # h4*s3
vpaddq %xmm7,%xmm13,%xmm13 # d3 += h0*r3
vpaddq %xmm5,%xmm12,%xmm12 # d2 += h4*s3
vmovdqu 16*2(%rsi),%xmm5 # load input
vpmuludq %xmm3,%xmm8,%xmm7 # h3*s3
vpmuludq %xmm2,%xmm8,%xmm8 # h2*s3
vpaddq %xmm7,%xmm11,%xmm11 # d1 += h3*s3
vmovdqu 16*3(%rsi),%xmm6 #
vpaddq %xmm8,%xmm10,%xmm10 # d0 += h2*s3
vpmuludq %xmm2,%xmm9,%xmm2 # h2*s4
vpmuludq %xmm3,%xmm9,%xmm3 # h3*s4
vpsrldq $6,%xmm5,%xmm7 # splat input
vpaddq %xmm2,%xmm11,%xmm11 # d1 += h2*s4
vpmuludq %xmm4,%xmm9,%xmm4 # h4*s4
vpsrldq $6,%xmm6,%xmm8 #
vpaddq %xmm3,%xmm12,%xmm2 # h2 = d2 + h3*s4
vpaddq %xmm4,%xmm13,%xmm3 # h3 = d3 + h4*s4
vpmuludq -0x20(%r11),%xmm0,%xmm4 # h0*r4
vpmuludq %xmm1,%xmm9,%xmm0
vpunpckhqdq %xmm6,%xmm5,%xmm9 # 4
vpaddq %xmm4,%xmm14,%xmm4 # h4 = d4 + h0*r4
vpaddq %xmm0,%xmm10,%xmm0 # h0 = d0 + h1*s4
vpunpcklqdq %xmm6,%xmm5,%xmm5 # 0:1
vpunpcklqdq %xmm8,%xmm7,%xmm8 # 2:3
#vpsrlq $40,%xmm9,%xmm9 # 4
vpsrldq $5,%xmm9,%xmm9 # 4
vpsrlq $26,%xmm5,%xmm6
vmovdqa 0x00(%rsp),%xmm14 # preload r0^2
vpand %xmm15,%xmm5,%xmm5 # 0
vpsrlq $4,%xmm8,%xmm7
vpand %xmm15,%xmm6,%xmm6 # 1
vpand 0(%rcx),%xmm9,%xmm9 # .Lmask24
vpsrlq $30,%xmm8,%xmm8
vpand %xmm15,%xmm7,%xmm7 # 2
vpand %xmm15,%xmm8,%xmm8 # 3
vpor 32(%rcx),%xmm9,%xmm9 # padbit, yes, always
################################################################
# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
# and P. Schwabe
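# carries are propagated once along the limb chain, the carry out of h4
# being folded back into h0 times 5 (2^130 = 5 mod 2^130-5).  Limbs end
# up only slightly above 26 bits, which leaves enough headroom for the
# next round of 32x32->64-bit multiplies; full reduction waits for emit.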
vpsrlq $26,%xmm3,%xmm13
vpand %xmm15,%xmm3,%xmm3
vpaddq %xmm13,%xmm4,%xmm4 # h3 -> h4
vpsrlq $26,%xmm0,%xmm10
vpand %xmm15,%xmm0,%xmm0
vpaddq %xmm10,%xmm11,%xmm1 # h0 -> h1
vpsrlq $26,%xmm4,%xmm10
vpand %xmm15,%xmm4,%xmm4
vpsrlq $26,%xmm1,%xmm11
vpand %xmm15,%xmm1,%xmm1
vpaddq %xmm11,%xmm2,%xmm2 # h1 -> h2
vpaddq %xmm10,%xmm0,%xmm0
vpsllq $2,%xmm10,%xmm10
vpaddq %xmm10,%xmm0,%xmm0 # h4 -> h0
vpsrlq $26,%xmm2,%xmm12
vpand %xmm15,%xmm2,%xmm2
vpaddq %xmm12,%xmm3,%xmm3 # h2 -> h3
vpsrlq $26,%xmm0,%xmm10
vpand %xmm15,%xmm0,%xmm0
vpaddq %xmm10,%xmm1,%xmm1 # h0 -> h1
vpsrlq $26,%xmm3,%xmm13
vpand %xmm15,%xmm3,%xmm3
vpaddq %xmm13,%xmm4,%xmm4 # h3 -> h4
ja .Loop_avx
.Lskip_loop_avx:
################################################################
# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
vpshufd $0x10,%xmm14,%xmm14 # r0^n, xx12 -> x1x2
add $32,%rdx
jnz .Long_tail_avx
vpaddq %xmm2,%xmm7,%xmm7
vpaddq %xmm0,%xmm5,%xmm5
vpaddq %xmm1,%xmm6,%xmm6
vpaddq %xmm3,%xmm8,%xmm8
vpaddq %xmm4,%xmm9,%xmm9
.Long_tail_avx:
vmovdqa %xmm2,0x20(%r11)
vmovdqa %xmm0,0x00(%r11)
vmovdqa %xmm1,0x10(%r11)
vmovdqa %xmm3,0x30(%r11)
vmovdqa %xmm4,0x40(%r11)
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
vpmuludq %xmm7,%xmm14,%xmm12 # d2 = h2*r0
vpmuludq %xmm5,%xmm14,%xmm10 # d0 = h0*r0
vpshufd $0x10,-48(%rdi),%xmm2 # r1^n
vpmuludq %xmm6,%xmm14,%xmm11 # d1 = h1*r0
vpmuludq %xmm8,%xmm14,%xmm13 # d3 = h3*r0
vpmuludq %xmm9,%xmm14,%xmm14 # d4 = h4*r0
vpmuludq %xmm8,%xmm2,%xmm0 # h3*r1
vpaddq %xmm0,%xmm14,%xmm14 # d4 += h3*r1
vpshufd $0x10,-32(%rdi),%xmm3 # s1^n
vpmuludq %xmm7,%xmm2,%xmm1 # h2*r1
vpaddq %xmm1,%xmm13,%xmm13 # d3 += h2*r1
vpshufd $0x10,-16(%rdi),%xmm4 # r2^n
vpmuludq %xmm6,%xmm2,%xmm0 # h1*r1
vpaddq %xmm0,%xmm12,%xmm12 # d2 += h1*r1
vpmuludq %xmm5,%xmm2,%xmm2 # h0*r1
vpaddq %xmm2,%xmm11,%xmm11 # d1 += h0*r1
vpmuludq %xmm9,%xmm3,%xmm3 # h4*s1
vpaddq %xmm3,%xmm10,%xmm10 # d0 += h4*s1
vpshufd $0x10,0(%rdi),%xmm2 # s2^n
vpmuludq %xmm7,%xmm4,%xmm1 # h2*r2
vpaddq %xmm1,%xmm14,%xmm14 # d4 += h2*r2
vpmuludq %xmm6,%xmm4,%xmm0 # h1*r2
vpaddq %xmm0,%xmm13,%xmm13 # d3 += h1*r2
vpshufd $0x10,16(%rdi),%xmm3 # r3^n
vpmuludq %xmm5,%xmm4,%xmm4 # h0*r2
vpaddq %xmm4,%xmm12,%xmm12 # d2 += h0*r2
vpmuludq %xmm9,%xmm2,%xmm1 # h4*s2
vpaddq %xmm1,%xmm11,%xmm11 # d1 += h4*s2
vpshufd $0x10,32(%rdi),%xmm4 # s3^n
vpmuludq %xmm8,%xmm2,%xmm2 # h3*s2
vpaddq %xmm2,%xmm10,%xmm10 # d0 += h3*s2
vpmuludq %xmm6,%xmm3,%xmm0 # h1*r3
vpaddq %xmm0,%xmm14,%xmm14 # d4 += h1*r3
vpmuludq %xmm5,%xmm3,%xmm3 # h0*r3
vpaddq %xmm3,%xmm13,%xmm13 # d3 += h0*r3
vpshufd $0x10,48(%rdi),%xmm2 # r4^n
vpmuludq %xmm9,%xmm4,%xmm1 # h4*s3
vpaddq %xmm1,%xmm12,%xmm12 # d2 += h4*s3
vpshufd $0x10,64(%rdi),%xmm3 # s4^n
vpmuludq %xmm8,%xmm4,%xmm0 # h3*s3
vpaddq %xmm0,%xmm11,%xmm11 # d1 += h3*s3
vpmuludq %xmm7,%xmm4,%xmm4 # h2*s3
vpaddq %xmm4,%xmm10,%xmm10 # d0 += h2*s3
vpmuludq %xmm5,%xmm2,%xmm2 # h0*r4
vpaddq %xmm2,%xmm14,%xmm14 # h4 = d4 + h0*r4
vpmuludq %xmm9,%xmm3,%xmm1 # h4*s4
vpaddq %xmm1,%xmm13,%xmm13 # h3 = d3 + h4*s4
vpmuludq %xmm8,%xmm3,%xmm0 # h3*s4
vpaddq %xmm0,%xmm12,%xmm12 # h2 = d2 + h3*s4
vpmuludq %xmm7,%xmm3,%xmm1 # h2*s4
vpaddq %xmm1,%xmm11,%xmm11 # h1 = d1 + h2*s4
vpmuludq %xmm6,%xmm3,%xmm3 # h1*s4
vpaddq %xmm3,%xmm10,%xmm10 # h0 = d0 + h1*s4
jz .Lshort_tail_avx
vmovdqu 16*0(%rsi),%xmm0 # load input
vmovdqu 16*1(%rsi),%xmm1
vpsrldq $6,%xmm0,%xmm2 # splat input
vpsrldq $6,%xmm1,%xmm3
vpunpckhqdq %xmm1,%xmm0,%xmm4 # 4
vpunpcklqdq %xmm1,%xmm0,%xmm0 # 0:1
vpunpcklqdq %xmm3,%xmm2,%xmm3 # 2:3
vpsrlq $40,%xmm4,%xmm4 # 4
vpsrlq $26,%xmm0,%xmm1
vpand %xmm15,%xmm0,%xmm0 # 0
vpsrlq $4,%xmm3,%xmm2
vpand %xmm15,%xmm1,%xmm1 # 1
vpsrlq $30,%xmm3,%xmm3
vpand %xmm15,%xmm2,%xmm2 # 2
vpand %xmm15,%xmm3,%xmm3 # 3
vpor 32(%rcx),%xmm4,%xmm4 # padbit, yes, always
vpshufd $0x32,-64(%rdi),%xmm9 # r0^n, 34xx -> x3x4
vpaddq 0x00(%r11),%xmm0,%xmm0
vpaddq 0x10(%r11),%xmm1,%xmm1
vpaddq 0x20(%r11),%xmm2,%xmm2
vpaddq 0x30(%r11),%xmm3,%xmm3
vpaddq 0x40(%r11),%xmm4,%xmm4
################################################################
# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
vpmuludq %xmm0,%xmm9,%xmm5 # h0*r0
vpaddq %xmm5,%xmm10,%xmm10 # d0 += h0*r0
vpmuludq %xmm1,%xmm9,%xmm6 # h1*r0
vpaddq %xmm6,%xmm11,%xmm11 # d1 += h1*r0
vpmuludq %xmm2,%xmm9,%xmm5 # h2*r0
vpaddq %xmm5,%xmm12,%xmm12 # d2 += h2*r0
vpshufd $0x32,-48(%rdi),%xmm7 # r1^n
vpmuludq %xmm3,%xmm9,%xmm6 # h3*r0
vpaddq %xmm6,%xmm13,%xmm13 # d3 += h3*r0
vpmuludq %xmm4,%xmm9,%xmm9 # h4*r0
vpaddq %xmm9,%xmm14,%xmm14 # d4 += h4*r0
vpmuludq %xmm3,%xmm7,%xmm5 # h3*r1
vpaddq %xmm5,%xmm14,%xmm14 # d4 += h3*r1
vpshufd $0x32,-32(%rdi),%xmm8 # s1
vpmuludq %xmm2,%xmm7,%xmm6 # h2*r1
vpaddq %xmm6,%xmm13,%xmm13 # d3 += h2*r1
vpshufd $0x32,-16(%rdi),%xmm9 # r2
vpmuludq %xmm1,%xmm7,%xmm5 # h1*r1
vpaddq %xmm5,%xmm12,%xmm12 # d2 += h1*r1
vpmuludq %xmm0,%xmm7,%xmm7 # h0*r1
vpaddq %xmm7,%xmm11,%xmm11 # d1 += h0*r1
vpmuludq %xmm4,%xmm8,%xmm8 # h4*s1
vpaddq %xmm8,%xmm10,%xmm10 # d0 += h4*s1
vpshufd $0x32,0(%rdi),%xmm7 # s2
vpmuludq %xmm2,%xmm9,%xmm6 # h2*r2
vpaddq %xmm6,%xmm14,%xmm14 # d4 += h2*r2
vpmuludq %xmm1,%xmm9,%xmm5 # h1*r2
vpaddq %xmm5,%xmm13,%xmm13 # d3 += h1*r2
vpshufd $0x32,16(%rdi),%xmm8 # r3
vpmuludq %xmm0,%xmm9,%xmm9 # h0*r2
vpaddq %xmm9,%xmm12,%xmm12 # d2 += h0*r2
vpmuludq %xmm4,%xmm7,%xmm6 # h4*s2
vpaddq %xmm6,%xmm11,%xmm11 # d1 += h4*s2
vpshufd $0x32,32(%rdi),%xmm9 # s3
vpmuludq %xmm3,%xmm7,%xmm7 # h3*s2
vpaddq %xmm7,%xmm10,%xmm10 # d0 += h3*s2
vpmuludq %xmm1,%xmm8,%xmm5 # h1*r3
vpaddq %xmm5,%xmm14,%xmm14 # d4 += h1*r3
vpmuludq %xmm0,%xmm8,%xmm8 # h0*r3
vpaddq %xmm8,%xmm13,%xmm13 # d3 += h0*r3
vpshufd $0x32,48(%rdi),%xmm7 # r4
vpmuludq %xmm4,%xmm9,%xmm6 # h4*s3
vpaddq %xmm6,%xmm12,%xmm12 # d2 += h4*s3
vpshufd $0x32,64(%rdi),%xmm8 # s4
vpmuludq %xmm3,%xmm9,%xmm5 # h3*s3
vpaddq %xmm5,%xmm11,%xmm11 # d1 += h3*s3
vpmuludq %xmm2,%xmm9,%xmm9 # h2*s3
vpaddq %xmm9,%xmm10,%xmm10 # d0 += h2*s3
vpmuludq %xmm0,%xmm7,%xmm7 # h0*r4
vpaddq %xmm7,%xmm14,%xmm14 # d4 += h0*r4
vpmuludq %xmm4,%xmm8,%xmm6 # h4*s4
vpaddq %xmm6,%xmm13,%xmm13 # d3 += h4*s4
vpmuludq %xmm3,%xmm8,%xmm5 # h3*s4
vpaddq %xmm5,%xmm12,%xmm12 # d2 += h3*s4
vpmuludq %xmm2,%xmm8,%xmm6 # h2*s4
vpaddq %xmm6,%xmm11,%xmm11 # d1 += h2*s4
vpmuludq %xmm1,%xmm8,%xmm8 # h1*s4
vpaddq %xmm8,%xmm10,%xmm10 # d0 += h1*s4
.Lshort_tail_avx:
################################################################
# horizontal addition
vpsrldq $8,%xmm14,%xmm9
vpsrldq $8,%xmm13,%xmm8
vpsrldq $8,%xmm11,%xmm6
vpsrldq $8,%xmm10,%xmm5
vpsrldq $8,%xmm12,%xmm7
vpaddq %xmm8,%xmm13,%xmm13
vpaddq %xmm9,%xmm14,%xmm14
vpaddq %xmm5,%xmm10,%xmm10
vpaddq %xmm6,%xmm11,%xmm11
vpaddq %xmm7,%xmm12,%xmm12
################################################################
# lazy reduction
vpsrlq $26,%xmm13,%xmm3
vpand %xmm15,%xmm13,%xmm13
vpaddq %xmm3,%xmm14,%xmm14 # h3 -> h4
vpsrlq $26,%xmm10,%xmm0
vpand %xmm15,%xmm10,%xmm10
vpaddq %xmm0,%xmm11,%xmm11 # h0 -> h1
vpsrlq $26,%xmm14,%xmm4
vpand %xmm15,%xmm14,%xmm14
vpsrlq $26,%xmm11,%xmm1
vpand %xmm15,%xmm11,%xmm11
vpaddq %xmm1,%xmm12,%xmm12 # h1 -> h2
vpaddq %xmm4,%xmm10,%xmm10
vpsllq $2,%xmm4,%xmm4
vpaddq %xmm4,%xmm10,%xmm10 # h4 -> h0
vpsrlq $26,%xmm12,%xmm2
vpand %xmm15,%xmm12,%xmm12
vpaddq %xmm2,%xmm13,%xmm13 # h2 -> h3
vpsrlq $26,%xmm10,%xmm0
vpand %xmm15,%xmm10,%xmm10
vpaddq %xmm0,%xmm11,%xmm11 # h0 -> h1
vpsrlq $26,%xmm13,%xmm3
vpand %xmm15,%xmm13,%xmm13
vpaddq %xmm3,%xmm14,%xmm14 # h3 -> h4
vmovd %xmm10,-112(%rdi) # save partially reduced
vmovd %xmm11,-108(%rdi)
vmovd %xmm12,-104(%rdi)
vmovd %xmm13,-100(%rdi)
vmovd %xmm14,-96(%rdi)
lea -8(%r10),%rsp
vzeroupper
RET
SYM_FUNC_END(poly1305_blocks_avx)
SYM_FUNC_START(poly1305_emit_avx)
.Lpoly1305_emit_avx:
cmpl $0,20(%rdi) # is_base2_26?
je .Lemit
mov 0(%rdi),%eax # load hash value base 2^26
mov 4(%rdi),%ecx
mov 8(%rdi),%r8d
mov 12(%rdi),%r11d
mov 16(%rdi),%r10d
shl $26,%rcx # base 2^26 -> base 2^64
mov %r8,%r9
shl $52,%r8
add %rcx,%rax
shr $12,%r9
add %rax,%r8 # h0
adc $0,%r9
shl $14,%r11
mov %r10,%rax
shr $24,%r10
add %r11,%r9
shl $40,%rax
add %rax,%r9 # h1
adc $0,%r10 # h2
mov %r10,%rax # could be partially reduced, so reduce
mov %r10,%rcx
and $3,%r10
shr $2,%rax
and $-4,%rcx
add %rcx,%rax
add %rax,%r8
adc $0,%r9
adc $0,%r10
mov %r8,%rax
add $5,%r8 # compare to modulus
mov %r9,%rcx
adc $0,%r9
adc $0,%r10
shr $2,%r10 # did 130-bit value overflow?
cmovnz %r8,%rax
cmovnz %r9,%rcx
add 0(%rdx),%rax # accumulate nonce
adc 8(%rdx),%rcx
mov %rax,0(%rsi) # write result
mov %rcx,8(%rsi)
RET
SYM_FUNC_END(poly1305_emit_avx)
SYM_FUNC_START(poly1305_blocks_avx2)
.Lpoly1305_blocks_avx2:
mov 20(%rdi),%r8d # is_base2_26
cmp $128,%rdx
jae .Lblocks_avx2
test %r8d,%r8d
jz .Lblocks
.Lblocks_avx2:
and $-16,%rdx
jz .Lno_data_avx2
vzeroupper
test %r8d,%r8d
jz .Lbase2_64_avx2
test $63,%rdx
jz .Leven_avx2
push %rbp
mov %rsp,%rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
.Lblocks_avx2_body:
mov %rdx,%r15 # reassign %rdx
mov 0(%rdi),%r8 # load hash value
mov 8(%rdi),%r9
mov 16(%rdi),%r10d
mov 24(%rdi),%r11 # load r
mov 32(%rdi),%r13
################################# base 2^26 -> base 2^64
mov %r8d,%r14d
and $-2147483648,%r8
mov %r9,%r12 # borrow %r12
mov %r9d,%ebx
and $-2147483648,%r9
shr $6,%r8
shl $52,%r12
add %r8,%r14
shr $12,%rbx
shr $18,%r9
add %r12,%r14
adc %r9,%rbx
mov %r10,%r8
shl $40,%r8
shr $24,%r10
add %r8,%rbx
adc $0,%r10 # can be partially reduced...
mov $-4,%r9 # ... so reduce
mov %r10,%r8
and %r10,%r9
shr $2,%r8
and $3,%r10
add %r9,%r8 # =*5
add %r8,%r14
adc $0,%rbx
adc $0,%r10
mov %r13,%r12
mov %r13,%rax
shr $2,%r13
add %r12,%r13 # s1 = r1 + (r1 >> 2)
.Lbase2_26_pre_avx2:
add 0(%rsi),%r14 # accumulate input
adc 8(%rsi),%rbx
lea 16(%rsi),%rsi
adc %rcx,%r10
sub $16,%r15
call __poly1305_block
mov %r12,%rax
test $63,%r15
jnz .Lbase2_26_pre_avx2
test %rcx,%rcx # if %rcx is zero,
jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
################################# base 2^64 -> base 2^26
mov %r14,%rax
mov %r14,%rdx
shr $52,%r14
mov %rbx,%r11
mov %rbx,%r12
shr $26,%rdx
and $0x3ffffff,%rax # h[0]
shl $12,%r11
and $0x3ffffff,%rdx # h[1]
shr $14,%rbx
or %r11,%r14
shl $24,%r10
and $0x3ffffff,%r14 # h[2]
shr $40,%r12
and $0x3ffffff,%rbx # h[3]
or %r12,%r10 # h[4]
test %r15,%r15
jz .Lstore_base2_26_avx2
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %r10d,%xmm4
jmp .Lproceed_avx2
.align 32
.Lstore_base2_64_avx2:
mov %r14,0(%rdi)
mov %rbx,8(%rdi)
mov %r10,16(%rdi) # note that is_base2_26 is zeroed
jmp .Ldone_avx2
.align 16
.Lstore_base2_26_avx2:
mov %eax,0(%rdi) # store hash value base 2^26
mov %edx,4(%rdi)
mov %r14d,8(%rdi)
mov %ebx,12(%rdi)
mov %r10d,16(%rdi)
.align 16
.Ldone_avx2:
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
RET
.align 32
.Lbase2_64_avx2:
push %rbp
mov %rsp,%rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
.Lbase2_64_avx2_body:
mov %rdx,%r15 # reassign %rdx
mov 24(%rdi),%r11 # load r
mov 32(%rdi),%r13
mov 0(%rdi),%r14 # load hash value
mov 8(%rdi),%rbx
mov 16(%rdi),%r10d
mov %r13,%r12
mov %r13,%rax
shr $2,%r13
add %r12,%r13 # s1 = r1 + (r1 >> 2)
test $63,%rdx
jz .Linit_avx2
.Lbase2_64_pre_avx2:
add 0(%rsi),%r14 # accumulate input
adc 8(%rsi),%rbx
lea 16(%rsi),%rsi
adc %rcx,%r10
sub $16,%r15
call __poly1305_block
mov %r12,%rax
test $63,%r15
jnz .Lbase2_64_pre_avx2
.Linit_avx2:
################################# base 2^64 -> base 2^26
mov %r14,%rax
mov %r14,%rdx
shr $52,%r14
mov %rbx,%r8
mov %rbx,%r9
shr $26,%rdx
and $0x3ffffff,%rax # h[0]
shl $12,%r8
and $0x3ffffff,%rdx # h[1]
shr $14,%rbx
or %r8,%r14
shl $24,%r10
and $0x3ffffff,%r14 # h[2]
shr $40,%r9
and $0x3ffffff,%rbx # h[3]
or %r9,%r10 # h[4]
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %r10d,%xmm4
movl $1,20(%rdi) # set is_base2_26
call __poly1305_init_avx
.Lproceed_avx2:
mov %r15,%rdx # restore %rdx
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
.Lbase2_64_avx2_epilogue:
jmp .Ldo_avx2
.align 32
.Leven_avx2:
vmovd 4*0(%rdi),%xmm0 # load hash value base 2^26
vmovd 4*1(%rdi),%xmm1
vmovd 4*2(%rdi),%xmm2
vmovd 4*3(%rdi),%xmm3
vmovd 4*4(%rdi),%xmm4
.Ldo_avx2:
lea 8(%rsp),%r10
sub $0x128,%rsp
lea .Lconst(%rip),%rcx
lea 48+64(%rdi),%rdi # size optimization
vmovdqa 96(%rcx),%ymm7 # .Lpermd_avx2
# expand and copy pre-calculated table to stack
vmovdqu -64(%rdi),%xmm9
and $-512,%rsp
vmovdqu -48(%rdi),%xmm10
vmovdqu -32(%rdi),%xmm6
vmovdqu -16(%rdi),%xmm11
vmovdqu 0(%rdi),%xmm12
vmovdqu 16(%rdi),%xmm13
lea 0x90(%rsp),%rax # size optimization
vmovdqu 32(%rdi),%xmm14
vpermd %ymm9,%ymm7,%ymm9 # 00003412 -> 14243444
vmovdqu 48(%rdi),%xmm15
vpermd %ymm10,%ymm7,%ymm10
vmovdqu 64(%rdi),%xmm5
vpermd %ymm6,%ymm7,%ymm6
vmovdqa %ymm9,0x00(%rsp)
vpermd %ymm11,%ymm7,%ymm11
vmovdqa %ymm10,0x20-0x90(%rax)
vpermd %ymm12,%ymm7,%ymm12
vmovdqa %ymm6,0x40-0x90(%rax)
vpermd %ymm13,%ymm7,%ymm13
vmovdqa %ymm11,0x60-0x90(%rax)
vpermd %ymm14,%ymm7,%ymm14
vmovdqa %ymm12,0x80-0x90(%rax)
vpermd %ymm15,%ymm7,%ymm15
vmovdqa %ymm13,0xa0-0x90(%rax)
vpermd %ymm5,%ymm7,%ymm5
vmovdqa %ymm14,0xc0-0x90(%rax)
vmovdqa %ymm15,0xe0-0x90(%rax)
vmovdqa %ymm5,0x100-0x90(%rax)
vmovdqa 64(%rcx),%ymm5 # .Lmask26
################################################################
# load input
vmovdqu 16*0(%rsi),%xmm7
vmovdqu 16*1(%rsi),%xmm8
vinserti128 $1,16*2(%rsi),%ymm7,%ymm7
vinserti128 $1,16*3(%rsi),%ymm8,%ymm8
lea 16*4(%rsi),%rsi
vpsrldq $6,%ymm7,%ymm9 # splat input
vpsrldq $6,%ymm8,%ymm10
vpunpckhqdq %ymm8,%ymm7,%ymm6 # 4
vpunpcklqdq %ymm10,%ymm9,%ymm9 # 2:3
vpunpcklqdq %ymm8,%ymm7,%ymm7 # 0:1
vpsrlq $30,%ymm9,%ymm10
vpsrlq $4,%ymm9,%ymm9
vpsrlq $26,%ymm7,%ymm8
vpsrlq $40,%ymm6,%ymm6 # 4
vpand %ymm5,%ymm9,%ymm9 # 2
vpand %ymm5,%ymm7,%ymm7 # 0
vpand %ymm5,%ymm8,%ymm8 # 1
vpand %ymm5,%ymm10,%ymm10 # 3
vpor 32(%rcx),%ymm6,%ymm6 # padbit, yes, always
vpaddq %ymm2,%ymm9,%ymm2 # accumulate input
sub $64,%rdx
jz .Ltail_avx2
jmp .Loop_avx2
.align 32
.Loop_avx2:
################################################################
# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
# \________/\__________/
################################################################
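# i.e. four blocks are processed per iteration: each 64-bit lane carries
# one partial sum, and multiplying every lane by r^4 advances all four
# partial sums by four blocks at once.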
#vpaddq %ymm2,%ymm9,%ymm2 # accumulate input
vpaddq %ymm0,%ymm7,%ymm0
vmovdqa 0(%rsp),%ymm7 # r0^4
vpaddq %ymm1,%ymm8,%ymm1
vmovdqa 32(%rsp),%ymm8 # r1^4
vpaddq %ymm3,%ymm10,%ymm3
vmovdqa 96(%rsp),%ymm9 # r2^4
vpaddq %ymm4,%ymm6,%ymm4
vmovdqa 48(%rax),%ymm10 # s3^4
vmovdqa 112(%rax),%ymm5 # s4^4
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
#
# however, as h2 is "chronologically" first one available pull
# corresponding operations up, so it's
#
# d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
# d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
vpmuludq %ymm2,%ymm7,%ymm13 # d2 = h2*r0
vpmuludq %ymm2,%ymm8,%ymm14 # d3 = h2*r1
vpmuludq %ymm2,%ymm9,%ymm15 # d4 = h2*r2
vpmuludq %ymm2,%ymm10,%ymm11 # d0 = h2*s3
vpmuludq %ymm2,%ymm5,%ymm12 # d1 = h2*s4
vpmuludq %ymm0,%ymm8,%ymm6 # h0*r1
vpmuludq %ymm1,%ymm8,%ymm2 # h1*r1, borrow %ymm2 as temp
vpaddq %ymm6,%ymm12,%ymm12 # d1 += h0*r1
vpaddq %ymm2,%ymm13,%ymm13 # d2 += h1*r1
vpmuludq %ymm3,%ymm8,%ymm6 # h3*r1
vpmuludq 64(%rsp),%ymm4,%ymm2 # h4*s1
vpaddq %ymm6,%ymm15,%ymm15 # d4 += h3*r1
vpaddq %ymm2,%ymm11,%ymm11 # d0 += h4*s1
vmovdqa -16(%rax),%ymm8 # s2
vpmuludq %ymm0,%ymm7,%ymm6 # h0*r0
vpmuludq %ymm1,%ymm7,%ymm2 # h1*r0
vpaddq %ymm6,%ymm11,%ymm11 # d0 += h0*r0
vpaddq %ymm2,%ymm12,%ymm12 # d1 += h1*r0
vpmuludq %ymm3,%ymm7,%ymm6 # h3*r0
vpmuludq %ymm4,%ymm7,%ymm2 # h4*r0
vmovdqu 16*0(%rsi),%xmm7 # load input
vpaddq %ymm6,%ymm14,%ymm14 # d3 += h3*r0
vpaddq %ymm2,%ymm15,%ymm15 # d4 += h4*r0
vinserti128 $1,16*2(%rsi),%ymm7,%ymm7
vpmuludq %ymm3,%ymm8,%ymm6 # h3*s2
vpmuludq %ymm4,%ymm8,%ymm2 # h4*s2
vmovdqu 16*1(%rsi),%xmm8
vpaddq %ymm6,%ymm11,%ymm11 # d0 += h3*s2
vpaddq %ymm2,%ymm12,%ymm12 # d1 += h4*s2
vmovdqa 16(%rax),%ymm2 # r3
vpmuludq %ymm1,%ymm9,%ymm6 # h1*r2
vpmuludq %ymm0,%ymm9,%ymm9 # h0*r2
vpaddq %ymm6,%ymm14,%ymm14 # d3 += h1*r2
vpaddq %ymm9,%ymm13,%ymm13 # d2 += h0*r2
vinserti128 $1,16*3(%rsi),%ymm8,%ymm8
lea 16*4(%rsi),%rsi
vpmuludq %ymm1,%ymm2,%ymm6 # h1*r3
vpmuludq %ymm0,%ymm2,%ymm2 # h0*r3
vpsrldq $6,%ymm7,%ymm9 # splat input
vpaddq %ymm6,%ymm15,%ymm15 # d4 += h1*r3
vpaddq %ymm2,%ymm14,%ymm14 # d3 += h0*r3
vpmuludq %ymm3,%ymm10,%ymm6 # h3*s3
vpmuludq %ymm4,%ymm10,%ymm2 # h4*s3
vpsrldq $6,%ymm8,%ymm10
vpaddq %ymm6,%ymm12,%ymm12 # d1 += h3*s3
vpaddq %ymm2,%ymm13,%ymm13 # d2 += h4*s3
vpunpckhqdq %ymm8,%ymm7,%ymm6 # 4
vpmuludq %ymm3,%ymm5,%ymm3 # h3*s4
vpmuludq %ymm4,%ymm5,%ymm4 # h4*s4
vpunpcklqdq %ymm8,%ymm7,%ymm7 # 0:1
vpaddq %ymm3,%ymm13,%ymm2 # h2 = d2 + h3*s4
vpaddq %ymm4,%ymm14,%ymm3 # h3 = d3 + h4*s4
vpunpcklqdq %ymm10,%ymm9,%ymm10 # 2:3
vpmuludq 80(%rax),%ymm0,%ymm4 # h0*r4
vpmuludq %ymm1,%ymm5,%ymm0 # h1*s4
vmovdqa 64(%rcx),%ymm5 # .Lmask26
vpaddq %ymm4,%ymm15,%ymm4 # h4 = d4 + h0*r4
vpaddq %ymm0,%ymm11,%ymm0 # h0 = d0 + h1*s4
################################################################
# lazy reduction (interleaved with tail of input splat)
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4 # h3 -> h4
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm12,%ymm1 # h0 -> h1
vpsrlq $26,%ymm4,%ymm15
vpand %ymm5,%ymm4,%ymm4
vpsrlq $4,%ymm10,%ymm9
vpsrlq $26,%ymm1,%ymm12
vpand %ymm5,%ymm1,%ymm1
vpaddq %ymm12,%ymm2,%ymm2 # h1 -> h2
vpaddq %ymm15,%ymm0,%ymm0
vpsllq $2,%ymm15,%ymm15
vpaddq %ymm15,%ymm0,%ymm0 # h4 -> h0
vpand %ymm5,%ymm9,%ymm9 # 2
vpsrlq $26,%ymm7,%ymm8
vpsrlq $26,%ymm2,%ymm13
vpand %ymm5,%ymm2,%ymm2
vpaddq %ymm13,%ymm3,%ymm3 # h2 -> h3
vpaddq %ymm9,%ymm2,%ymm2 # modulo-scheduled
vpsrlq $30,%ymm10,%ymm10
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm1,%ymm1 # h0 -> h1
vpsrlq $40,%ymm6,%ymm6 # 4
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4 # h3 -> h4
vpand %ymm5,%ymm7,%ymm7 # 0
vpand %ymm5,%ymm8,%ymm8 # 1
vpand %ymm5,%ymm10,%ymm10 # 3
vpor 32(%rcx),%ymm6,%ymm6 # padbit, yes, always
sub $64,%rdx
jnz .Loop_avx2
.byte 0x66,0x90 # 2-byte nop
.Ltail_avx2:
################################################################
# while above multiplications were by r^4 in all lanes, in last
# iteration we multiply least significant lane by r^4 and most
# significant one by r, so copy of above except that references
# to the precomputed table are displaced by 4...
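# (the table is stored with r^4 in the low dword of every 64-bit lane;
# loading 4 bytes higher exposes r^4,r^3,r^2,r^1 instead, one per lane)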
#vpaddq %ymm2,%ymm9,%ymm2 # accumulate input
vpaddq %ymm0,%ymm7,%ymm0
vmovdqu 4(%rsp),%ymm7 # r0^4
vpaddq %ymm1,%ymm8,%ymm1
vmovdqu 36(%rsp),%ymm8 # r1^4
vpaddq %ymm3,%ymm10,%ymm3
vmovdqu 100(%rsp),%ymm9 # r2^4
vpaddq %ymm4,%ymm6,%ymm4
vmovdqu 52(%rax),%ymm10 # s3^4
vmovdqu 116(%rax),%ymm5 # s4^4
vpmuludq %ymm2,%ymm7,%ymm13 # d2 = h2*r0
vpmuludq %ymm2,%ymm8,%ymm14 # d3 = h2*r1
vpmuludq %ymm2,%ymm9,%ymm15 # d4 = h2*r2
vpmuludq %ymm2,%ymm10,%ymm11 # d0 = h2*s3
vpmuludq %ymm2,%ymm5,%ymm12 # d1 = h2*s4
vpmuludq %ymm0,%ymm8,%ymm6 # h0*r1
vpmuludq %ymm1,%ymm8,%ymm2 # h1*r1
vpaddq %ymm6,%ymm12,%ymm12 # d1 += h0*r1
vpaddq %ymm2,%ymm13,%ymm13 # d2 += h1*r1
vpmuludq %ymm3,%ymm8,%ymm6 # h3*r1
vpmuludq 68(%rsp),%ymm4,%ymm2 # h4*s1
vpaddq %ymm6,%ymm15,%ymm15 # d4 += h3*r1
vpaddq %ymm2,%ymm11,%ymm11 # d0 += h4*s1
vpmuludq %ymm0,%ymm7,%ymm6 # h0*r0
vpmuludq %ymm1,%ymm7,%ymm2 # h1*r0
vpaddq %ymm6,%ymm11,%ymm11 # d0 += h0*r0
vmovdqu -12(%rax),%ymm8 # s2
vpaddq %ymm2,%ymm12,%ymm12 # d1 += h1*r0
vpmuludq %ymm3,%ymm7,%ymm6 # h3*r0
vpmuludq %ymm4,%ymm7,%ymm2 # h4*r0
vpaddq %ymm6,%ymm14,%ymm14 # d3 += h3*r0
vpaddq %ymm2,%ymm15,%ymm15 # d4 += h4*r0
vpmuludq %ymm3,%ymm8,%ymm6 # h3*s2
vpmuludq %ymm4,%ymm8,%ymm2 # h4*s2
vpaddq %ymm6,%ymm11,%ymm11 # d0 += h3*s2
vpaddq %ymm2,%ymm12,%ymm12 # d1 += h4*s2
vmovdqu 20(%rax),%ymm2 # r3
vpmuludq %ymm1,%ymm9,%ymm6 # h1*r2
vpmuludq %ymm0,%ymm9,%ymm9 # h0*r2
vpaddq %ymm6,%ymm14,%ymm14 # d3 += h1*r2
vpaddq %ymm9,%ymm13,%ymm13 # d2 += h0*r2
vpmuludq %ymm1,%ymm2,%ymm6 # h1*r3
vpmuludq %ymm0,%ymm2,%ymm2 # h0*r3
vpaddq %ymm6,%ymm15,%ymm15 # d4 += h1*r3
vpaddq %ymm2,%ymm14,%ymm14 # d3 += h0*r3
vpmuludq %ymm3,%ymm10,%ymm6 # h3*s3
vpmuludq %ymm4,%ymm10,%ymm2 # h4*s3
vpaddq %ymm6,%ymm12,%ymm12 # d1 += h3*s3
vpaddq %ymm2,%ymm13,%ymm13 # d2 += h4*s3
vpmuludq %ymm3,%ymm5,%ymm3 # h3*s4
vpmuludq %ymm4,%ymm5,%ymm4 # h4*s4
vpaddq %ymm3,%ymm13,%ymm2 # h2 = d2 + h3*s4
vpaddq %ymm4,%ymm14,%ymm3 # h3 = d3 + h4*s4
vpmuludq 84(%rax),%ymm0,%ymm4 # h0*r4
vpmuludq %ymm1,%ymm5,%ymm0 # h1*s4
vmovdqa 64(%rcx),%ymm5 # .Lmask26
vpaddq %ymm4,%ymm15,%ymm4 # h4 = d4 + h0*r4
vpaddq %ymm0,%ymm11,%ymm0 # h0 = d0 + h1*s4
################################################################
# horizontal addition
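# the four per-lane partial hashes are summed: vpsrldq $8 adds the high
# qword of each 128-bit half, then vpermq $0x2 folds the upper 128-bit
# half onto the lower one before the final vpaddq.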
vpsrldq $8,%ymm12,%ymm8
vpsrldq $8,%ymm2,%ymm9
vpsrldq $8,%ymm3,%ymm10
vpsrldq $8,%ymm4,%ymm6
vpsrldq $8,%ymm0,%ymm7
vpaddq %ymm8,%ymm12,%ymm12
vpaddq %ymm9,%ymm2,%ymm2
vpaddq %ymm10,%ymm3,%ymm3
vpaddq %ymm6,%ymm4,%ymm4
vpaddq %ymm7,%ymm0,%ymm0
vpermq $0x2,%ymm3,%ymm10
vpermq $0x2,%ymm4,%ymm6
vpermq $0x2,%ymm0,%ymm7
vpermq $0x2,%ymm12,%ymm8
vpermq $0x2,%ymm2,%ymm9
vpaddq %ymm10,%ymm3,%ymm3
vpaddq %ymm6,%ymm4,%ymm4
vpaddq %ymm7,%ymm0,%ymm0
vpaddq %ymm8,%ymm12,%ymm12
vpaddq %ymm9,%ymm2,%ymm2
################################################################
# lazy reduction
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4 # h3 -> h4
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm12,%ymm1 # h0 -> h1
vpsrlq $26,%ymm4,%ymm15
vpand %ymm5,%ymm4,%ymm4
vpsrlq $26,%ymm1,%ymm12
vpand %ymm5,%ymm1,%ymm1
vpaddq %ymm12,%ymm2,%ymm2 # h1 -> h2
vpaddq %ymm15,%ymm0,%ymm0
vpsllq $2,%ymm15,%ymm15
vpaddq %ymm15,%ymm0,%ymm0 # h4 -> h0
vpsrlq $26,%ymm2,%ymm13
vpand %ymm5,%ymm2,%ymm2
vpaddq %ymm13,%ymm3,%ymm3 # h2 -> h3
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm1,%ymm1 # h0 -> h1
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4 # h3 -> h4
vmovd %xmm0,-112(%rdi) # save partially reduced
vmovd %xmm1,-108(%rdi)
vmovd %xmm2,-104(%rdi)
vmovd %xmm3,-100(%rdi)
vmovd %xmm4,-96(%rdi)
lea -8(%r10),%rsp
vzeroupper
RET
SYM_FUNC_END(poly1305_blocks_avx2)
#ifdef CONFIG_AS_AVX512
SYM_FUNC_START(poly1305_blocks_avx512)
.Lpoly1305_blocks_avx512:
mov 20(%rdi),%r8d # is_base2_26
cmp $128,%rdx
jae .Lblocks_avx2_avx512
test %r8d,%r8d
jz .Lblocks
.Lblocks_avx2_avx512:
and $-16,%rdx
jz .Lno_data_avx2_avx512
vzeroupper
test %r8d,%r8d
jz .Lbase2_64_avx2_avx512
test $63,%rdx
jz .Leven_avx2_avx512
push %rbp
mov %rsp,%rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
.Lblocks_avx2_body_avx512:
mov %rdx,%r15 # reassign %rdx
mov 0(%rdi),%r8 # load hash value
mov 8(%rdi),%r9
mov 16(%rdi),%r10d
mov 24(%rdi),%r11 # load r
mov 32(%rdi),%r13
################################# base 2^26 -> base 2^64
mov %r8d,%r14d
and $-2147483648,%r8
mov %r9,%r12 # borrow %r12
mov %r9d,%ebx
and $-2147483648,%r9
shr $6,%r8
shl $52,%r12
add %r8,%r14
shr $12,%rbx
shr $18,%r9
add %r12,%r14
adc %r9,%rbx
mov %r10,%r8
shl $40,%r8
shr $24,%r10
add %r8,%rbx
adc $0,%r10 # can be partially reduced...
mov $-4,%r9 # ... so reduce
mov %r10,%r8
and %r10,%r9
shr $2,%r8
and $3,%r10
add %r9,%r8 # =*5
add %r8,%r14
adc $0,%rbx
adc $0,%r10
mov %r13,%r12
mov %r13,%rax
shr $2,%r13
add %r12,%r13 # s1 = r1 + (r1 >> 2)
.Lbase2_26_pre_avx2_avx512:
add 0(%rsi),%r14 # accumulate input
adc 8(%rsi),%rbx
lea 16(%rsi),%rsi
adc %rcx,%r10
sub $16,%r15
call __poly1305_block
mov %r12,%rax
test $63,%r15
jnz .Lbase2_26_pre_avx2_avx512
test %rcx,%rcx # if %rcx is zero,
jz .Lstore_base2_64_avx2_avx512 # store hash in base 2^64 format
################################# base 2^64 -> base 2^26
mov %r14,%rax
mov %r14,%rdx
shr $52,%r14
mov %rbx,%r11
mov %rbx,%r12
shr $26,%rdx
and $0x3ffffff,%rax # h[0]
shl $12,%r11
and $0x3ffffff,%rdx # h[1]
shr $14,%rbx
or %r11,%r14
shl $24,%r10
and $0x3ffffff,%r14 # h[2]
shr $40,%r12
and $0x3ffffff,%rbx # h[3]
or %r12,%r10 # h[4]
test %r15,%r15
jz .Lstore_base2_26_avx2_avx512
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %r10d,%xmm4
jmp .Lproceed_avx2_avx512
.align 32
.Lstore_base2_64_avx2_avx512:
mov %r14,0(%rdi)
mov %rbx,8(%rdi)
mov %r10,16(%rdi) # note that is_base2_26 is zeroed
jmp .Ldone_avx2_avx512
.align 16
.Lstore_base2_26_avx2_avx512:
mov %eax,0(%rdi) # store hash value base 2^26
mov %edx,4(%rdi)
mov %r14d,8(%rdi)
mov %ebx,12(%rdi)
mov %r10d,16(%rdi)
.align 16
.Ldone_avx2_avx512:
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
.Lno_data_avx2_avx512:
.Lblocks_avx2_epilogue_avx512:
RET
.align 32
.Lbase2_64_avx2_avx512:
push %rbp
mov %rsp,%rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
.Lbase2_64_avx2_body_avx512:
mov %rdx,%r15 # reassign %rdx
mov 24(%rdi),%r11 # load r
mov 32(%rdi),%r13
mov 0(%rdi),%r14 # load hash value
mov 8(%rdi),%rbx
mov 16(%rdi),%r10d
mov %r13,%r12
mov %r13,%rax
shr $2,%r13
add %r12,%r13 # s1 = r1 + (r1 >> 2)
test $63,%rdx
jz .Linit_avx2_avx512
.Lbase2_64_pre_avx2_avx512:
add 0(%rsi),%r14 # accumulate input
adc 8(%rsi),%rbx
lea 16(%rsi),%rsi
adc %rcx,%r10
sub $16,%r15
call __poly1305_block
mov %r12,%rax
test $63,%r15
jnz .Lbase2_64_pre_avx2_avx512
.Linit_avx2_avx512:
################################# base 2^64 -> base 2^26
mov %r14,%rax
mov %r14,%rdx
shr $52,%r14
mov %rbx,%r8
mov %rbx,%r9
shr $26,%rdx
and $0x3ffffff,%rax # h[0]
shl $12,%r8
and $0x3ffffff,%rdx # h[1]
shr $14,%rbx
or %r8,%r14
shl $24,%r10
and $0x3ffffff,%r14 # h[2]
shr $40,%r9
and $0x3ffffff,%rbx # h[3]
or %r9,%r10 # h[4]
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %r10d,%xmm4
movl $1,20(%rdi) # set is_base2_26
call __poly1305_init_avx
.Lproceed_avx2_avx512:
mov %r15,%rdx # restore %rdx
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
.Lbase2_64_avx2_epilogue_avx512:
jmp .Ldo_avx2_avx512
.align 32
.Leven_avx2_avx512:
vmovd 4*0(%rdi),%xmm0 # load hash value base 2^26
vmovd 4*1(%rdi),%xmm1
vmovd 4*2(%rdi),%xmm2
vmovd 4*3(%rdi),%xmm3
vmovd 4*4(%rdi),%xmm4
.Ldo_avx2_avx512:
cmp $512,%rdx
jae .Lblocks_avx512
lea 8(%rsp),%r10
sub $0x128,%rsp
lea .Lconst(%rip),%rcx
lea 48+64(%rdi),%rdi # size optimization
vmovdqa 96(%rcx),%ymm7 # .Lpermd_avx2
# expand and copy pre-calculated table to stack
vmovdqu -64(%rdi),%xmm9
and $-512,%rsp
vmovdqu -48(%rdi),%xmm10
vmovdqu -32(%rdi),%xmm6
vmovdqu -16(%rdi),%xmm11
vmovdqu 0(%rdi),%xmm12
vmovdqu 16(%rdi),%xmm13
lea 0x90(%rsp),%rax # size optimization
vmovdqu 32(%rdi),%xmm14
vpermd %ymm9,%ymm7,%ymm9 # 00003412 -> 14243444
vmovdqu 48(%rdi),%xmm15
vpermd %ymm10,%ymm7,%ymm10
vmovdqu 64(%rdi),%xmm5
vpermd %ymm6,%ymm7,%ymm6
vmovdqa %ymm9,0x00(%rsp)
vpermd %ymm11,%ymm7,%ymm11
vmovdqa %ymm10,0x20-0x90(%rax)
vpermd %ymm12,%ymm7,%ymm12
vmovdqa %ymm6,0x40-0x90(%rax)
vpermd %ymm13,%ymm7,%ymm13
vmovdqa %ymm11,0x60-0x90(%rax)
vpermd %ymm14,%ymm7,%ymm14
vmovdqa %ymm12,0x80-0x90(%rax)
vpermd %ymm15,%ymm7,%ymm15
vmovdqa %ymm13,0xa0-0x90(%rax)
vpermd %ymm5,%ymm7,%ymm5
vmovdqa %ymm14,0xc0-0x90(%rax)
vmovdqa %ymm15,0xe0-0x90(%rax)
vmovdqa %ymm5,0x100-0x90(%rax)
vmovdqa 64(%rcx),%ymm5 # .Lmask26
################################################################
# load input
vmovdqu 16*0(%rsi),%xmm7
vmovdqu 16*1(%rsi),%xmm8
vinserti128 $1,16*2(%rsi),%ymm7,%ymm7
vinserti128 $1,16*3(%rsi),%ymm8,%ymm8
lea 16*4(%rsi),%rsi
vpsrldq $6,%ymm7,%ymm9 # splat input
vpsrldq $6,%ymm8,%ymm10
vpunpckhqdq %ymm8,%ymm7,%ymm6 # 4
vpunpcklqdq %ymm10,%ymm9,%ymm9 # 2:3
vpunpcklqdq %ymm8,%ymm7,%ymm7 # 0:1
vpsrlq $30,%ymm9,%ymm10
vpsrlq $4,%ymm9,%ymm9
vpsrlq $26,%ymm7,%ymm8
vpsrlq $40,%ymm6,%ymm6 # 4
vpand %ymm5,%ymm9,%ymm9 # 2
vpand %ymm5,%ymm7,%ymm7 # 0
vpand %ymm5,%ymm8,%ymm8 # 1
vpand %ymm5,%ymm10,%ymm10 # 3
vpor 32(%rcx),%ymm6,%ymm6 # padbit, yes, always
vpaddq %ymm2,%ymm9,%ymm2 # accumulate input
sub $64,%rdx
jz .Ltail_avx2_avx512
jmp .Loop_avx2_avx512
.align 32
.Loop_avx2_avx512:
################################################################
# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
# \________/\__________/
################################################################
#vpaddq %ymm2,%ymm9,%ymm2 # accumulate input
vpaddq %ymm0,%ymm7,%ymm0
vmovdqa 0(%rsp),%ymm7 # r0^4
vpaddq %ymm1,%ymm8,%ymm1
vmovdqa 32(%rsp),%ymm8 # r1^4
vpaddq %ymm3,%ymm10,%ymm3
vmovdqa 96(%rsp),%ymm9 # r2^4
vpaddq %ymm4,%ymm6,%ymm4
vmovdqa 48(%rax),%ymm10 # s3^4
vmovdqa 112(%rax),%ymm5 # s4^4
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
#
# however, as h2 is "chronologically" first one available pull
# corresponding operations up, so it's
#
# d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
# d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
vpmuludq %ymm2,%ymm7,%ymm13 # d2 = h2*r0
vpmuludq %ymm2,%ymm8,%ymm14 # d3 = h2*r1
vpmuludq %ymm2,%ymm9,%ymm15 # d4 = h2*r2
vpmuludq %ymm2,%ymm10,%ymm11 # d0 = h2*s3
vpmuludq %ymm2,%ymm5,%ymm12 # d1 = h2*s4
vpmuludq %ymm0,%ymm8,%ymm6 # h0*r1
vpmuludq %ymm1,%ymm8,%ymm2 # h1*r1, borrow %ymm2 as temp
vpaddq %ymm6,%ymm12,%ymm12 # d1 += h0*r1
vpaddq %ymm2,%ymm13,%ymm13 # d2 += h1*r1
vpmuludq %ymm3,%ymm8,%ymm6 # h3*r1
vpmuludq 64(%rsp),%ymm4,%ymm2 # h4*s1
vpaddq %ymm6,%ymm15,%ymm15 # d4 += h3*r1
vpaddq %ymm2,%ymm11,%ymm11 # d0 += h4*s1
vmovdqa -16(%rax),%ymm8 # s2
vpmuludq %ymm0,%ymm7,%ymm6 # h0*r0
vpmuludq %ymm1,%ymm7,%ymm2 # h1*r0
vpaddq %ymm6,%ymm11,%ymm11 # d0 += h0*r0
vpaddq %ymm2,%ymm12,%ymm12 # d1 += h1*r0
vpmuludq %ymm3,%ymm7,%ymm6 # h3*r0
vpmuludq %ymm4,%ymm7,%ymm2 # h4*r0
vmovdqu 16*0(%rsi),%xmm7 # load input
vpaddq %ymm6,%ymm14,%ymm14 # d3 += h3*r0
vpaddq %ymm2,%ymm15,%ymm15 # d4 += h4*r0
vinserti128 $1,16*2(%rsi),%ymm7,%ymm7
vpmuludq %ymm3,%ymm8,%ymm6 # h3*s2
vpmuludq %ymm4,%ymm8,%ymm2 # h4*s2
vmovdqu 16*1(%rsi),%xmm8
vpaddq %ymm6,%ymm11,%ymm11 # d0 += h3*s2
vpaddq %ymm2,%ymm12,%ymm12 # d1 += h4*s2
vmovdqa 16(%rax),%ymm2 # r3
vpmuludq %ymm1,%ymm9,%ymm6 # h1*r2
vpmuludq %ymm0,%ymm9,%ymm9 # h0*r2
vpaddq %ymm6,%ymm14,%ymm14 # d3 += h1*r2
vpaddq %ymm9,%ymm13,%ymm13 # d2 += h0*r2
vinserti128 $1,16*3(%rsi),%ymm8,%ymm8
lea 16*4(%rsi),%rsi
vpmuludq %ymm1,%ymm2,%ymm6 # h1*r3
vpmuludq %ymm0,%ymm2,%ymm2 # h0*r3
vpsrldq $6,%ymm7,%ymm9 # splat input
vpaddq %ymm6,%ymm15,%ymm15 # d4 += h1*r3
vpaddq %ymm2,%ymm14,%ymm14 # d3 += h0*r3
vpmuludq %ymm3,%ymm10,%ymm6 # h3*s3
vpmuludq %ymm4,%ymm10,%ymm2 # h4*s3
vpsrldq $6,%ymm8,%ymm10
vpaddq %ymm6,%ymm12,%ymm12 # d1 += h3*s3
vpaddq %ymm2,%ymm13,%ymm13 # d2 += h4*s3
vpunpckhqdq %ymm8,%ymm7,%ymm6 # 4
vpmuludq %ymm3,%ymm5,%ymm3 # h3*s4
vpmuludq %ymm4,%ymm5,%ymm4 # h4*s4
vpunpcklqdq %ymm8,%ymm7,%ymm7 # 0:1
vpaddq %ymm3,%ymm13,%ymm2 # h2 = d2 + h3*s4
vpaddq %ymm4,%ymm14,%ymm3 # h3 = d3 + h4*s4
vpunpcklqdq %ymm10,%ymm9,%ymm10 # 2:3
vpmuludq 80(%rax),%ymm0,%ymm4 # h0*r4
vpmuludq %ymm1,%ymm5,%ymm0 # h1*s4
vmovdqa 64(%rcx),%ymm5 # .Lmask26
vpaddq %ymm4,%ymm15,%ymm4 # h4 = d4 + h0*r4
vpaddq %ymm0,%ymm11,%ymm0 # h0 = d0 + h1*s4
################################################################
# lazy reduction (interleaved with tail of input splat)
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4 # h3 -> h4
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm12,%ymm1 # h0 -> h1
vpsrlq $26,%ymm4,%ymm15
vpand %ymm5,%ymm4,%ymm4
vpsrlq $4,%ymm10,%ymm9
vpsrlq $26,%ymm1,%ymm12
vpand %ymm5,%ymm1,%ymm1
vpaddq %ymm12,%ymm2,%ymm2 # h1 -> h2
vpaddq %ymm15,%ymm0,%ymm0
vpsllq $2,%ymm15,%ymm15
vpaddq %ymm15,%ymm0,%ymm0 # h4 -> h0
vpand %ymm5,%ymm9,%ymm9 # 2
vpsrlq $26,%ymm7,%ymm8
vpsrlq $26,%ymm2,%ymm13
vpand %ymm5,%ymm2,%ymm2
vpaddq %ymm13,%ymm3,%ymm3 # h2 -> h3
vpaddq %ymm9,%ymm2,%ymm2 # modulo-scheduled
vpsrlq $30,%ymm10,%ymm10
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm1,%ymm1 # h0 -> h1
vpsrlq $40,%ymm6,%ymm6 # 4
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4 # h3 -> h4
vpand %ymm5,%ymm7,%ymm7 # 0
vpand %ymm5,%ymm8,%ymm8 # 1
vpand %ymm5,%ymm10,%ymm10 # 3
vpor 32(%rcx),%ymm6,%ymm6 # padbit, yes, always
sub $64,%rdx
jnz .Loop_avx2_avx512
.byte 0x66,0x90 # 2-byte nop
.Ltail_avx2_avx512:
################################################################
# while above multiplications were by r^4 in all lanes, in last
# iteration we multiply least significant lane by r^4 and most
# significant one by r, so copy of above except that references
# to the precomputed table are displaced by 4...
#vpaddq %ymm2,%ymm9,%ymm2 # accumulate input
vpaddq %ymm0,%ymm7,%ymm0
vmovdqu 4(%rsp),%ymm7 # r0^4
vpaddq %ymm1,%ymm8,%ymm1
vmovdqu 36(%rsp),%ymm8 # r1^4
vpaddq %ymm3,%ymm10,%ymm3
vmovdqu 100(%rsp),%ymm9 # r2^4
vpaddq %ymm4,%ymm6,%ymm4
vmovdqu 52(%rax),%ymm10 # s3^4
vmovdqu 116(%rax),%ymm5 # s4^4
vpmuludq %ymm2,%ymm7,%ymm13 # d2 = h2*r0
vpmuludq %ymm2,%ymm8,%ymm14 # d3 = h2*r1
vpmuludq %ymm2,%ymm9,%ymm15 # d4 = h2*r2
vpmuludq %ymm2,%ymm10,%ymm11 # d0 = h2*s3
vpmuludq %ymm2,%ymm5,%ymm12 # d1 = h2*s4
vpmuludq %ymm0,%ymm8,%ymm6 # h0*r1
vpmuludq %ymm1,%ymm8,%ymm2 # h1*r1
vpaddq %ymm6,%ymm12,%ymm12 # d1 += h0*r1
vpaddq %ymm2,%ymm13,%ymm13 # d2 += h1*r1
vpmuludq %ymm3,%ymm8,%ymm6 # h3*r1
vpmuludq 68(%rsp),%ymm4,%ymm2 # h4*s1
vpaddq %ymm6,%ymm15,%ymm15 # d4 += h3*r1
vpaddq %ymm2,%ymm11,%ymm11 # d0 += h4*s1
vpmuludq %ymm0,%ymm7,%ymm6 # h0*r0
vpmuludq %ymm1,%ymm7,%ymm2 # h1*r0
vpaddq %ymm6,%ymm11,%ymm11 # d0 += h0*r0
vmovdqu -12(%rax),%ymm8 # s2
vpaddq %ymm2,%ymm12,%ymm12 # d1 += h1*r0
vpmuludq %ymm3,%ymm7,%ymm6 # h3*r0
vpmuludq %ymm4,%ymm7,%ymm2 # h4*r0
vpaddq %ymm6,%ymm14,%ymm14 # d3 += h3*r0
vpaddq %ymm2,%ymm15,%ymm15 # d4 += h4*r0
vpmuludq %ymm3,%ymm8,%ymm6 # h3*s2
vpmuludq %ymm4,%ymm8,%ymm2 # h4*s2
vpaddq %ymm6,%ymm11,%ymm11 # d0 += h3*s2
vpaddq %ymm2,%ymm12,%ymm12 # d1 += h4*s2
vmovdqu 20(%rax),%ymm2 # r3
vpmuludq %ymm1,%ymm9,%ymm6 # h1*r2
vpmuludq %ymm0,%ymm9,%ymm9 # h0*r2
vpaddq %ymm6,%ymm14,%ymm14 # d3 += h1*r2
vpaddq %ymm9,%ymm13,%ymm13 # d2 += h0*r2
vpmuludq %ymm1,%ymm2,%ymm6 # h1*r3
vpmuludq %ymm0,%ymm2,%ymm2 # h0*r3
vpaddq %ymm6,%ymm15,%ymm15 # d4 += h1*r3
vpaddq %ymm2,%ymm14,%ymm14 # d3 += h0*r3
vpmuludq %ymm3,%ymm10,%ymm6 # h3*s3
vpmuludq %ymm4,%ymm10,%ymm2 # h4*s3
vpaddq %ymm6,%ymm12,%ymm12 # d1 += h3*s3
vpaddq %ymm2,%ymm13,%ymm13 # d2 += h4*s3
vpmuludq %ymm3,%ymm5,%ymm3 # h3*s4
vpmuludq %ymm4,%ymm5,%ymm4 # h4*s4
vpaddq %ymm3,%ymm13,%ymm2 # h2 = d2 + h3*s4
vpaddq %ymm4,%ymm14,%ymm3 # h3 = d3 + h4*s4
vpmuludq 84(%rax),%ymm0,%ymm4 # h0*r4
vpmuludq %ymm1,%ymm5,%ymm0 # h1*s4
vmovdqa 64(%rcx),%ymm5 # .Lmask26
vpaddq %ymm4,%ymm15,%ymm4 # h4 = d4 + h0*r4
vpaddq %ymm0,%ymm11,%ymm0 # h0 = d0 + h1*s4
################################################################
# horizontal addition
vpsrldq $8,%ymm12,%ymm8
vpsrldq $8,%ymm2,%ymm9
vpsrldq $8,%ymm3,%ymm10
vpsrldq $8,%ymm4,%ymm6
vpsrldq $8,%ymm0,%ymm7
vpaddq %ymm8,%ymm12,%ymm12
vpaddq %ymm9,%ymm2,%ymm2
vpaddq %ymm10,%ymm3,%ymm3
vpaddq %ymm6,%ymm4,%ymm4
vpaddq %ymm7,%ymm0,%ymm0
vpermq $0x2,%ymm3,%ymm10
vpermq $0x2,%ymm4,%ymm6
vpermq $0x2,%ymm0,%ymm7
vpermq $0x2,%ymm12,%ymm8
vpermq $0x2,%ymm2,%ymm9
vpaddq %ymm10,%ymm3,%ymm3
vpaddq %ymm6,%ymm4,%ymm4
vpaddq %ymm7,%ymm0,%ymm0
vpaddq %ymm8,%ymm12,%ymm12
vpaddq %ymm9,%ymm2,%ymm2
################################################################
# lazy reduction
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4 # h3 -> h4
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm12,%ymm1 # h0 -> h1
vpsrlq $26,%ymm4,%ymm15
vpand %ymm5,%ymm4,%ymm4
vpsrlq $26,%ymm1,%ymm12
vpand %ymm5,%ymm1,%ymm1
vpaddq %ymm12,%ymm2,%ymm2 # h1 -> h2
vpaddq %ymm15,%ymm0,%ymm0
vpsllq $2,%ymm15,%ymm15
vpaddq %ymm15,%ymm0,%ymm0 # h4 -> h0
vpsrlq $26,%ymm2,%ymm13
vpand %ymm5,%ymm2,%ymm2
vpaddq %ymm13,%ymm3,%ymm3 # h2 -> h3
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm1,%ymm1 # h0 -> h1
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4 # h3 -> h4
vmovd %xmm0,-112(%rdi) # save partially reduced
vmovd %xmm1,-108(%rdi)
vmovd %xmm2,-104(%rdi)
vmovd %xmm3,-100(%rdi)
vmovd %xmm4,-96(%rdi)
lea -8(%r10),%rsp
vzeroupper
RET
.Lblocks_avx512:
mov $15,%eax
kmovw %eax,%k2
lea 8(%rsp),%r10
sub $0x128,%rsp
lea .Lconst(%rip),%rcx
lea 48+64(%rdi),%rdi # size optimization
vmovdqa 96(%rcx),%ymm9 # .Lpermd_avx2
# expand pre-calculated table
vmovdqu -64(%rdi),%xmm11 # will become expanded %zmm16
and $-512,%rsp
vmovdqu -48(%rdi),%xmm12 # will become ... %zmm17
mov $0x20,%rax
vmovdqu -32(%rdi),%xmm7 # ... %zmm21
vmovdqu -16(%rdi),%xmm13 # ... %zmm18
vmovdqu 0(%rdi),%xmm8 # ... %zmm22
vmovdqu 16(%rdi),%xmm14 # ... %zmm19
vmovdqu 32(%rdi),%xmm10 # ... %zmm23
vmovdqu 48(%rdi),%xmm15 # ... %zmm20
vmovdqu 64(%rdi),%xmm6 # ... %zmm24
vpermd %zmm11,%zmm9,%zmm16 # 00003412 -> 14243444
vpbroadcastq 64(%rcx),%zmm5 # .Lmask26
vpermd %zmm12,%zmm9,%zmm17
vpermd %zmm7,%zmm9,%zmm21
vpermd %zmm13,%zmm9,%zmm18
vmovdqa64 %zmm16,0x00(%rsp){%k2} # save in case %rdx % 128 != 0
vpsrlq $32,%zmm16,%zmm7 # 14243444 -> 01020304
vpermd %zmm8,%zmm9,%zmm22
vmovdqu64 %zmm17,0x00(%rsp,%rax){%k2}
vpsrlq $32,%zmm17,%zmm8
vpermd %zmm14,%zmm9,%zmm19
vmovdqa64 %zmm21,0x40(%rsp){%k2}
vpermd %zmm10,%zmm9,%zmm23
vpermd %zmm15,%zmm9,%zmm20
vmovdqu64 %zmm18,0x40(%rsp,%rax){%k2}
vpermd %zmm6,%zmm9,%zmm24
vmovdqa64 %zmm22,0x80(%rsp){%k2}
vmovdqu64 %zmm19,0x80(%rsp,%rax){%k2}
vmovdqa64 %zmm23,0xc0(%rsp){%k2}
vmovdqu64 %zmm20,0xc0(%rsp,%rax){%k2}
vmovdqa64 %zmm24,0x100(%rsp){%k2}
################################################################
# calculate 5th through 8th powers of the key
#
# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
# d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
# d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
# d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
# d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
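# in plain C the first row would read (a sketch with 32x32->64-bit
# multiplies on 26-bit limbs, not the vector code itself):
#     d0 = (u64)r0p*r0 + (u64)r1p*s4 + (u64)r2p*s3
#        + (u64)r3p*s2 + (u64)r4p*s1;   /* sN = 5*rN, rNp = primed rN */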
vpmuludq %zmm7,%zmm16,%zmm11 # d0 = r0'*r0
vpmuludq %zmm7,%zmm17,%zmm12 # d1 = r0'*r1
vpmuludq %zmm7,%zmm18,%zmm13 # d2 = r0'*r2
vpmuludq %zmm7,%zmm19,%zmm14 # d3 = r0'*r3
vpmuludq %zmm7,%zmm20,%zmm15 # d4 = r0'*r4
vpsrlq $32,%zmm18,%zmm9
vpmuludq %zmm8,%zmm24,%zmm25
vpmuludq %zmm8,%zmm16,%zmm26
vpmuludq %zmm8,%zmm17,%zmm27
vpmuludq %zmm8,%zmm18,%zmm28
vpmuludq %zmm8,%zmm19,%zmm29
vpsrlq $32,%zmm19,%zmm10
vpaddq %zmm25,%zmm11,%zmm11 # d0 += r1'*5*r4
vpaddq %zmm26,%zmm12,%zmm12 # d1 += r1'*r0
vpaddq %zmm27,%zmm13,%zmm13 # d2 += r1'*r1
vpaddq %zmm28,%zmm14,%zmm14 # d3 += r1'*r2
vpaddq %zmm29,%zmm15,%zmm15 # d4 += r1'*r3
vpmuludq %zmm9,%zmm23,%zmm25
vpmuludq %zmm9,%zmm24,%zmm26
vpmuludq %zmm9,%zmm17,%zmm28
vpmuludq %zmm9,%zmm18,%zmm29
vpmuludq %zmm9,%zmm16,%zmm27
vpsrlq $32,%zmm20,%zmm6
vpaddq %zmm25,%zmm11,%zmm11 # d0 += r2'*5*r3
vpaddq %zmm26,%zmm12,%zmm12 # d1 += r2'*5*r4
vpaddq %zmm28,%zmm14,%zmm14 # d3 += r2'*r1
vpaddq %zmm29,%zmm15,%zmm15 # d4 += r2'*r2
vpaddq %zmm27,%zmm13,%zmm13 # d2 += r2'*r0
vpmuludq %zmm10,%zmm22,%zmm25
vpmuludq %zmm10,%zmm16,%zmm28
vpmuludq %zmm10,%zmm17,%zmm29
vpmuludq %zmm10,%zmm23,%zmm26
vpmuludq %zmm10,%zmm24,%zmm27
vpaddq %zmm25,%zmm11,%zmm11 # d0 += r3'*5*r2
vpaddq %zmm28,%zmm14,%zmm14 # d3 += r3'*r0
vpaddq %zmm29,%zmm15,%zmm15 # d4 += r3'*r1
vpaddq %zmm26,%zmm12,%zmm12 # d1 += r3'*5*r3
vpaddq %zmm27,%zmm13,%zmm13 # d2 += r3'*5*r4
vpmuludq %zmm6,%zmm24,%zmm28
vpmuludq %zmm6,%zmm16,%zmm29
vpmuludq %zmm6,%zmm21,%zmm25
vpmuludq %zmm6,%zmm22,%zmm26
vpmuludq %zmm6,%zmm23,%zmm27
vpaddq %zmm28,%zmm14,%zmm14 # d3 += r4'*5*r4
vpaddq %zmm29,%zmm15,%zmm15 # d4 += r4'*r0
vpaddq %zmm25,%zmm11,%zmm11 # d0 += r4'*5*r1
vpaddq %zmm26,%zmm12,%zmm12 # d1 += r4'*5*r2
vpaddq %zmm27,%zmm13,%zmm13 # d2 += r4'*5*r3
################################################################
# load input
vmovdqu64 16*0(%rsi),%zmm10
vmovdqu64 16*4(%rsi),%zmm6
lea 16*8(%rsi),%rsi
################################################################
# lazy reduction
vpsrlq $26,%zmm14,%zmm28
vpandq %zmm5,%zmm14,%zmm14
vpaddq %zmm28,%zmm15,%zmm15 # d3 -> d4
vpsrlq $26,%zmm11,%zmm25
vpandq %zmm5,%zmm11,%zmm11
vpaddq %zmm25,%zmm12,%zmm12 # d0 -> d1
vpsrlq $26,%zmm15,%zmm29
vpandq %zmm5,%zmm15,%zmm15
vpsrlq $26,%zmm12,%zmm26
vpandq %zmm5,%zmm12,%zmm12
vpaddq %zmm26,%zmm13,%zmm13 # d1 -> d2
vpaddq %zmm29,%zmm11,%zmm11
vpsllq $2,%zmm29,%zmm29
vpaddq %zmm29,%zmm11,%zmm11 # d4 -> d0
vpsrlq $26,%zmm13,%zmm27
vpandq %zmm5,%zmm13,%zmm13
vpaddq %zmm27,%zmm14,%zmm14 # d2 -> d3
vpsrlq $26,%zmm11,%zmm25
vpandq %zmm5,%zmm11,%zmm11
vpaddq %zmm25,%zmm12,%zmm12 # d0 -> d1
vpsrlq $26,%zmm14,%zmm28
vpandq %zmm5,%zmm14,%zmm14
vpaddq %zmm28,%zmm15,%zmm15 # d3 -> d4
################################################################
# at this point we have 14243444 in %zmm16-%zmm24 and 05060708 in
# %zmm11-%zmm15, ...
vpunpcklqdq %zmm6,%zmm10,%zmm7 # transpose input
vpunpckhqdq %zmm6,%zmm10,%zmm6
# ... since input 64-bit lanes are ordered as 73625140, we could
# "vperm" it to 76543210 (here and in each loop iteration), *or*
# we could just flow along, hence the goal for %zmm16-%zmm24 is
# 1858286838784888 ...
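# (the 73625140 order falls out of the vpunpck{l,h}qdq above: 128-bit
# lane k of the transposed registers carries blocks k and k+4, so the
# qwords read from most to least significant are 7 3 6 2 5 1 4 0)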
vmovdqa32 128(%rcx),%zmm25 # .Lpermd_avx512:
mov $0x7777,%eax
kmovw %eax,%k1
vpermd %zmm16,%zmm25,%zmm16 # 14243444 -> 1---2---3---4---
vpermd %zmm17,%zmm25,%zmm17
vpermd %zmm18,%zmm25,%zmm18
vpermd %zmm19,%zmm25,%zmm19
vpermd %zmm20,%zmm25,%zmm20
vpermd %zmm11,%zmm25,%zmm16{%k1} # 05060708 -> 1858286838784888
vpermd %zmm12,%zmm25,%zmm17{%k1}
vpermd %zmm13,%zmm25,%zmm18{%k1}
vpermd %zmm14,%zmm25,%zmm19{%k1}
vpermd %zmm15,%zmm25,%zmm20{%k1}
vpslld $2,%zmm17,%zmm21 # *5
vpslld $2,%zmm18,%zmm22
vpslld $2,%zmm19,%zmm23
vpslld $2,%zmm20,%zmm24
vpaddd %zmm17,%zmm21,%zmm21
vpaddd %zmm18,%zmm22,%zmm22
vpaddd %zmm19,%zmm23,%zmm23
vpaddd %zmm20,%zmm24,%zmm24
vpbroadcastq 32(%rcx),%zmm30 # .L129
vpsrlq $52,%zmm7,%zmm9 # splat input
vpsllq $12,%zmm6,%zmm10
vporq %zmm10,%zmm9,%zmm9
vpsrlq $26,%zmm7,%zmm8
vpsrlq $14,%zmm6,%zmm10
vpsrlq $40,%zmm6,%zmm6 # 4
vpandq %zmm5,%zmm9,%zmm9 # 2
vpandq %zmm5,%zmm7,%zmm7 # 0
#vpandq %zmm5,%zmm8,%zmm8 # 1
#vpandq %zmm5,%zmm10,%zmm10 # 3
#vporq %zmm30,%zmm6,%zmm6 # padbit, yes, always
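# the splat above splits each 16-byte block, viewed as qwords t0:t1,
# into five 26-bit limbs; in C terms (sketch):
#     l0 = t0 & 0x3ffffff;           l1 = (t0 >> 26) & 0x3ffffff;
#     l2 = ((t0 >> 52) | (t1 << 12)) & 0x3ffffff;
#     l3 = (t1 >> 14) & 0x3ffffff;   l4 = (t1 >> 40) | (1 << 24);
# masking limbs 1 and 3 and OR-ing in the 2^24 padbit are deferred to
# the loop body (the commented-out lines above), where they overlap
# the multiplications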
vpaddq %zmm2,%zmm9,%zmm2 # accumulate input
sub $192,%rdx
jbe .Ltail_avx512
jmp .Loop_avx512
.align 32
.Loop_avx512:
################################################################
# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
# \________/\___________/
################################################################
#vpaddq %zmm2,%zmm9,%zmm2 # accumulate input
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
#
# however, as h2 is "chronologically" the first limb available, its
# multiplications are pulled up front, so the order becomes
#
# d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
# d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
# d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
# d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
# d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
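# put differently, lane i accumulates inp[i], inp[i+8], inp[i+16], ...
# and is multiplied by r^8 every pass; one lane in scalar C (a sketch,
# with each m[j] already carrying the 2^128 pad bit):
#     h = 0;
#     for (j = i; j < n; j += 8)
#         h = (h + m[j]) * r8 % p;        /* p = 2^130 - 5 */
#     /* the tail later multiplies lane i by r^(8-i) and sums the lanes */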
vpmuludq %zmm2,%zmm17,%zmm14 # d3 = h2*r1
vpaddq %zmm0,%zmm7,%zmm0
vpmuludq %zmm2,%zmm18,%zmm15 # d4 = h2*r2
vpandq %zmm5,%zmm8,%zmm8 # 1
vpmuludq %zmm2,%zmm23,%zmm11 # d0 = h2*s3
vpandq %zmm5,%zmm10,%zmm10 # 3
vpmuludq %zmm2,%zmm24,%zmm12 # d1 = h2*s4
vporq %zmm30,%zmm6,%zmm6 # padbit, yes, always
vpmuludq %zmm2,%zmm16,%zmm13 # d2 = h2*r0
vpaddq %zmm1,%zmm8,%zmm1 # accumulate input
vpaddq %zmm3,%zmm10,%zmm3
vpaddq %zmm4,%zmm6,%zmm4
vmovdqu64 16*0(%rsi),%zmm10 # load input
vmovdqu64 16*4(%rsi),%zmm6
lea 16*8(%rsi),%rsi
vpmuludq %zmm0,%zmm19,%zmm28
vpmuludq %zmm0,%zmm20,%zmm29
vpmuludq %zmm0,%zmm16,%zmm25
vpmuludq %zmm0,%zmm17,%zmm26
vpaddq %zmm28,%zmm14,%zmm14 # d3 += h0*r3
vpaddq %zmm29,%zmm15,%zmm15 # d4 += h0*r4
vpaddq %zmm25,%zmm11,%zmm11 # d0 += h0*r0
vpaddq %zmm26,%zmm12,%zmm12 # d1 += h0*r1
vpmuludq %zmm1,%zmm18,%zmm28
vpmuludq %zmm1,%zmm19,%zmm29
vpmuludq %zmm1,%zmm24,%zmm25
vpmuludq %zmm0,%zmm18,%zmm27
vpaddq %zmm28,%zmm14,%zmm14 # d3 += h1*r2
vpaddq %zmm29,%zmm15,%zmm15 # d4 += h1*r3
vpaddq %zmm25,%zmm11,%zmm11 # d0 += h1*s4
vpaddq %zmm27,%zmm13,%zmm13 # d2 += h0*r2
vpunpcklqdq %zmm6,%zmm10,%zmm7 # transpose input
vpunpckhqdq %zmm6,%zmm10,%zmm6
vpmuludq %zmm3,%zmm16,%zmm28
vpmuludq %zmm3,%zmm17,%zmm29
vpmuludq %zmm1,%zmm16,%zmm26
vpmuludq %zmm1,%zmm17,%zmm27
vpaddq %zmm28,%zmm14,%zmm14 # d3 += h3*r0
vpaddq %zmm29,%zmm15,%zmm15 # d4 += h3*r1
vpaddq %zmm26,%zmm12,%zmm12 # d1 += h1*r0
vpaddq %zmm27,%zmm13,%zmm13 # d2 += h1*r1
vpmuludq %zmm4,%zmm24,%zmm28
vpmuludq %zmm4,%zmm16,%zmm29
vpmuludq %zmm3,%zmm22,%zmm25
vpmuludq %zmm3,%zmm23,%zmm26
vpaddq %zmm28,%zmm14,%zmm14 # d3 += h4*s4
vpmuludq %zmm3,%zmm24,%zmm27
vpaddq %zmm29,%zmm15,%zmm15 # d4 += h4*r0
vpaddq %zmm25,%zmm11,%zmm11 # d0 += h3*s2
vpaddq %zmm26,%zmm12,%zmm12 # d1 += h3*s3
vpaddq %zmm27,%zmm13,%zmm13 # d2 += h3*s4
vpmuludq %zmm4,%zmm21,%zmm25
vpmuludq %zmm4,%zmm22,%zmm26
vpmuludq %zmm4,%zmm23,%zmm27
vpaddq %zmm25,%zmm11,%zmm0 # h0 = d0 + h4*s1
vpaddq %zmm26,%zmm12,%zmm1 # h1 = d1 + h4*s2
vpaddq %zmm27,%zmm13,%zmm2 # h2 = d2 + h4*s3
################################################################
# lazy reduction (interleaved with input splat)
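# the carry chain is the same as in the reductions above; the vpsrlq/
# vpsllq/vporq lines woven between its steps are the 26-bit limb split
# of the input block loaded a few instructions earlier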
vpsrlq $52,%zmm7,%zmm9 # splat input
vpsllq $12,%zmm6,%zmm10
vpsrlq $26,%zmm14,%zmm3
vpandq %zmm5,%zmm14,%zmm14
vpaddq %zmm3,%zmm15,%zmm4 # h3 -> h4
vporq %zmm10,%zmm9,%zmm9
vpsrlq $26,%zmm0,%zmm11
vpandq %zmm5,%zmm0,%zmm0
vpaddq %zmm11,%zmm1,%zmm1 # h0 -> h1
vpandq %zmm5,%zmm9,%zmm9 # 2
vpsrlq $26,%zmm4,%zmm15
vpandq %zmm5,%zmm4,%zmm4
vpsrlq $26,%zmm1,%zmm12
vpandq %zmm5,%zmm1,%zmm1
vpaddq %zmm12,%zmm2,%zmm2 # h1 -> h2
vpaddq %zmm15,%zmm0,%zmm0
vpsllq $2,%zmm15,%zmm15
vpaddq %zmm15,%zmm0,%zmm0 # h4 -> h0
vpaddq %zmm9,%zmm2,%zmm2 # modulo-scheduled
vpsrlq $26,%zmm7,%zmm8
vpsrlq $26,%zmm2,%zmm13
vpandq %zmm5,%zmm2,%zmm2
vpaddq %zmm13,%zmm14,%zmm3 # h2 -> h3
vpsrlq $14,%zmm6,%zmm10
vpsrlq $26,%zmm0,%zmm11
vpandq %zmm5,%zmm0,%zmm0
vpaddq %zmm11,%zmm1,%zmm1 # h0 -> h1
vpsrlq $40,%zmm6,%zmm6 # 4
vpsrlq $26,%zmm3,%zmm14
vpandq %zmm5,%zmm3,%zmm3
vpaddq %zmm14,%zmm4,%zmm4 # h3 -> h4
vpandq %zmm5,%zmm7,%zmm7 # 0
#vpandq %zmm5,%zmm8,%zmm8 # 1
#vpandq %zmm5,%zmm10,%zmm10 # 3
#vporq %zmm30,%zmm6,%zmm6 # padbit, yes, always
sub $128,%rdx
ja .Loop_avx512
.Ltail_avx512:
################################################################
# while the multiplications above were by r^8 in all lanes, in the
# last iteration we multiply the least significant lane by r^8 and
# the most significant one by r, which is why the table gets shifted...
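# each qword of the table registers pairs the power a lane needs for
# this pass, r^(8-k) for the lane carrying block k, in its upper dword
# with the r^8 used so far in the lower dword (the half vpmuludq
# reads); the 32-bit right shifts below drop the per-lane powers into
# place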
vpsrlq $32,%zmm16,%zmm16 # 0105020603070408
vpsrlq $32,%zmm17,%zmm17
vpsrlq $32,%zmm18,%zmm18
vpsrlq $32,%zmm23,%zmm23
vpsrlq $32,%zmm24,%zmm24
vpsrlq $32,%zmm19,%zmm19
vpsrlq $32,%zmm20,%zmm20
vpsrlq $32,%zmm21,%zmm21
vpsrlq $32,%zmm22,%zmm22
################################################################
# load either the next or the last 64 bytes of input
lea (%rsi,%rdx),%rsi
#vpaddq %zmm2,%zmm9,%zmm2 # accumulate input
vpaddq %zmm0,%zmm7,%zmm0
vpmuludq %zmm2,%zmm17,%zmm14 # d3 = h2*r1
vpmuludq %zmm2,%zmm18,%zmm15 # d4 = h2*r2
vpmuludq %zmm2,%zmm23,%zmm11 # d0 = h2*s3
vpandq %zmm5,%zmm8,%zmm8 # 1
vpmuludq %zmm2,%zmm24,%zmm12 # d1 = h2*s4
vpandq %zmm5,%zmm10,%zmm10 # 3
vpmuludq %zmm2,%zmm16,%zmm13 # d2 = h2*r0
vporq %zmm30,%zmm6,%zmm6 # padbit, yes, always
vpaddq %zmm1,%zmm8,%zmm1 # accumulate input
vpaddq %zmm3,%zmm10,%zmm3
vpaddq %zmm4,%zmm6,%zmm4
vmovdqu 16*0(%rsi),%xmm7
vpmuludq %zmm0,%zmm19,%zmm28
vpmuludq %zmm0,%zmm20,%zmm29
vpmuludq %zmm0,%zmm16,%zmm25
vpmuludq %zmm0,%zmm17,%zmm26
vpaddq %zmm28,%zmm14,%zmm14 # d3 += h0*r3
vpaddq %zmm29,%zmm15,%zmm15 # d4 += h0*r4
vpaddq %zmm25,%zmm11,%zmm11 # d0 += h0*r0
vpaddq %zmm26,%zmm12,%zmm12 # d1 += h0*r1
vmovdqu 16*1(%rsi),%xmm8
vpmuludq %zmm1,%zmm18,%zmm28
vpmuludq %zmm1,%zmm19,%zmm29
vpmuludq %zmm1,%zmm24,%zmm25
vpmuludq %zmm0,%zmm18,%zmm27
vpaddq %zmm28,%zmm14,%zmm14 # d3 += h1*r2
vpaddq %zmm29,%zmm15,%zmm15 # d4 += h1*r3
vpaddq %zmm25,%zmm11,%zmm11 # d0 += h1*s4
vpaddq %zmm27,%zmm13,%zmm13 # d2 += h0*r2
vinserti128 $1,16*2(%rsi),%ymm7,%ymm7
vpmuludq %zmm3,%zmm16,%zmm28
vpmuludq %zmm3,%zmm17,%zmm29
vpmuludq %zmm1,%zmm16,%zmm26
vpmuludq %zmm1,%zmm17,%zmm27
vpaddq %zmm28,%zmm14,%zmm14 # d3 += h3*r0
vpaddq %zmm29,%zmm15,%zmm15 # d4 += h3*r1
vpaddq %zmm26,%zmm12,%zmm12 # d1 += h1*r0
vpaddq %zmm27,%zmm13,%zmm13 # d2 += h1*r1
vinserti128 $1,16*3(%rsi),%ymm8,%ymm8
vpmuludq %zmm4,%zmm24,%zmm28
vpmuludq %zmm4,%zmm16,%zmm29
vpmuludq %zmm3,%zmm22,%zmm25
vpmuludq %zmm3,%zmm23,%zmm26
vpmuludq %zmm3,%zmm24,%zmm27
vpaddq %zmm28,%zmm14,%zmm3 # h3 = d3 + h4*s4
vpaddq %zmm29,%zmm15,%zmm15 # d4 += h4*r0
vpaddq %zmm25,%zmm11,%zmm11 # d0 += h3*s2
vpaddq %zmm26,%zmm12,%zmm12 # d1 += h3*s3
vpaddq %zmm27,%zmm13,%zmm13 # d2 += h3*s4
vpmuludq %zmm4,%zmm21,%zmm25
vpmuludq %zmm4,%zmm22,%zmm26
vpmuludq %zmm4,%zmm23,%zmm27
vpaddq %zmm25,%zmm11,%zmm0 # h0 = d0 + h4*s1
vpaddq %zmm26,%zmm12,%zmm1 # h1 = d1 + h4*s2
vpaddq %zmm27,%zmm13,%zmm2 # h2 = d2 + h4*s3
################################################################
# horizontal addition
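# same idea as the AVX2 horizontal addition above, but across eight
# qword lanes: vpermq first folds neighboring qwords, then the 128-bit
# halves of each 256-bit half; vextracti64x4 folds the two 256-bit
# halves, and the {%k3}{z} adds keep only qword 0 so the result can be
# handed to .Ltail_avx2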
mov $1,%eax
vpermq $0xb1,%zmm3,%zmm14
vpermq $0xb1,%zmm15,%zmm4
vpermq $0xb1,%zmm0,%zmm11
vpermq $0xb1,%zmm1,%zmm12
vpermq $0xb1,%zmm2,%zmm13
vpaddq %zmm14,%zmm3,%zmm3
vpaddq %zmm15,%zmm4,%zmm4
vpaddq %zmm11,%zmm0,%zmm0
vpaddq %zmm12,%zmm1,%zmm1
vpaddq %zmm13,%zmm2,%zmm2
kmovw %eax,%k3
vpermq $0x2,%zmm3,%zmm14
vpermq $0x2,%zmm4,%zmm15
vpermq $0x2,%zmm0,%zmm11
vpermq $0x2,%zmm1,%zmm12
vpermq $0x2,%zmm2,%zmm13
vpaddq %zmm14,%zmm3,%zmm3
vpaddq %zmm15,%zmm4,%zmm4
vpaddq %zmm11,%zmm0,%zmm0
vpaddq %zmm12,%zmm1,%zmm1
vpaddq %zmm13,%zmm2,%zmm2
vextracti64x4 $0x1,%zmm3,%ymm14
vextracti64x4 $0x1,%zmm4,%ymm15
vextracti64x4 $0x1,%zmm0,%ymm11
vextracti64x4 $0x1,%zmm1,%ymm12
vextracti64x4 $0x1,%zmm2,%ymm13
vpaddq %zmm14,%zmm3,%zmm3{%k3}{z} # keep single qword in case
vpaddq %zmm15,%zmm4,%zmm4{%k3}{z} # it's passed to .Ltail_avx2
vpaddq %zmm11,%zmm0,%zmm0{%k3}{z}
vpaddq %zmm12,%zmm1,%zmm1{%k3}{z}
vpaddq %zmm13,%zmm2,%zmm2{%k3}{z}
################################################################
# lazy reduction (interleaved with input splat)
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpsrldq $6,%ymm7,%ymm9 # splat input
vpsrldq $6,%ymm8,%ymm10
vpunpckhqdq %ymm8,%ymm7,%ymm6 # 4
vpaddq %ymm14,%ymm4,%ymm4 # h3 -> h4
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpunpcklqdq %ymm10,%ymm9,%ymm9 # 2:3
vpunpcklqdq %ymm8,%ymm7,%ymm7 # 0:1
vpaddq %ymm11,%ymm1,%ymm1 # h0 -> h1
vpsrlq $26,%ymm4,%ymm15
vpand %ymm5,%ymm4,%ymm4
vpsrlq $26,%ymm1,%ymm12
vpand %ymm5,%ymm1,%ymm1
vpsrlq $30,%ymm9,%ymm10
vpsrlq $4,%ymm9,%ymm9
vpaddq %ymm12,%ymm2,%ymm2 # h1 -> h2
vpaddq %ymm15,%ymm0,%ymm0
vpsllq $2,%ymm15,%ymm15
vpsrlq $26,%ymm7,%ymm8
vpsrlq $40,%ymm6,%ymm6 # 4
vpaddq %ymm15,%ymm0,%ymm0 # h4 -> h0
vpsrlq $26,%ymm2,%ymm13
vpand %ymm5,%ymm2,%ymm2
vpand %ymm5,%ymm9,%ymm9 # 2
vpand %ymm5,%ymm7,%ymm7 # 0
vpaddq %ymm13,%ymm3,%ymm3 # h2 -> h3
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm2,%ymm9,%ymm2 # accumulate input for .Ltail_avx2
vpand %ymm5,%ymm8,%ymm8 # 1
vpaddq %ymm11,%ymm1,%ymm1 # h0 -> h1
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpand %ymm5,%ymm10,%ymm10 # 3
vpor 32(%rcx),%ymm6,%ymm6 # padbit, yes, always
vpaddq %ymm14,%ymm4,%ymm4 # h3 -> h4
lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
add $64,%rdx
jnz .Ltail_avx2_avx512
vpsubq %ymm9,%ymm2,%ymm2 # undo input accumulation
vmovd %xmm0,-112(%rdi) # save partially reduced
vmovd %xmm1,-108(%rdi)
vmovd %xmm2,-104(%rdi)
vmovd %xmm3,-100(%rdi)
vmovd %xmm4,-96(%rdi)
vzeroall
lea -8(%r10),%rsp
RET
SYM_FUNC_END(poly1305_blocks_avx512)
#endif