#include <linux/linkage.h>
#define VMOVDQ vmovdqu
#define xdata0 %xmm0
#define xdata1 %xmm1
#define xdata2 %xmm2
#define xdata3 %xmm3
#define xdata4 %xmm4
#define xdata5 %xmm5
#define xdata6 %xmm6
#define xdata7 %xmm7
#define xcounter %xmm8
#define xiv %xmm8
#define xbyteswap %xmm9
#define xtmp %xmm9
#define xkey0 %xmm10
#define xkey4 %xmm11
#define xkey8 %xmm12
#define xkey12 %xmm13
#define xkeyA %xmm14
#define xkeyB %xmm15
#define p_in %rdi
#define p_iv %rsi
#define p_keys %rdx
#define p_out %rcx
#define num_bytes %r8
#define counter %r9
#define tmp %r10
#define DDQ_DATA 0
#define XDATA 1
#define KEY_128 1
#define KEY_192 2
#define KEY_256 3
.section .rodata
.align 16
byteswap_const:
.octa 0x000102030405060708090A0B0C0D0E0F
ddq_low_msk:
.octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
.octa 0x00000000000000010000000000000000
ddq_add_1:
.octa 0x00000000000000000000000000000001
ddq_add_2:
.octa 0x00000000000000000000000000000002
ddq_add_3:
.octa 0x00000000000000000000000000000003
ddq_add_4:
.octa 0x00000000000000000000000000000004
ddq_add_5:
.octa 0x00000000000000000000000000000005
ddq_add_6:
.octa 0x00000000000000000000000000000006
ddq_add_7:
.octa 0x00000000000000000000000000000007
ddq_add_8:
.octa 0x00000000000000000000000000000008
.text
.macro setxdata n
var_xdata = %xmm\n
.endm
.macro club name, id
.altmacro
.if \name == XDATA
setxdata %\id
.endif
.noaltmacro
.endm
.macro do_aes b, k, key_len, xctr
.set by, \b
.set load_keys, \k
.set klen, \key_len
.if (load_keys)
vmovdqa 0*16(p_keys), xkey0
.endif
.if \xctr
movq counter, xtmp
.set i, 0
.rept (by)
club XDATA, i
vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
.set i, (i +1)
.endr
.set i, 0
.rept (by)
club XDATA, i
vpxor xiv, var_xdata, var_xdata
.set i, (i +1)
.endr
.else
vpshufb xbyteswap, xcounter, xdata0
.set i, 1
.rept (by - 1)
club XDATA, i
vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
vptest ddq_low_msk(%rip), var_xdata
jnz 1f
vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
1:
vpshufb xbyteswap, var_xdata, var_xdata
.set i, (i +1)
.endr
.endif
vmovdqa 1*16(p_keys), xkeyA
vpxor xkey0, xdata0, xdata0
.if \xctr
add $by, counter
.else
vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
vptest ddq_low_msk(%rip), xcounter
jnz 1f
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
1:
.endif
.set i, 1
.rept (by - 1)
club XDATA, i
vpxor xkey0, var_xdata, var_xdata
.set i, (i +1)
.endr
vmovdqa 2*16(p_keys), xkeyB
.set i, 0
.rept by
club XDATA, i
vaesenc xkeyA, var_xdata, var_xdata
.set i, (i +1)
.endr
.if (klen == KEY_128)
.if (load_keys)
vmovdqa 3*16(p_keys), xkey4
.endif
.else
vmovdqa 3*16(p_keys), xkeyA
.endif
.set i, 0
.rept by
club XDATA, i
vaesenc xkeyB, var_xdata, var_xdata
.set i, (i +1)
.endr
add $(16*by), p_in
.if (klen == KEY_128)
vmovdqa 4*16(p_keys), xkeyB
.else
.if (load_keys)
vmovdqa 4*16(p_keys), xkey4
.endif
.endif
.set i, 0
.rept by
club XDATA, i
.if (klen == KEY_128)
vaesenc xkey4, var_xdata, var_xdata
.else
vaesenc xkeyA, var_xdata, var_xdata
.endif
.set i, (i +1)
.endr
vmovdqa 5*16(p_keys), xkeyA
.set i, 0
.rept by
club XDATA, i
.if (klen == KEY_128)
vaesenc xkeyB, var_xdata, var_xdata
.else
vaesenc xkey4, var_xdata, var_xdata
.endif
.set i, (i +1)
.endr
.if (klen == KEY_128)
.if (load_keys)
vmovdqa 6*16(p_keys), xkey8
.endif
.else
vmovdqa 6*16(p_keys), xkeyB
.endif
.set i, 0
.rept by
club XDATA, i
vaesenc xkeyA, var_xdata, var_xdata
.set i, (i +1)
.endr
vmovdqa 7*16(p_keys), xkeyA
.set i, 0
.rept by
club XDATA, i
.if (klen == KEY_128)
vaesenc xkey8, var_xdata, var_xdata
.else
vaesenc xkeyB, var_xdata, var_xdata
.endif
.set i, (i +1)
.endr
.if (klen == KEY_128)
vmovdqa 8*16(p_keys), xkeyB
.else
.if (load_keys)
vmovdqa 8*16(p_keys), xkey8
.endif
.endif
.set i, 0
.rept by
club XDATA, i
vaesenc xkeyA, var_xdata, var_xdata
.set i, (i +1)
.endr
.if (klen == KEY_128)
.if (load_keys)
vmovdqa 9*16(p_keys), xkey12
.endif
.else
vmovdqa 9*16(p_keys), xkeyA
.endif
.set i, 0
.rept by
club XDATA, i
.if (klen == KEY_128)
vaesenc xkeyB, var_xdata, var_xdata
.else
vaesenc xkey8, var_xdata, var_xdata
.endif
.set i, (i +1)
.endr
vmovdqa 10*16(p_keys), xkeyB
.set i, 0
.rept by
club XDATA, i
.if (klen == KEY_128)
vaesenc xkey12, var_xdata, var_xdata
.else
vaesenc xkeyA, var_xdata, var_xdata
.endif
.set i, (i +1)
.endr
.if (klen != KEY_128)
vmovdqa 11*16(p_keys), xkeyA
.endif
.set i, 0
.rept by
club XDATA, i
.if (klen == KEY_128)
vaesenclast xkeyB, var_xdata, var_xdata
.else
vaesenc xkeyB, var_xdata, var_xdata
.endif
.set i, (i +1)
.endr
.if (klen != KEY_128)
.if (load_keys)
vmovdqa 12*16(p_keys), xkey12
.endif
.set i, 0
.rept by
club XDATA, i
vaesenc xkeyA, var_xdata, var_xdata
.set i, (i +1)
.endr
.if (klen == KEY_256)
vmovdqa 13*16(p_keys), xkeyA
.endif
.set i, 0
.rept by
club XDATA, i
.if (klen == KEY_256)
vaesenc xkey12, var_xdata, var_xdata
.else
vaesenclast xkey12, var_xdata, var_xdata
.endif
.set i, (i +1)
.endr
.if (klen == KEY_256)
vmovdqa 14*16(p_keys), xkeyB
.set i, 0
.rept by
club XDATA, i
vaesenc xkeyA, var_xdata, var_xdata
.set i, (i +1)
.endr
.set i, 0
.rept by
club XDATA, i
vaesenclast xkeyB, var_xdata, var_xdata
.set i, (i +1)
.endr
.endif
.endif
.set i, 0
.rept (by / 2)
.set j, (i+1)
VMOVDQ (i*16 - 16*by)(p_in), xkeyA
VMOVDQ (j*16 - 16*by)(p_in), xkeyB
club XDATA, i
vpxor xkeyA, var_xdata, var_xdata
club XDATA, j
vpxor xkeyB, var_xdata, var_xdata
.set i, (i+2)
.endr
.if (i < by)
VMOVDQ (i*16 - 16*by)(p_in), xkeyA
club XDATA, i
vpxor xkeyA, var_xdata, var_xdata
.endif
.set i, 0
.rept by
club XDATA, i
VMOVDQ var_xdata, i*16(p_out)
.set i, (i+1)
.endr
.endm
.macro do_aes_load val, key_len, xctr
do_aes \val, 1, \key_len, \xctr
.endm
.macro do_aes_noload val, key_len, xctr
do_aes \val, 0, \key_len, \xctr
.endm
.macro do_aes_ctrmain key_len, xctr
cmp $16, num_bytes
jb .Ldo_return2\xctr\key_len
.if \xctr
shr $4, counter
vmovdqu (p_iv), xiv
.else
vmovdqa byteswap_const(%rip), xbyteswap
vmovdqu (p_iv), xcounter
vpshufb xbyteswap, xcounter, xcounter
.endif
mov num_bytes, tmp
and $(7*16), tmp
jz .Lmult_of_8_blks\xctr\key_len
cmp $(4*16), tmp
jg .Lgt4\xctr\key_len
je .Leq4\xctr\key_len
.Llt4\xctr\key_len:
cmp $(2*16), tmp
jg .Leq3\xctr\key_len
je .Leq2\xctr\key_len
.Leq1\xctr\key_len:
do_aes_load 1, \key_len, \xctr
add $(1*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq2\xctr\key_len:
do_aes_load 2, \key_len, \xctr
add $(2*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq3\xctr\key_len:
do_aes_load 3, \key_len, \xctr
add $(3*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq4\xctr\key_len:
do_aes_load 4, \key_len, \xctr
add $(4*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Lgt4\xctr\key_len:
cmp $(6*16), tmp
jg .Leq7\xctr\key_len
je .Leq6\xctr\key_len
.Leq5\xctr\key_len:
do_aes_load 5, \key_len, \xctr
add $(5*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq6\xctr\key_len:
do_aes_load 6, \key_len, \xctr
add $(6*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq7\xctr\key_len:
do_aes_load 7, \key_len, \xctr
add $(7*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Lmult_of_8_blks\xctr\key_len:
.if (\key_len != KEY_128)
vmovdqa 0*16(p_keys), xkey0
vmovdqa 4*16(p_keys), xkey4
vmovdqa 8*16(p_keys), xkey8
vmovdqa 12*16(p_keys), xkey12
.else
vmovdqa 0*16(p_keys), xkey0
vmovdqa 3*16(p_keys), xkey4
vmovdqa 6*16(p_keys), xkey8
vmovdqa 9*16(p_keys), xkey12
.endif
.align 16
.Lmain_loop2\xctr\key_len:
do_aes_noload 8, \key_len, \xctr
add $(8*16), p_out
sub $(8*16), num_bytes
jne .Lmain_loop2\xctr\key_len
.Ldo_return2\xctr\key_len:
.if !\xctr
vpshufb xbyteswap, xcounter, xcounter
vmovdqu xcounter, (p_iv)
.endif
RET
.endm
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
do_aes_ctrmain KEY_128 0
SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
do_aes_ctrmain KEY_192 0
SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
do_aes_ctrmain KEY_256 0
SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
do_aes_ctrmain KEY_128 1
SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
do_aes_ctrmain KEY_192 1
SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
do_aes_ctrmain KEY_256 1
SYM_FUNC_END(aes_xctr_enc_256_avx_by8)