// AES-XTS en/decryption for x86_64, implemented with AES-NI + AVX, and with
// VAES + VPCLMULQDQ using 256-bit or 512-bit vectors where the assembler
// supports them.
#include <linux/linkage.h>
#include <linux/cfi_types.h>
.section .rodata
.p2align 4
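// The low 64 bits of this constant are the GF(2^128) reduction term
// x^7 + x^2 + x + 1 (0x87), which is XOR'd into the low 64 bits of a tweak
// whenever a 1 is carried out of its high 64 bits.  The high 64 bits are the
// carry from the low half of the tweak into the high half.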
.Lgf_poly:
.quad 0x87, 1
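// Table of byte indices for vpshufb, used by the ciphertext stealing (CTS)
// code to do variable-length byte shifts and to build the vpblendvb mask on
// CPUs without AVX512/AVX10 masking support.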
.Lcts_permute_table:
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.text
// Function parameters
.set KEY, %rdi		// AES key context; later advanced so that 7*16(KEY) is the last round key
.set SRC, %rsi		// Pointer to next source data
.set DST, %rdx		// Pointer to next destination data
.set LEN, %ecx		// Remaining length in bytes
.set LEN8, %cl
.set LEN64, %rcx
.set TWEAK, %r8		// Pointer to next tweak
// KEYLEN holds the AES key length in bytes (16, 24, or 32).
.set KEYLEN, %eax
.set KEYLEN64, %rax
// %r9-%r11 are available as temporaries.
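// Define V\i as the 128-bit, 256-bit, or 512-bit SIMD register \i, according
// to the vector length VL selected for this instantiation of the code.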
.macro _define_Vi i
.if VL == 16
.set V\i, %xmm\i
.elseif VL == 32
.set V\i, %ymm\i
.elseif VL == 64
.set V\i, %zmm\i
.else
.error "Unsupported Vector Length (VL)"
.endif
.endm
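// Define all the register aliases used by the AES-XTS code for the selected
// vector length: the V0-V15 aliases (and V16-V31 when all 32 SIMD registers
// are usable), plus named aliases for the tweaks, the GF polynomial, and the
// cached round keys.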
.macro _define_aliases
_define_Vi 0
_define_Vi 1
_define_Vi 2
_define_Vi 3
_define_Vi 4
_define_Vi 5
_define_Vi 6
_define_Vi 7
_define_Vi 8
_define_Vi 9
_define_Vi 10
_define_Vi 11
_define_Vi 12
_define_Vi 13
_define_Vi 14
_define_Vi 15
.if USE_AVX10
_define_Vi 16
_define_Vi 17
_define_Vi 18
_define_Vi 19
_define_Vi 20
_define_Vi 21
_define_Vi 22
_define_Vi 23
_define_Vi 24
_define_Vi 25
_define_Vi 26
_define_Vi 27
_define_Vi 28
_define_Vi 29
_define_Vi 30
_define_Vi 31
.endif
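// V0-V3 hold the data blocks during the main loop; V4-V5 are temporaries.
// V6-V9 hold the current set of XTS tweaks and V10-V13 the next set, one
// tweak per 128-bit lane.  V14 holds the .Lgf_poly constant and V15 the
// zero-th round key, each broadcast to all lanes.  When 32 SIMD registers
// are available, V16-V29 cache the remaining round keys.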
.set TWEAK0_XMM, %xmm6
.set TWEAK0, V6
.set TWEAK1_XMM, %xmm7
.set TWEAK1, V7
.set TWEAK2, V8
.set TWEAK3, V9
.set NEXT_TWEAK0, V10
.set NEXT_TWEAK1, V11
.set NEXT_TWEAK2, V12
.set NEXT_TWEAK3, V13
.set GF_POLY_XMM, %xmm14
.set GF_POLY, V14
.set KEY0_XMM, %xmm15
.set KEY0, V15
.if USE_AVX10
.set KEY1_XMM, %xmm16
.set KEY1, V16
.set KEY2_XMM, %xmm17
.set KEY2, V17
.set KEY3_XMM, %xmm18
.set KEY3, V18
.set KEY4_XMM, %xmm19
.set KEY4, V19
.set KEY5_XMM, %xmm20
.set KEY5, V20
.set KEY6_XMM, %xmm21
.set KEY6, V21
.set KEY7_XMM, %xmm22
.set KEY7, V22
.set KEY8_XMM, %xmm23
.set KEY8, V23
.set KEY9_XMM, %xmm24
.set KEY9, V24
.set KEY10_XMM, %xmm25
.set KEY10, V25
.set KEY11_XMM, %xmm26
.set KEY11, V26
.set KEY12_XMM, %xmm27
.set KEY12, V27
.set KEY13_XMM, %xmm28
.set KEY13, V28
.set KEY14_XMM, %xmm29
.set KEY14, V29
.endif
.endm
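// Move a vector between memory and a register.  The EVEX-encoded vmovdqu8 is
// required for 512-bit vectors; plain vmovdqu suffices otherwise.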
.macro _vmovdqu src, dst
.if VL < 64
vmovdqu \src, \dst
.else
vmovdqu8 \src, \dst
.endif
.endm
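// Broadcast a 128-bit value from memory into every 128-bit lane of a vector.
// For VL == 16 this is just a load.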
.macro _vbroadcast128 src, dst
.if VL == 16 && !USE_AVX10
vmovdqu \src, \dst
.elseif VL == 32 && !USE_AVX10
vbroadcasti128 \src, \dst
.else
vbroadcasti32x4 \src, \dst
.endif
.endm
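// XOR two vectors together.  vpxord is the EVEX-encoded form needed for
// 512-bit vectors and the extended registers; vpxor has only a VEX encoding.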
.macro _vpxor src1, src2, dst
.if USE_AVX10
vpxord \src1, \src2, \dst
.else
vpxor \src1, \src2, \dst
.endif
.endm
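// XOR three vectors together into \src3_and_dst.  With AVX512/AVX10, a single
// vpternlogd with immediate 0x96 (a three-way XOR truth table) does the job.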
.macro _xor3 src1, src2, src3_and_dst
.if USE_AVX10
vpternlogd $0x96, \src1, \src2, \src3_and_dst
.else
vpxor \src1, \src3_and_dst, \src3_and_dst
vpxor \src2, \src3_and_dst, \src3_and_dst
.endif
.endm
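// Given the 128-bit XTS tweak in the xmm register \src, compute the next
// tweak (i.e. multiply by x in GF(2^128) modulo the XTS polynomial) and write
// it to \dst, clobbering \tmp.  vpaddq doubles each 64-bit half; the
// vpshufd/vpsrad/vpand sequence turns the two carried-out bits into the XOR
// of 0x87 into the low half and of 1 into the high half, per .Lgf_poly.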
.macro _next_tweak src, tmp, dst
vpshufd $0x13, \src, \tmp
vpaddq \src, \src, \dst
vpsrad $31, \tmp, \tmp
vpand GF_POLY_XMM, \tmp, \tmp
vpxor \tmp, \dst, \dst
.endm
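// Given a vector of XTS tweaks in \src (one per 128-bit lane), compute the
// vector of tweaks VL/16 positions later (i.e. multiply each by x^(VL/16))
// and write it to \dst, clobbering \tmp1 and \tmp2.  For VL > 16 the GF
// reduction is done for all lanes at once with vpclmulqdq; for VL == 16 the
// plain _next_tweak method is used.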
.macro _next_tweakvec src, tmp1, tmp2, dst
.if VL == 16
_next_tweak \src, \tmp1, \dst
.else
vpsrlq $64 - VL/16, \src, \tmp1
vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2
vpslldq $8, \tmp1, \tmp1
vpsllq $VL/16, \src, \dst
_xor3 \tmp1, \tmp2, \dst
.endif
.endm
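// Load the starting tweak from *TWEAK and compute the first 4*VL/16 tweaks,
// leaving them in TWEAK0-TWEAK3 with one tweak per 128-bit lane.  For
// VL == 16 this is just three applications of _next_tweak; for larger VL, the
// lanes of TWEAK0 are filled with consecutive tweaks first, then
// TWEAK1-TWEAK3 are derived from TWEAK0 as a whole (see below).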
.macro _compute_first_set_of_tweaks
vmovdqu (TWEAK), TWEAK0_XMM
_vbroadcast128 .Lgf_poly(%rip), GF_POLY
.if VL == 16
_next_tweak TWEAK0, %xmm0, TWEAK1
_next_tweak TWEAK1, %xmm0, TWEAK2
_next_tweak TWEAK2, %xmm0, TWEAK3
.else
.if VL == 32
_next_tweak TWEAK0_XMM, %xmm0, %xmm1
vinserti128 $1, %xmm1, TWEAK0, TWEAK0
.elseif VL == 64
_next_tweak TWEAK0_XMM, %xmm0, %xmm1
_next_tweak %xmm1, %xmm0, %xmm2
_next_tweak %xmm2, %xmm0, %xmm3
vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0
vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0
vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0
.endif
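// Compute TWEAK1-TWEAK3 by multiplying the tweaks in TWEAK0 by x^(VL/16),
// x^(2*VL/16), and x^(3*VL/16) respectively, using vpclmulqdq with the
// .Lgf_poly constant to fold back the bits shifted out of each 64-bit half.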
vpsrlq $64 - 1*VL/16, TWEAK0, V0
vpsrlq $64 - 2*VL/16, TWEAK0, V2
vpsrlq $64 - 3*VL/16, TWEAK0, V4
vpclmulqdq $0x01, GF_POLY, V0, V1
vpclmulqdq $0x01, GF_POLY, V2, V3
vpclmulqdq $0x01, GF_POLY, V4, V5
vpslldq $8, V0, V0
vpslldq $8, V2, V2
vpslldq $8, V4, V4
vpsllq $1*VL/16, TWEAK0, TWEAK1
vpsllq $2*VL/16, TWEAK0, TWEAK2
vpsllq $3*VL/16, TWEAK0, TWEAK3
.if USE_AVX10
vpternlogd $0x96, V0, V1, TWEAK1
vpternlogd $0x96, V2, V3, TWEAK2
vpternlogd $0x96, V4, V5, TWEAK3
.else
vpxor V0, TWEAK1, TWEAK1
vpxor V2, TWEAK2, TWEAK2
vpxor V4, TWEAK3, TWEAK3
vpxor V1, TWEAK1, TWEAK1
vpxor V3, TWEAK2, TWEAK2
vpxor V5, TWEAK3, TWEAK3
.endif
.endif
.endm
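// Do one step (step \i) of computing the next set of tweaks for VL == 16.
// The next four tweaks are produced by repeatedly multiplying the last
// current tweak (TWEAK3) by x, using the same method as _next_tweak, with the
// work spread over steps 0-19 so it can be interleaved with the AES rounds.
// Step 1000 commits NEXT_TWEAK0-3 into TWEAK0-3.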
.macro _tweak_step_mulx i
.if \i == 0
.set PREV_TWEAK, TWEAK3
.set NEXT_TWEAK, NEXT_TWEAK0
.elseif \i == 5
.set PREV_TWEAK, NEXT_TWEAK0
.set NEXT_TWEAK, NEXT_TWEAK1
.elseif \i == 10
.set PREV_TWEAK, NEXT_TWEAK1
.set NEXT_TWEAK, NEXT_TWEAK2
.elseif \i == 15
.set PREV_TWEAK, NEXT_TWEAK2
.set NEXT_TWEAK, NEXT_TWEAK3
.endif
.if \i >= 0 && \i < 20 && \i % 5 == 0
vpshufd $0x13, PREV_TWEAK, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 1
vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
.elseif \i >= 0 && \i < 20 && \i % 5 == 2
vpsrad $31, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 3
vpand GF_POLY, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 4
vpxor V5, NEXT_TWEAK, NEXT_TWEAK
.elseif \i == 1000
vmovdqa NEXT_TWEAK0, TWEAK0
vmovdqa NEXT_TWEAK1, TWEAK1
vmovdqa NEXT_TWEAK2, TWEAK2
vmovdqa NEXT_TWEAK3, TWEAK3
.endif
.endm
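// Do one step (step \i) of computing the next set of tweaks for VL > 16,
// where each tweak must advance by 4*VL/16 positions, i.e. be multiplied by
// x^(4*VL/16).  The high bits shifted out of each lane are reduced with
// vpclmulqdq against .Lgf_poly during steps 0-14, and step 1000 does the
// final shift and XOR.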
.macro _tweak_step_pclmul i
.if \i == 0
vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
.elseif \i == 2
vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
.elseif \i == 4
vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
.elseif \i == 6
vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
.elseif \i == 8
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
.elseif \i == 10
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
.elseif \i == 12
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
.elseif \i == 14
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
.elseif \i == 1000
vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0
vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1
vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2
vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3
_vpxor NEXT_TWEAK0, TWEAK0, TWEAK0
_vpxor NEXT_TWEAK1, TWEAK1, TWEAK1
_vpxor NEXT_TWEAK2, TWEAK2, TWEAK2
_vpxor NEXT_TWEAK3, TWEAK3, TWEAK3
.endif
.endm
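// Do one step of computing the next set of tweaks from TWEAK0-3, dispatching
// on the vector length.  To complete the computation, this must be invoked
// with increasing values of \i that include at least 0 through 19, and then
// 1000 for the final step.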
.macro _tweak_step i
.if VL == 16
_tweak_step_mulx \i
.else
_tweak_step_pclmul \i
.endif
.endm
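// Load the round keys: the zero-th round key into KEY0 and, when all 32 SIMD
// registers are available, the remaining round keys into KEY1-KEY14.  \enc
// selects between the encryption key schedule (offset 0 in the key context)
// and the decryption key schedule (offset 240).  KEY is also advanced so that
// 7*16(KEY) is the last round key regardless of AES key length, which keeps
// the round key offsets short and lets AES-128/AES-192 simply skip the rounds
// they don't use.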
.macro _setup_round_keys enc
.if \enc
.set OFFS, 0
.else
.set OFFS, 240
.endif
_vbroadcast128 OFFS(KEY), KEY0
lea OFFS-16(KEY, KEYLEN64, 4), KEY
.if USE_AVX10
cmp $24, KEYLEN
jl .Laes128\@
je .Laes192\@
_vbroadcast128 -6*16(KEY), KEY1
_vbroadcast128 -5*16(KEY), KEY2
.Laes192\@:
_vbroadcast128 -4*16(KEY), KEY3
_vbroadcast128 -3*16(KEY), KEY4
.Laes128\@:
_vbroadcast128 -2*16(KEY), KEY5
_vbroadcast128 -1*16(KEY), KEY6
_vbroadcast128 0*16(KEY), KEY7
_vbroadcast128 1*16(KEY), KEY8
_vbroadcast128 2*16(KEY), KEY9
_vbroadcast128 3*16(KEY), KEY10
_vbroadcast128 4*16(KEY), KEY11
_vbroadcast128 5*16(KEY), KEY12
_vbroadcast128 6*16(KEY), KEY13
_vbroadcast128 7*16(KEY), KEY14
.endif
.endm
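// Do a single AES round (vaesenc/vaesdec, or the *last variants when \last is
// set) on the block(s) in \data using the round key(s) in \key.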
.macro _vaes enc, last, key, data
.if \enc
.if \last
vaesenclast \key, \data, \data
.else
vaesenc \key, \data, \data
.endif
.else
.if \last
vaesdeclast \key, \data, \data
.else
vaesdec \key, \data, \data
.endif
.endif
.endm
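// Do round \i of AES on the data in \data, which is either a full vector or,
// when \xmm_suffix is _XMM, a single xmm register.  The round key comes from
// the cached KEY\i register when USE_AVX10, otherwise from (\i-7)*16(KEY),
// broadcast through V4 in the full-vector case.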
.macro _vaes_1x enc, last, i, xmm_suffix, data
.if USE_AVX10
_vaes \enc, \last, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
_vaes \enc, \last, (\i-7)*16(KEY), \data
.else
_vbroadcast128 (\i-7)*16(KEY), V4
_vaes \enc, \last, V4, \data
.endif
.endif
.endm
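// Do round \i of AES on the four data vectors V0-V3, using the same round key
// for all of them, and interleave two steps of the next-tweak computation.
// May clobber V4.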
.macro _vaes_4x enc, last, i
.if USE_AVX10
_tweak_step (2*(\i-5))
_vaes \enc, \last, KEY\i, V0
_vaes \enc, \last, KEY\i, V1
_tweak_step (2*(\i-5) + 1)
_vaes \enc, \last, KEY\i, V2
_vaes \enc, \last, KEY\i, V3
.else
_vbroadcast128 (\i-7)*16(KEY), V4
_tweak_step (2*(\i-5))
_vaes \enc, \last, V4, V0
_vaes \enc, \last, V4, V1
_tweak_step (2*(\i-5) + 1)
_vaes \enc, \last, V4, V2
_vaes \enc, \last, V4, V3
.endif
.endm
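// Do the full AES-XTS en/decryption of the single vector (or, with
// \xmm_suffix set to _XMM, the single 128-bit block) in \data: XOR with the
// tweak and the zero-th round key, run the AES rounds appropriate for the key
// length, then XOR with the tweak again.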
.macro _aes_crypt enc, xmm_suffix, tweak, data
_xor3 KEY0\xmm_suffix, \tweak, \data
cmp $24, KEYLEN
jl .Laes128\@
je .Laes192\@
_vaes_1x \enc, 0, 1, \xmm_suffix, \data
_vaes_1x \enc, 0, 2, \xmm_suffix, \data
.Laes192\@:
_vaes_1x \enc, 0, 3, \xmm_suffix, \data
_vaes_1x \enc, 0, 4, \xmm_suffix, \data
.Laes128\@:
_vaes_1x \enc, 0, 5, \xmm_suffix, \data
_vaes_1x \enc, 0, 6, \xmm_suffix, \data
_vaes_1x \enc, 0, 7, \xmm_suffix, \data
_vaes_1x \enc, 0, 8, \xmm_suffix, \data
_vaes_1x \enc, 0, 9, \xmm_suffix, \data
_vaes_1x \enc, 0, 10, \xmm_suffix, \data
_vaes_1x \enc, 0, 11, \xmm_suffix, \data
_vaes_1x \enc, 0, 12, \xmm_suffix, \data
_vaes_1x \enc, 0, 13, \xmm_suffix, \data
_vaes_1x \enc, 1, 14, \xmm_suffix, \data
_vpxor \tweak, \data, \data
.endm
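// Main AES-XTS en/decryption macro, instantiated once per combination of
// vector length and register set.  It expects its arguments in the registers
// aliased above (KEY, SRC, DST, LEN, TWEAK).  The main loop handles 4*VL
// bytes per iteration; leftover full vectors and then full blocks are handled
// one at a time, and a final partial block is handled with ciphertext
// stealing.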
.macro _aes_xts_crypt enc
_define_aliases
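// When decrypting a message whose length isn't a multiple of the AES block
// size, exclude the last full block from the main loop, since ciphertext
// stealing decryption uses the last two tweaks in reverse order.  That block
// and the partial block are handled at the end.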
.if !\enc
lea -16(LEN), %eax
test $15, LEN8
cmovnz %eax, LEN
.endif
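// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).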
movl 480(KEY), KEYLEN
_setup_round_keys \enc
_compute_first_set_of_tweaks
sub $4*VL, LEN
jl .Lhandle_remainder\@
.Lmain_loop\@:
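// This is the main loop, en/decrypting 4*VL bytes per iteration.
// First XOR each source block with its tweak and the zero-th round key; with
// AVX512/AVX10 the two XORs are folded into a single vpternlogd per block.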
.if USE_AVX10
vmovdqu8 0*VL(SRC), V0
vmovdqu8 1*VL(SRC), V1
vmovdqu8 2*VL(SRC), V2
vmovdqu8 3*VL(SRC), V3
vpternlogd $0x96, TWEAK0, KEY0, V0
vpternlogd $0x96, TWEAK1, KEY0, V1
vpternlogd $0x96, TWEAK2, KEY0, V2
vpternlogd $0x96, TWEAK3, KEY0, V3
.else
vpxor 0*VL(SRC), KEY0, V0
vpxor 1*VL(SRC), KEY0, V1
vpxor 2*VL(SRC), KEY0, V2
vpxor 3*VL(SRC), KEY0, V3
vpxor TWEAK0, V0, V0
vpxor TWEAK1, V1, V1
vpxor TWEAK2, V2, V2
vpxor TWEAK3, V3, V3
.endif
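// Do all the AES rounds on the data blocks, interleaved with the computation
// of the next set of tweaks.  AES-128 and AES-192 skip the early rounds they
// don't need.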
cmp $24, KEYLEN
jl .Laes128\@
je .Laes192\@
_vaes_4x \enc, 0, 1
_vaes_4x \enc, 0, 2
.Laes192\@:
_vaes_4x \enc, 0, 3
_vaes_4x \enc, 0, 4
.Laes128\@:
_vaes_4x \enc, 0, 5
_vaes_4x \enc, 0, 6
_vaes_4x \enc, 0, 7
_vaes_4x \enc, 0, 8
_vaes_4x \enc, 0, 9
_vaes_4x \enc, 0, 10
_vaes_4x \enc, 0, 11
_vaes_4x \enc, 0, 12
_vaes_4x \enc, 0, 13
_vaes_4x \enc, 1, 14
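// XOR in the tweaks again, then store the destination blocks.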
_vpxor TWEAK0, V0, V0
_vpxor TWEAK1, V1, V1
_vpxor TWEAK2, V2, V2
_vpxor TWEAK3, V3, V3
_vmovdqu V0, 0*VL(DST)
_vmovdqu V1, 1*VL(DST)
_vmovdqu V2, 2*VL(DST)
_vmovdqu V3, 3*VL(DST)
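// Finish computing the next set of tweaks, then advance to the next 4*VL
// bytes.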
_tweak_step 1000
add $4*VL, SRC
add $4*VL, DST
sub $4*VL, LEN
jge .Lmain_loop\@
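// Check for the uncommon case where the data length isn't a multiple of 4*VL,
// and handle it out-of-line to keep the common path straight.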
test $4*VL-1, LEN8
jnz .Lhandle_remainder\@
.Ldone\@:
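// Store the next tweak back to *TWEAK before returning.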
vmovdqu TWEAK0_XMM, (TWEAK)
.if VL > 16
vzeroupper
.endif
RET
.Lhandle_remainder\@:
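// Handle the remaining (less than 4*VL) bytes: first any full vectors (only
// relevant when VL > 16), then any full 16-byte blocks, then a possible
// partial block via ciphertext stealing.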
.if VL > 16
add $3*VL, LEN		// Undo the extra sub of 4*VL, then sub VL.
jl .Lvec_at_a_time_done\@
.Lvec_at_a_time\@:
_vmovdqu (SRC), V0
_aes_crypt \enc, , TWEAK0, V0
_vmovdqu V0, (DST)
_next_tweakvec TWEAK0, V0, V1, TWEAK0
add $VL, SRC
add $VL, DST
sub $VL, LEN
jge .Lvec_at_a_time\@
.Lvec_at_a_time_done\@:
add $VL-16, LEN		// Undo the extra sub of VL, then sub 16.
.else
add $4*VL-16, LEN	// Undo the extra sub of 4*VL, then sub 16.
.endif
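// En/decrypt any remaining full blocks, one at a time.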
jl .Lblock_at_a_time_done\@
.Lblock_at_a_time\@:
vmovdqu (SRC), %xmm0
_aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
vmovdqu %xmm0, (DST)
_next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM
add $16, SRC
add $16, DST
sub $16, LEN
jge .Lblock_at_a_time\@
.Lblock_at_a_time_done\@:
add $16, LEN		// Undo the extra sub of 16.
jz .Ldone\@		// Done if the message length is a multiple of 16.
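// 1 to 15 bytes remain, so finish with ciphertext stealing (CTS).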
.if \enc
// When encrypting, the main/block loop already encrypted the last full block
// to produce the CTS intermediate ciphertext.  Rewind the pointers and reload
// that ciphertext.
sub $16, SRC
sub $16, DST
vmovdqu (DST), %xmm0
.else
// When decrypting, the last full block was deliberately left for now, since
// CTS decryption processes it with the later of the two remaining tweaks.
// Compute that tweak and decrypt the last full block with it.
_next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM
vmovdqu (SRC), %xmm0
_aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0
.endif
.if USE_AVX10
// Create a mask with the first LEN bits set.
mov $-1, %r9d
bzhi LEN, %r9d, %r9d
kmovd %r9d, %k1
// Swap the first LEN bytes of the last full block's en/decryption with the
// partial block.  The load from the src partial block must happen before the
// store to the dst partial block, to support in-place operation.
vmovdqa %xmm0, %xmm1
vmovdqu8 16(SRC), %xmm0{%k1}
vmovdqu8 %xmm1, 16(DST){%k1}
.else
lea .Lcts_permute_table(%rip), %r9
// Load the src partial block.  For in-place operation this must happen before
// the store to the dst partial block below.
vmovdqu (SRC, LEN64, 1), %xmm1
// Shift the first LEN bytes of the last full block's en/decryption to the end
// of a register, then store them as the dst partial block.
vpshufb (%r9, LEN64, 1), %xmm0, %xmm2
vmovdqu %xmm2, (DST, LEN64, 1)
// Make xmm3 contain [16-LEN, 16-LEN+1, ..., 15, 0x80, 0x80, ...].
sub LEN64, %r9
vmovdqu 32(%r9), %xmm3
// Left-align the src partial block, then blend in the remaining bytes of the
// last full block's en/decryption to form the block to en/decrypt next.
vpshufb %xmm3, %xmm1, %xmm1
vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
.endif
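// En/decrypt the combined block and store it as the final full block of
// output.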
_aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
vmovdqu %xmm0, (DST)
jmp .Ldone\@
.endm
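// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
//			   u8 iv[AES_BLOCK_SIZE]);
//
// Encrypt |iv| in place with the tweak key to produce the initial XTS tweak,
// using a single unrolled AES encryption (no XTS processing).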
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
vmovdqu (%rsi), %xmm0
vpxor (%rdi), %xmm0, %xmm0
movl 480(%rdi), %eax
lea -16(%rdi, %rax, 4), %rdi
cmp $24, %eax
jl .Lencrypt_iv_aes128
je .Lencrypt_iv_aes192
vaesenc -6*16(%rdi), %xmm0, %xmm0
vaesenc -5*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes192:
vaesenc -4*16(%rdi), %xmm0, %xmm0
vaesenc -3*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes128:
vaesenc -2*16(%rdi), %xmm0, %xmm0
vaesenc -1*16(%rdi), %xmm0, %xmm0
vaesenc 0*16(%rdi), %xmm0, %xmm0
vaesenc 1*16(%rdi), %xmm0, %xmm0
vaesenc 2*16(%rdi), %xmm0, %xmm0
vaesenc 3*16(%rdi), %xmm0, %xmm0
vaesenc 4*16(%rdi), %xmm0, %xmm0
vaesenc 5*16(%rdi), %xmm0, %xmm0
vaesenc 6*16(%rdi), %xmm0, %xmm0
vaesenclast 7*16(%rdi), %xmm0, %xmm0
vmovdqu %xmm0, (%rsi)
RET
SYM_FUNC_END(aes_xts_encrypt_iv)
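// Below are the actual AES-XTS en/decryption functions, instantiated from
// _aes_xts_crypt for each supported vector length.  They take the data key in
// %rdi, source and destination pointers in %rsi and %rdx, the length in %ecx,
// and a pointer to the current tweak in %r8; when the length is a multiple of
// 16, the tweak is updated in place for a subsequent call.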
.set VL, 16
.set USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
_aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
_aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
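// The wider instantiations require assembler support for VAES and VPCLMULQDQ.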
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set VL, 32
.set USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
_aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
_aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
.set VL, 32
.set USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
_aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
_aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
.set VL, 64
.set USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
_aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
_aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
#endif