#include "../common/portability_macros.h"
#if defined(__ELF__) && defined(__GNUC__)
.section .note.GNU-stack,"",%progbits
#if defined(__aarch64__)
.pushsection .note.gnu.property, "a"
.p2align 3
.long 4
.long 0x10
.long 0x5
.asciz "GNU"
.long 0xc0000000
.long 4
.long 3
.p2align 3
.popsection
#endif
#endif
#if ZSTD_ENABLE_ASM_X86_64_BMI2
ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
.text
#define op0 rsi
#define op1 rbx
#define op2 rcx
#define op3 rdi
#define ip0 r8
#define ip1 r9
#define ip2 r10
#define ip3 r11
#define bits0 rbp
#define bits1 rdx
#define bits2 r12
#define bits3 r13
#define dtable r14
#define olimit r15
#define var0 r15
#define var1 r9
#define var2 r10
#define var3 r11
#define vard0 r15d
#define vard1 r9d
#define vard2 r10d
#define vard3 r11d
#define FOR_EACH_STREAM(X) \
X(0); \
X(1); \
X(2); \
X(3)
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
X(0, idx); \
X(1, idx); \
X(2, idx); \
X(3, idx)
_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
ZSTD_CET_ENDBRANCH
push %rax
push %rbx
push %rcx
push %rdx
push %rbp
push %rsi
push %rdi
push %r8
push %r9
push %r10
push %r11
push %r12
push %r13
push %r14
push %r15
movq %rdi, %rax
movq 0(%rax), %ip0
movq 8(%rax), %ip1
movq 16(%rax), %ip2
movq 24(%rax), %ip3
movq 32(%rax), %op0
movq 40(%rax), %op1
movq 48(%rax), %op2
movq 56(%rax), %op3
movq 64(%rax), %bits0
movq 72(%rax), %bits1
movq 80(%rax), %bits2
movq 88(%rax), %bits3
movq 96(%rax), %dtable
push %rax
push 104(%rax)
push 112(%rax)
push %olimit
subq $24, %rsp
.L_4X1_compute_olimit:
movq %rbx, 0(%rsp)
movq %rdx, 8(%rsp)
movq 32(%rsp), %rax
subq %op3, %rax
movabsq $-3689348814741910323, %rdx
mulq %rdx
movq %rdx, %r15
shrq $2, %r15
movq %ip0, %rax
movq 40(%rsp), %rdx
subq %rdx, %rax
movq %rax, %rbx
movabsq $2635249153387078803, %rdx
mulq %rdx
subq %rdx, %rbx
shrq %rbx
addq %rbx, %rdx
shrq $2, %rdx
cmpq %rdx, %r15
cmova %rdx, %r15
leaq (%r15, %r15, 4), %r15
addq %op3, %olimit
movq 8(%rsp), %rdx
movq 0(%rsp), %rbx
movq %op3, %rax
cmpq %rax, %olimit
je .L_4X1_exit
cmpq %ip0, %ip1
jb .L_4X1_exit
cmpq %ip1, %ip2
jb .L_4X1_exit
cmpq %ip2, %ip3
jb .L_4X1_exit
#define GET_NEXT_DELT(n) \
movq $53, %var##n; \
shrxq %var##n, %bits##n, %var##n; \
movzwl (%dtable,%var##n,2),%vard##n
#define DECODE_FROM_DELT(n, idx) \
movq %var##n, %rax; \
shlxq %var##n, %bits##n, %bits##n; \
movb %ah, idx(%op##n)
#define DECODE_AND_GET_NEXT(n, idx) \
DECODE_FROM_DELT(n, idx); \
GET_NEXT_DELT(n) \
#define RELOAD_BITS(n) \
bsfq %bits##n, %bits##n; \
movq %bits##n, %rax; \
andq $7, %rax; \
shrq $3, %bits##n; \
leaq 5(%op##n), %op##n; \
subq %bits##n, %ip##n; \
movq (%ip##n), %bits##n; \
orq $1, %bits##n; \
shlx %rax, %bits##n, %bits##n
movq %olimit, 24(%rsp)
movq %ip1, 0(%rsp)
movq %ip2, 8(%rsp)
movq %ip3, 16(%rsp)
FOR_EACH_STREAM(GET_NEXT_DELT)
.p2align 6
.L_4X1_loop_body:
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
movq 0(%rsp), %ip1
movq 8(%rsp), %ip2
movq 16(%rsp), %ip3
RELOAD_BITS(0)
GET_NEXT_DELT(0)
RELOAD_BITS(1)
movq %ip1, 0(%rsp)
GET_NEXT_DELT(1)
RELOAD_BITS(2)
movq %ip2, 8(%rsp)
GET_NEXT_DELT(2)
RELOAD_BITS(3)
movq %ip3, 16(%rsp)
GET_NEXT_DELT(3)
cmp %op3, 24(%rsp)
ja .L_4X1_loop_body
movq 0(%rsp), %ip1
movq 8(%rsp), %ip2
movq 16(%rsp), %ip3
jmp .L_4X1_compute_olimit
#undef GET_NEXT_DELT
#undef DECODE_FROM_DELT
#undef DECODE
#undef RELOAD_BITS
.L_4X1_exit:
addq $24, %rsp
pop %rax
pop %rax
pop %rax
pop %rax
movq %ip0, 0(%rax)
movq %ip1, 8(%rax)
movq %ip2, 16(%rax)
movq %ip3, 24(%rax)
movq %op0, 32(%rax)
movq %op1, 40(%rax)
movq %op2, 48(%rax)
movq %op3, 56(%rax)
movq %bits0, 64(%rax)
movq %bits1, 72(%rax)
movq %bits2, 80(%rax)
movq %bits3, 88(%rax)
pop %r15
pop %r14
pop %r13
pop %r12
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rsi
pop %rbp
pop %rdx
pop %rcx
pop %rbx
pop %rax
ret
_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
ZSTD_CET_ENDBRANCH
push %rax
push %rbx
push %rcx
push %rdx
push %rbp
push %rsi
push %rdi
push %r8
push %r9
push %r10
push %r11
push %r12
push %r13
push %r14
push %r15
movq %rdi, %rax
movq 0(%rax), %ip0
movq 8(%rax), %ip1
movq 16(%rax), %ip2
movq 24(%rax), %ip3
movq 32(%rax), %op0
movq 40(%rax), %op1
movq 48(%rax), %op2
movq 56(%rax), %op3
movq 64(%rax), %bits0
movq 72(%rax), %bits1
movq 80(%rax), %bits2
movq 88(%rax), %bits3
movq 96(%rax), %dtable
push %rax
push %rax
push 104(%rax)
movq 112(%rax), %rax
push %rax
movq %op3, %rax
push %rax
movq %op2, %rax
push %rax
movq %op1, %rax
push %rax
subq $8, %rsp
.L_4X2_compute_olimit:
movq %rdx, 0(%rsp)
movq %ip0, %rax
movq 40(%rsp), %rdx
subq %rdx, %rax
movq %rax, %r15
movabsq $2635249153387078803, %rdx
mulq %rdx
subq %rdx, %r15
shrq %r15
addq %r15, %rdx
shrq $2, %rdx
movq %rdx, %r15
movq 8(%rsp), %rax
subq %op0, %rax
movq 16(%rsp), %rdx
subq %op1, %rdx
cmpq %rax, %rdx
cmova %rax, %rdx
movq 24(%rsp), %rax
subq %op2, %rax
cmpq %rax, %rdx
cmova %rax, %rdx
movq 32(%rsp), %rax
subq %op3, %rax
cmpq %rax, %rdx
cmova %rax, %rdx
movabsq $-3689348814741910323, %rax
mulq %rdx
shrq $3, %rdx
cmpq %rdx, %r15
cmova %rdx, %r15
movq %r15, %rax
leaq (%op3, %rax, 4), %olimit
addq %rax, %olimit
movq 0(%rsp), %rdx
movq %op3, %rax
cmpq %rax, %olimit
je .L_4X2_exit
cmpq %ip0, %ip1
jb .L_4X2_exit
cmpq %ip1, %ip2
jb .L_4X2_exit
cmpq %ip2, %ip3
jb .L_4X2_exit
#define DECODE(n, idx) \
movq %bits##n, %rax; \
shrq $53, %rax; \
movzwl 0(%dtable,%rax,4),%r8d; \
movzbl 2(%dtable,%rax,4),%r15d; \
movzbl 3(%dtable,%rax,4),%eax; \
movw %r8w, (%op##n); \
shlxq %r15, %bits##n, %bits##n; \
addq %rax, %op##n
#define RELOAD_BITS(n) \
bsfq %bits##n, %bits##n; \
movq %bits##n, %rax; \
shrq $3, %bits##n; \
andq $7, %rax; \
subq %bits##n, %ip##n; \
movq (%ip##n), %bits##n; \
orq $1, %bits##n; \
shlxq %rax, %bits##n, %bits##n
movq %olimit, 48(%rsp)
.p2align 6
.L_4X2_loop_body:
movq %r8, 0(%rsp)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
movq 0(%rsp), %r8
FOR_EACH_STREAM(RELOAD_BITS)
cmp %op3, 48(%rsp)
ja .L_4X2_loop_body
jmp .L_4X2_compute_olimit
#undef DECODE
#undef RELOAD_BITS
.L_4X2_exit:
addq $8, %rsp
pop %rax
pop %rax
pop %rax
pop %rax
pop %rax
pop %rax
pop %rax
movq %ip0, 0(%rax)
movq %ip1, 8(%rax)
movq %ip2, 16(%rax)
movq %ip3, 24(%rax)
movq %op0, 32(%rax)
movq %op1, 40(%rax)
movq %op2, 48(%rax)
movq %op3, 56(%rax)
movq %bits0, 64(%rax)
movq %bits1, 72(%rax)
movq %bits2, 80(%rax)
movq %bits3, 88(%rax)
pop %r15
pop %r14
pop %r13
pop %r12
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rsi
pop %rbp
pop %rdx
pop %rcx
pop %rbx
pop %rax
ret
#endif