#include <assert.h>
#include <immintrin.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_dsp/x86/txfm_common_avx2.h"
// Loader helper for an 8x8 block (per its name). `input` is read-only
// int16_t data; `out` receives __m256i vectors.
// NOTE(review): presumably walks `input` with `stride`, honours the
// `flipud` / `fliplr` flags and applies `shift` while loading -- confirm
// against the body.
static inline void load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
int stride, int flipud, int fliplr,
int shift) { … }
// Rounding helper for the 8x8 column pass (per its name); `in` is non-const.
// NOTE(review): presumably performs an in-place rounding shift of the vectors
// in `in` by `shift` bits -- confirm the direction and rounding mode in the
// body.
static inline void col_txfm_8x8_rounding(__m256i *in, int shift) { … }
// Loader helper for an 8x16 block (per its name). Same parameter list as
// load_buffer_8x8_avx2.
// NOTE(review): presumably loads the block as two 8x8 halves, taking
// `flipud` into account when ordering them -- confirm against the body.
static inline void load_buffer_8x16_avx2(const int16_t *input, __m256i *out,
int stride, int flipud, int fliplr,
int shift) { … }
// Loader helper for a 16-wide block of caller-specified `height`.
// Unlike the 8x8 / 8x16 loaders, this signature has no `shift` parameter and
// adds an `outstride` for the destination vector array.
// NOTE(review): presumably `outstride` is counted in __m256i elements and
// `flipud` / `fliplr` mirror the block while loading -- confirm.
static inline void load_buffer_16xn_avx2(const int16_t *input, __m256i *out,
int stride, int height, int outstride,
int flipud, int fliplr) { … }
// Transpose helper for an 8x8 tile (per its name). `in` is read-only and
// `out` is written; each side has its own stride.
// NOTE(review): presumably the strides are in units of __m256i and the tile
// holds 32-bit lanes -- confirm; also confirm whether in == out is permitted.
static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out,
const int instride,
const int outstride) { … }
// Rounding-shift helper operating on the non-const vector array `in`.
// NOTE(review): presumably shifts `size` vectors, `stride` apart, by `bit`
// with rounding on 32-bit lanes -- confirm, including how negative or zero
// `bit` values are treated.
static inline void round_shift_32_8xn_avx2(__m256i *in, int size, int bit,
int stride) { … }
// Store helper: takes read-only __m256i vectors in `in` and an int32_t
// destination `out`.
// NOTE(review): presumably writes `out_size` vectors to `out`, advancing by
// `stride` int32_t elements each time -- confirm the units of both counts and
// any alignment requirement on `out`.
static inline void store_buffer_avx2(const __m256i *const in, int32_t *out,
const int stride, const int out_size) { … }
// Transpose helper for a 16x16 tile (per its name); reads `in`, writes `out`.
// NOTE(review): presumably composed of 8x8 sub-tile transposes -- confirm
// against the body.
static inline void fwd_txfm_transpose_16x16_avx2(const __m256i *in,
__m256i *out) { … }
// Returns one __m256i computed from two (weight, input) pairs, a rounding
// vector and a shift count; all vector operands are passed by const pointer.
// NOTE(review): presumably evaluates (w0 * n0 + w1 * n1 + rounding) >> bit
// per 32-bit lane, i.e. one output of a butterfly -- confirm against the body.
static inline __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0,
const __m256i *w1, const __m256i *n1,
const __m256i *rounding, int bit) { … }
// Function-like macro taking two weights, two inputs, two outputs and a bit
// count.
// NOTE(review): presumably a full butterfly on 32-bit lanes writing `out0`
// and `out1`. There is no rounding argument, so it may depend on a variable in
// scope at the call site -- confirm, and check whether arguments are
// evaluated more than once.
#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) …
// Function-like macro with the same operands as btf_32_avx2_type0 plus an
// explicit `r` argument.
// NOTE(review): presumably `ww0` / `ww1` are already-broadcast weight vectors
// and `r` is the rounding vector -- confirm, and check whether arguments are
// evaluated more than once.
#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) …
// NOTE(review): this line looks like the tail of a longer declaration for
// `transform_1d_avx2`. The name is used below as the element type of the
// TX_TYPES-sized dispatch tables, so it is presumably a function-pointer
// typedef matching the 1-D transform helpers -- confirm the full declaration.
transform_1d_avx2;
// 1-D 8-point forward DCT kernel (per its name) over __m256i vectors.
// NOTE(review): presumably `bit` is the cosine-table precision, `col_num` the
// number of vector columns processed, and `outstride` the spacing between
// output rows in __m256i units -- confirm against the body.
static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit,
const int col_num, const int outstride) { … }
// 1-D 8-point forward ADST kernel (per its name) over __m256i vectors; same
// parameter layout as fdct8_avx2.
// NOTE(review): the last parameter is spelled `outstirde`, while the sibling
// kernels spell it `outstride`. This is almost certainly a typo; renaming it
// is harmless to callers but must be done together with every use inside the
// body.
static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
const int col_num, const int outstirde) { … }
// 1-D 8-point identity-transform kernel (per its name) over __m256i vectors.
// NOTE(review): presumably scales each input and writes it to `out` with
// `outstride` spacing; whether `bit` is actually used cannot be told from the
// signature -- confirm against the body.
static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num,
int outstride) { … }
// Externally visible entry point: 2-D forward transform for an 8x8 block.
//   input   - read-only int16_t source samples
//   coeff   - int32_t destination
//   stride  - NOTE(review): presumably the row pitch of `input` in samples
//   tx_type - selects the transform variant
//   bd      - NOTE(review): presumably bit depth; confirm whether it is used
// NOTE(review): presumably runs column pass, transpose, row pass, then stores
// to `coeff` -- confirm the order and rounding in the body.
void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) { … }
// 1-D 16-point forward DCT kernel (per its name) over __m256i vectors; same
// parameter layout as fdct8_avx2.
// NOTE(review): presumably `bit` is the cosine precision and `col_num` /
// `outstride` are counted in __m256i units -- confirm against the body.
static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit,
const int col_num, const int outstride) { … }
// 1-D 16-point forward ADST kernel (per its name) over __m256i vectors.
// The column-count parameter is named `num_cols` here, whereas the other
// kernels call it `col_num`.
// NOTE(review): presumably both names mean the same thing -- confirm.
static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
const int num_cols, const int outstride) { … }
// 1-D 16-point identity-transform kernel (per its name) over __m256i vectors.
// NOTE(review): presumably scales each input and writes it to `out` with
// `outstride` spacing -- confirm the scale factor and use of `bit` in the body.
static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit,
int col_num, const int outstride) { … }
// Constant table with TX_TYPES entries of transform_1d_avx2.
// NOTE(review): presumably indexed by tx_type to pick the 16-point column
// kernel -- confirm the entry order against the TX_TYPE enum.
static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = …;
// Constant table with TX_TYPES entries of transform_1d_avx2.
// NOTE(review): presumably indexed by tx_type to pick the 8-point row
// kernel -- confirm the entry order against the TX_TYPE enum.
static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = …;
// Externally visible entry point: 2-D forward transform for an 8x16 block.
// Reads int16_t samples from `input` (const) and targets the int32_t buffer
// `coeff`; `tx_type` selects the transform variant.
// NOTE(review): presumably dispatches through the col/row kernel tables and
// applies a rectangular-block scaling step; `stride` is presumably the input
// row pitch and `bd` the bit depth -- confirm against the body.
void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) { … }
// Constant table with TX_TYPES entries of transform_1d_avx2.
// NOTE(review): presumably indexed by tx_type to pick the 8-point column
// kernel -- confirm the entry order against the TX_TYPE enum.
static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = …;
// Constant table with TX_TYPES entries of transform_1d_avx2.
// NOTE(review): presumably indexed by tx_type to pick the 16-point row
// kernel -- confirm the entry order against the TX_TYPE enum.
static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = …;
// Externally visible entry point: 2-D forward transform for a 16x8 block.
// Reads int16_t samples from `input` (const) and targets the int32_t buffer
// `coeff`; `tx_type` selects the transform variant.
// NOTE(review): presumably dispatches through the col/row kernel tables and
// applies a rectangular-block scaling step; `stride` is presumably the input
// row pitch and `bd` the bit depth -- confirm against the body.
void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) { … }
// Externally visible entry point: 2-D forward transform for a 16x16 block.
// Reads int16_t samples from `input` (const) and targets the int32_t buffer
// `coeff`; `tx_type` selects the transform variant.
// NOTE(review): presumably runs column pass, transpose, row pass, then
// stores; `stride` is presumably the input row pitch and `bd` the bit
// depth -- confirm against the body.
void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) { … }
// 1-D 32-point forward DCT kernel (per its name) over __m256i vectors.
// Unlike the 8/16-point kernels, this one takes an `instride` instead of a
// column count.
// NOTE(review): presumably both strides are in __m256i units and `cos_bit`
// is the cosine-table precision -- confirm against the body.
static inline void fdct32_avx2(__m256i *input, __m256i *output,
const int8_t cos_bit, const int instride,
const int outstride) { … }
// 32-point identity-transform kernel (per its name) over __m256i vectors;
// same parameter layout as fdct32_avx2.
// NOTE(review): presumably scales each input and stores it to `output`;
// whether `cos_bit` is actually used cannot be told from the signature --
// confirm against the body.
static inline void idtx32x32_avx2(__m256i *input, __m256i *output,
const int8_t cos_bit, int instride,
int outstride) { … }
// Constant table with TX_TYPES entries of transform_1d_avx2.
// NOTE(review): presumably indexed by tx_type to pick the 32-point column
// kernel, with unsupported types left NULL -- confirm in the initializer.
static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = …;
// Constant table with TX_TYPES entries of transform_1d_avx2.
// NOTE(review): presumably indexed by tx_type to pick the 32-point row
// kernel, with unsupported types left NULL -- confirm in the initializer.
static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = …;
// Externally visible entry point: 2-D forward transform for a 32x32 block.
// Reads int16_t samples from `input` (const) and targets the int32_t buffer
// `output`; `tx_type` selects the transform variant.
// NOTE(review): presumably dispatches through col_txfm8x32_arr and
// row_txfm8x32_arr; `stride` is presumably the input row pitch and `bd` the
// bit depth -- confirm against the body.
void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd) { … }
// Stage 2 of the 64-point forward DCT (per its name).
// NOTE(review): presumably reads the previous stage from `x1` and writes this
// stage to `x2`, using the pre-broadcast constants `cospi_m32` / `cospi_p32`
// -- confirm against the body.
// NOTE(review): `__rounding` begins with a double underscore, which C
// reserves for the implementation; a rename would need to cover the body too.
static inline void fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
__m256i *cospi_m32, __m256i *cospi_p32,
const __m256i *__rounding,
int8_t cos_bit) { … }
// Stage 3 of the 64-point forward DCT (per its name).
// NOTE(review): presumably reads `x2` and writes `x3`, using the pre-broadcast
// constants `cospi_m32` / `cospi_p32` -- confirm against the body.
// NOTE(review): `__rounding` begins with a double underscore, which C
// reserves for the implementation; a rename would need to cover the body too.
static inline void fdct64_stage3_avx2(__m256i *x2, __m256i *x3,
__m256i *cospi_m32, __m256i *cospi_p32,
const __m256i *__rounding,
int8_t cos_bit) { … }
// Stage 4 of the 64-point forward DCT (per its name).
// NOTE(review): presumably reads `x3` and writes `x4`; the cospi_* pointers
// are presumably pre-broadcast cosine constants (m = negated, p = positive)
// -- confirm against the body.
// NOTE(review): `__rounding` begins with a double underscore, which C
// reserves for the implementation; a rename would need to cover the body too.
static inline void fdct64_stage4_avx2(__m256i *x3, __m256i *x4,
__m256i *cospi_m32, __m256i *cospi_p32,
__m256i *cospi_m16, __m256i *cospi_p48,
__m256i *cospi_m48,
const __m256i *__rounding,
int8_t cos_bit) { … }
// Stage 5 of the 64-point forward DCT (per its name); same parameter list as
// stage 4 apart from the buffer names.
// NOTE(review): presumably reads `x4` and writes `x5` using the supplied
// cosine constants -- confirm against the body.
// NOTE(review): `__rounding` begins with a double underscore, which C
// reserves for the implementation; a rename would need to cover the body too.
static inline void fdct64_stage5_avx2(__m256i *x4, __m256i *x5,
__m256i *cospi_m32, __m256i *cospi_p32,
__m256i *cospi_m16, __m256i *cospi_p48,
__m256i *cospi_m48,
const __m256i *__rounding,
int8_t cos_bit) { … }
// Stage 6 of the 64-point forward DCT (per its name). Takes eleven cosine
// constant pointers in addition to the stage buffers.
// NOTE(review): presumably reads `x5` and writes `x6` -- confirm against the
// body. The long constant list is easy to mis-order at call sites; check
// callers.
// NOTE(review): `__rounding` begins with a double underscore, which C
// reserves for the implementation; a rename would need to cover the body too.
static inline void fdct64_stage6_avx2(
__m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32,
__m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
__m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56,
__m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24,
const __m256i *__rounding, int8_t cos_bit) { … }
// Stage 7 of the 64-point forward DCT (per its name). Takes eight cosine
// constant pointers in addition to the stage buffers.
// NOTE(review): presumably reads `x6` and writes `x7` -- confirm against the
// body.
// NOTE(review): `__rounding` begins with a double underscore, which C
// reserves for the implementation; a rename would need to cover the body too.
static inline void fdct64_stage7_avx2(__m256i *x6, __m256i *x7,
__m256i *cospi_p08, __m256i *cospi_p56,
__m256i *cospi_p40, __m256i *cospi_p24,
__m256i *cospi_m08, __m256i *cospi_m56,
__m256i *cospi_m40, __m256i *cospi_m24,
const __m256i *__rounding,
int8_t cos_bit) { … }
// Stage 8 of the 64-point forward DCT (per its name). From this stage on the
// helpers receive the scalar table `cospi` rather than individual vectors.
// NOTE(review): presumably reads `x7`, writes `x8`, and broadcasts the needed
// `cospi` entries internally -- confirm against the body.
// NOTE(review): `__rounding` begins with a double underscore, which C
// reserves for the implementation; a rename would need to cover the body too.
static inline void fdct64_stage8_avx2(__m256i *x7, __m256i *x8,
const int32_t *cospi,
const __m256i *__rounding,
int8_t cos_bit) { … }
// Stage 9 of the 64-point forward DCT (per its name); same parameter layout
// as stage 8.
// NOTE(review): presumably reads `x8` and writes `x9` using entries of the
// scalar table `cospi` -- confirm against the body.
// NOTE(review): `__rounding` begins with a double underscore, which C
// reserves for the implementation; a rename would need to cover the body too.
static inline void fdct64_stage9_avx2(__m256i *x8, __m256i *x9,
const int32_t *cospi,
const __m256i *__rounding,
int8_t cos_bit) { … }
// Stage 10 of the 64-point forward DCT (per its name); same parameter layout
// as stages 8 and 9.
// NOTE(review): presumably reads `x9` and writes `x10` using entries of the
// scalar table `cospi` -- confirm against the body.
// NOTE(review): `__rounding` begins with a double underscore, which C
// reserves for the implementation; a rename would need to cover the body too.
static inline void fdct64_stage10_avx2(__m256i *x9, __m256i *x10,
const int32_t *cospi,
const __m256i *__rounding,
int8_t cos_bit) { … }
// 1-D 64-point forward DCT kernel (per its name) over __m256i vectors.
// NOTE(review): presumably chains the fdct64_stage*_avx2 helpers and reads /
// writes with `instride` / `outstride` in __m256i units; confirm how many
// outputs are actually produced -- check the body.
static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit,
const int instride, const int outstride) { … }
// Externally visible entry point: 2-D forward transform for a 64x64 block.
// Reads int16_t samples from `input` (const) and targets the int32_t buffer
// `output`.
// NOTE(review): presumably only a subset of `tx_type` values is valid at this
// size, and only part of the 64x64 coefficient area may be written; `stride`
// is presumably the input row pitch and `bd` the bit depth -- confirm against
// the body.
void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd) { … }