#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
#include <smmintrin.h>
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse4.h"
#ifdef __cplusplus
extern "C" {
#endif
// Prototypes for three routines defined outside this header. All three share
// the same parameter list:
//   input       - pointer to const __m128i; not written through this pointer.
//   output      - pointer to non-const __m128i.
//   cos_bit     - int8_t passed by value.
//   stage_range - pointer to const int8_t.
// NOTE(review): by their names these look like the SSE4.1 4-, 8- and 16-point
// forward DCT kernels -- confirm against the definitions. The required array
// lengths and the exact meaning of cos_bit / stage_range are not visible here.
void av1_fdct4_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
void av1_fdct8_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
void av1_fdct16_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
// Prototype only; the definition is outside this header.
// The signature differs from the fdct4/8/16 prototypes in this header: input
// is NOT const-qualified, cos_bit is a plain int rather than int8_t, and the
// final parameter is an int stride instead of a stage_range array.
// NOTE(review): presumably the SSE4.1 32-point forward DCT, with stride being
// the spacing between consecutive elements in the __m128i arrays -- confirm
// against the definition, including whether input is actually modified.
void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
const int stride);
// Prototype only; the definition is outside this header.
// Takes a non-const input pointer, an int8_t cos_bit, and two separate int
// strides (instride, outstride) rather than a stage_range array.
// NOTE(review): presumably the SSE4.1 64-point forward DCT, with instride and
// outstride being the element spacing of the input and output arrays
// respectively -- confirm against the definition, including whether input is
// actually modified.
void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
const int instride, const int outstride);
// Prototypes for three routines defined outside this header. All three share
// the same parameter list:
//   input       - pointer to const __m128i; not written through this pointer.
//   output      - pointer to non-const __m128i.
//   cos_bit     - int8_t passed by value.
//   stage_range - pointer to const int8_t.
// NOTE(review): by their names these look like the SSE4.1 4-, 8- and 16-point
// forward ADST kernels -- confirm against the definitions. The required array
// lengths and the exact meaning of cos_bit / stage_range are not visible here.
void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
void av1_fadst8_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
void av1_fadst16_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
// Prototypes for five routines defined outside this header. All five share
// the same parameter list:
//   input       - pointer to const __m128i; not written through this pointer.
//   output      - pointer to non-const __m128i.
//   cos_bit     - int8_t passed by value.
//   stage_range - pointer to const int8_t.
// NOTE(review): by their names these look like the SSE4.1 inverse DCT kernels
// for sizes 4, 8, 16, 32 and 64 -- confirm against the definitions. The
// required array lengths and the exact meaning of cos_bit / stage_range are
// not visible here.
void av1_idct4_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
void av1_idct8_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
void av1_idct16_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
void av1_idct32_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
void av1_idct64_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
// Prototypes for three routines defined outside this header. All three share
// the same parameter list:
//   input       - pointer to const __m128i; not written through this pointer.
//   output      - pointer to non-const __m128i.
//   cos_bit     - int8_t passed by value.
//   stage_range - pointer to const int8_t.
// NOTE(review): by their names these look like the SSE4.1 4-, 8- and 16-point
// inverse ADST kernels -- confirm against the definitions. The required array
// lengths and the exact meaning of cos_bit / stage_range are not visible here.
void av1_iadst4_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
void av1_iadst8_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
void av1_iadst16_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
// Prototype only; the definition is outside this header.
// Takes a non-const input pointer, a plain int cos_bit, and an int col_num
// instead of a stage_range array.
// NOTE(review): presumably a 32-point identity transform, with col_num being
// the number of __m128i columns processed or the column spacing -- the name
// alone does not settle which; confirm against the definition.
void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
const int col_num);
// Header-local helper (static inline), so each including translation unit
// gets its own copy.
//   stride - int passed by value.
//   input  - pointer to const __m128i; not written through this pointer.
//   output - pointer to non-const __m128i.
// NOTE(review): the body is not shown in this view. By name this presumably
// transposes one 4x4 tile of 32-bit lanes, with stride giving the spacing
// between the tile's rows in the arrays -- confirm against the body.
static inline void transpose_32_4x4(int stride, const __m128i *input,
__m128i *output) { … }
// Header-local helper (static inline), so each including translation unit
// gets its own copy.
//   txfm_size - int passed by value.
//   input     - pointer to const __m128i; not written through this pointer.
//   output    - pointer to non-const __m128i.
// NOTE(review): the body is not shown in this view. By name this presumably
// transposes a txfm_size x txfm_size block of 32-bit values -- confirm
// against the body, including whether input and output may alias.
static inline void transpose_32(int txfm_size, const __m128i *input,
__m128i *output) { … }
// Function-like macros taking two weights (w0, w1), two inputs (in0, in1),
// two outputs (out0, out1) and a bit count (bit).
// NOTE(review): the expansions are not shown in this view. By name these are
// presumably the two variants of a 32-bit "butterfly" step that combine
// in0/in1 with w0/w1 and round-shift by bit -- confirm against the expansion.
// As with any function-like macro, arguments may be evaluated more than once,
// so avoid passing expressions with side effects.
#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) …
#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) …
// Function-like macros taking two weights (ww0, ww1), two inputs (in0, in1),
// two outputs (out0, out1), an extra argument r, and a bit count (bit).
// NOTE(review): the expansions are not shown in this view. By name these are
// presumably "butterfly" variants that take the weights and a rounding term
// (r) precomputed by the caller -- confirm against the expansion.
// As with any function-like macro, arguments may be evaluated more than once,
// so avoid passing expressions with side effects.
#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) …
#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) …
#ifdef __cplusplus
}
#endif
#endif