#include <assert.h>
#include <smmintrin.h>
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
static inline void store_output_w4(int32_t *const out, const __m128i *const in,
const int stride, const int out_size) { … }
void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) { … }
static inline void load_buffer_4x4(const int16_t *input, __m128i *in,
int stride, int flipud, int fliplr,
int shift) { … }
static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit,
const int num_col) { … }
static inline void write_buffer_4x4(__m128i *res, int32_t *output) { … }
static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
const int num_col) { … }
static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { … }
void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
int input_stride, TX_TYPE tx_type, int bd) { … }
static inline void load_buffer_8x8(const int16_t *input, __m128i *in,
int stride, int flipud, int fliplr,
int shift) { … }
static inline void col_txfm_8x8_rounding(__m128i *in, int shift) { … }
static inline void col_txfm_4x8_rounding(__m128i *in, int shift) { … }
static inline void write_buffer_8x8(const __m128i *res, int32_t *output) { … }
static inline void write_buffer_16x8(const __m128i *res, int32_t *output,
const int stride) { … }
static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit,
const int col_num) { … }
static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
const int col_num) { … }
static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
const int col_num) { … }
static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { … }
#if !CONFIG_REALTIME_ONLY
// Identity transform kernel: each output lane is exactly twice the matching
// input lane (the factor-of-2 scale is folded in via a self-add).  The block
// spans 8 register rows of two __m128i each, laid out at indices
// j + 8 * row for j in {0, 1}.  `bit` and `col_num` are part of the common
// fwd_transform_1d_sse4_1 signature but are unused here.
static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
  (void)bit;
  (void)col_num;
  for (int row = 0; row < 8; ++row) {
    for (int j = 0; j < 2; ++j) {
      const int idx = j + 8 * row;
      out[idx] = _mm_add_epi32(in[idx], in[idx]);
    }
  }
}
#endif
void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) { … }
static inline void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { … }
static inline void load_buffer_16x16(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift) { … }
static inline void load_buffer_8x16(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift) { … }
static inline void load_buffer_8x4(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift) { … }
static inline void load_buffer_16x4(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift) { … }
static inline void load_buffer_4x8(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift) { … }
#if !CONFIG_REALTIME_ONLY
// Loads a 4x16 block as two stacked 4x8 halves.  For a vertical flip the two
// halves swap places here, while load_buffer_4x8 (which also receives flipud)
// handles the row order inside each half.
static inline void load_buffer_4x16(const int16_t *input, __m128i *out,
                                    const int stride, const int flipud,
                                    const int fliplr, const int shift) {
  const int16_t *upper_half = input;
  const int16_t *lower_half = input + 8 * stride;
  if (flipud) {
    const int16_t *swap = upper_half;
    upper_half = lower_half;
    lower_half = swap;
  }
  load_buffer_4x8(upper_half, out, stride, flipud, fliplr, shift);
  load_buffer_4x8(lower_half, out + 8, stride, flipud, fliplr, shift);
}
#endif
static inline void load_buffer_32x8n(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift, const int height) { … }
static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
const int col_num) { … }
static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
const int num_cols) { … }
static void col_txfm_16x16_rounding(__m128i *in, int shift) { … }
static void col_txfm_8x16_rounding(__m128i *in, int shift) { … }
static void write_buffer_16x16(const __m128i *in, int32_t *output) { … }
static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { … }
void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
static inline void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { … }
static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = …;
#if !CONFIG_REALTIME_ONLY
// Row-transform kernels for the 32x8 / 8x32 paths, indexed by TX_TYPE
// (entry names below follow the av1 TX_TYPE enum ordering — confirm against
// the enum if it changes).  Only DCT_DCT and IDTX have kernels; a NULL entry
// means that transform type is not expected for these block sizes.
static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = {
  fdct8x8_sse4_1,   // DCT_DCT
  NULL,             // ADST_DCT
  NULL,             // DCT_ADST
  NULL,             // ADST_ADST
  NULL,             // FLIPADST_DCT
  NULL,             // DCT_FLIPADST
  NULL,             // FLIPADST_FLIPADST
  NULL,             // ADST_FLIPADST
  NULL,             // FLIPADST_ADST
  idtx32x8_sse4_1,  // IDTX
  NULL,             // V_DCT
  NULL,             // H_DCT
  NULL,             // V_ADST
  NULL,             // H_ADST
  NULL,             // V_FLIPADST
  NULL,             // H_FLIPADST
};
#endif
static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = …;
void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
#if !CONFIG_REALTIME_ONLY
// 2-D forward transform for a 4x16 high-bitdepth block: length-16 column
// transform, intermediate rounding, transpose, then length-4 row transform,
// with results written to `coeff`.
void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff,
                                int stride, TX_TYPE tx_type, int bd) {
  __m128i in[16];  // 4x16 = 64 int32 values = 16 SSE registers
  __m128i *outcoeff128 = (__m128i *)coeff;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];  // per-stage shifts
  const int txw_idx = get_txw_idx(TX_4X16);
  const int txh_idx = get_txh_idx(TX_4X16);
  const int txfm_size_col = tx_size_wide[TX_4X16];  // 4
  const int txfm_size_row = tx_size_high[TX_4X16];  // 16
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // Column transform over the whole block; flips are applied at load time.
  load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]);
  col_txfm(in, outcoeff128, bitcol, 1);
  // NOTE(review): reuses the 8x8 rounding helper; 4x16 holds the same 64
  // coefficients as 8x8, so presumably it covers the full buffer — confirm
  // against col_txfm_8x8_rounding's register count.
  col_txfm_8x8_rounding(outcoeff128, -shift[1]);
  transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
  // Row transform: one 4-coefficient register group per iteration.
  for (int i = 0; i < 4; i++) {
    __m128i tmp[4];
    row_txfm(in + i, tmp, bitrow, txfm_size_row >> 2);
    store_output_w4(coeff + i * 4, tmp, txfm_size_row, txfm_size_col);
  }
  (void)bd;  // bit depth does not alter the transform arithmetic here
}
#endif
void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
#if !CONFIG_REALTIME_ONLY
// 2-D forward transform for an 8x32 high-bitdepth block.  No flip handling
// is done here: both loads pass flipud = fliplr = 0 (only non-flip transform
// types reach this size — TODO confirm against the kernel tables).
void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff,
                                int stride, TX_TYPE tx_type, int bd) {
  __m128i in[64];  // 8x32 = 256 int32 values = 64 SSE registers
  __m128i *outcoef128 = (__m128i *)coeff;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
  const int txw_idx = get_txw_idx(TX_8X32);
  const int txh_idx = get_txh_idx(TX_8X32);
  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[TX_8X32];  // 8
  const int txfm_size_row = tx_size_high[TX_8X32];  // 32
  const int num_col = txfm_size_col >> 2;  // 4-lane column groups per row
  // Load the block as two 8x16 halves: top half, then bottom half.
  load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
  load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
                   stride, 0, 0, shift[0]);
  // Length-32 column transform, one 4-lane column group at a time.
  for (int i = 0; i < num_col; i++) {
    col_txfm((in + i), (in + i), bitcol, num_col);
  }
  col_txfm_16x16_rounding(in, -shift[1]);
  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
  // Length-8 row transform, advancing two registers per call.
  for (int i = 0; i < txfm_size_col; i += 2) {
    row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col);
  }
  (void)bd;
}
// 2-D forward transform for a 32x8 high-bitdepth block — the mirror of the
// 8x32 path.  Note the kernel tables are deliberately swapped: the length-8
// column transform comes from the "row" 32x8 table and the length-32 row
// transform from the "col" 8x32 table.
void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff,
                                int stride, TX_TYPE tx_type, int bd) {
  __m128i in[64];  // 32x8 = 256 int32 values = 64 SSE registers
  __m128i *outcoef128 = (__m128i *)coeff;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
  const int txw_idx = get_txw_idx(TX_32X8);
  const int txh_idx = get_txh_idx(TX_32X8);
  const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type];
  const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[TX_32X8];  // 32
  const int txfm_size_row = tx_size_high[TX_32X8];  // 8
  // NOTE(review): named num_col but derived from the row size (8 >> 2 = 2);
  // it is the register-group count used by the row-transform loop below.
  const int num_col = txfm_size_row >> 2;
  // Flips are not applied here (flipud = fliplr = 0).
  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
  // Length-8 column transform, advancing two registers per call.
  for (int i = 0; i < txfm_size_row; i += 2) {
    col_txfm((in + i), (in + i), bitcol, txfm_size_row);
  }
  col_txfm_16x16_rounding(&in[0], -shift[1]);
  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
  // Length-32 row transform, one 4-lane group at a time.
  for (int i = 0; i < num_col; i++) {
    row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col);
  }
  (void)bd;
}
#endif
void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) { … }
#if !CONFIG_REALTIME_ONLY
// 2-D forward transform for a 16x64 high-bitdepth block: length-64 column
// DCT, rounding, transpose of the retained top 32 rows, then a length-16
// row DCT over those rows.
void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
  __m128i in[256];  // 16x64 = 1024 int32 values = 256 SSE registers
  __m128i *outcoeff128 = (__m128i *)coeff;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
  const int txw_idx = get_txw_idx(TX_16X64);
  const int txh_idx = get_txh_idx(TX_16X64);
  const int txfm_size_col = tx_size_wide[TX_16X64];  // 16
  const int txfm_size_row = tx_size_high[TX_16X64];  // 64
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int num_col = txfm_size_col >> 2;  // 4 registers per 16-wide row
  // Load 4 input rows per outer iteration.  Each load_buffer_4x4 call is
  // given internal stride num_col (4), so it appears to read 16 contiguous
  // pixels of a single input row as four 4-lane groups — TODO confirm
  // against load_buffer_4x4's body.
  for (int i = 0; i < txfm_size_row; i += num_col) {
    load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
  }
  // Length-64 column transform, one 4-lane column group at a time.
  for (int i = 0; i < num_col; i++) {
    av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col);
  }
  // Round all 256 registers in four 64-register chunks.
  col_txfm_16x16_rounding(outcoeff128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
  // Only 32 rows are transposed and row-transformed: presumably the
  // length-64 DCT retains just the 32 low-frequency outputs — confirm
  // against av1_fdct64_sse4_1.
  transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
  fdct16x16_sse4_1(in, outcoeff128, bitrow, 8);
  (void)bd;
}
// 2-D forward transform for a 64x16 high-bitdepth block: length-16 column
// DCT over 64 columns, rounding, transpose, length-64 row DCT, then zeroing
// of the unwritten half of the coefficient buffer.
void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
  __m128i in[256];  // 64x16 = 1024 int32 values = 256 SSE registers
  __m128i *outcoeff128 = (__m128i *)coeff;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
  const int txw_idx = get_txw_idx(TX_64X16);
  const int txh_idx = get_txh_idx(TX_64X16);
  const int txfm_size_col = tx_size_wide[TX_64X16];  // 64
  const int txfm_size_row = tx_size_high[TX_64X16];  // 16
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // Load each 64-pixel input row as four 16-pixel chunks.  load_buffer_4x4
  // is given internal stride 4, so each call appears to read 16 contiguous
  // pixels of the row as four 4-lane groups — TODO confirm against
  // load_buffer_4x4's body.  in[] ends up row-major, 16 registers per row.
  for (int i = 0; i < txfm_size_row; i++) {
    load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
  }
  // Length-16 column transform across all 16 register-wide column groups.
  fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row);
  // Round all 256 registers in four 64-register chunks.
  col_txfm_16x16_rounding(outcoeff128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
  transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
  // Length-64 row transform, one 4-lane group at a time.
  for (int i = 0; i < 4; i++) {
    av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitrow, 4, 4);
  }
  // Zero the second half of the coefficient buffer (coeff[512..1023]);
  // presumably the length-64 row transform retains only 32 outputs per row
  // and downstream code expects the remainder cleared — confirm.
  memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff));
  (void)bd;
}
#endif