#include <assert.h>
#include <smmintrin.h>
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
static inline void store_output_w4(int32_t *const out, const __m128i *const in,
const int stride, const int out_size) { … }
void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) { … }
static inline void load_buffer_4x4(const int16_t *input, __m128i *in,
int stride, int flipud, int fliplr,
int shift) { … }
static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit,
const int num_col) { … }
static inline void write_buffer_4x4(__m128i *res, int32_t *output) { … }
static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
const int num_col) { … }
static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { … }
void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
int input_stride, TX_TYPE tx_type, int bd) { … }
static inline void load_buffer_8x8(const int16_t *input, __m128i *in,
int stride, int flipud, int fliplr,
int shift) { … }
static inline void col_txfm_8x8_rounding(__m128i *in, int shift) { … }
static inline void col_txfm_4x8_rounding(__m128i *in, int shift) { … }
static inline void write_buffer_8x8(const __m128i *res, int32_t *output) { … }
static inline void write_buffer_16x8(const __m128i *res, int32_t *output,
const int stride) { … }
static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit,
const int col_num) { … }
static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
const int col_num) { … }
static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
const int col_num) { … }
static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { … }
#if !CONFIG_REALTIME_ONLY
// Identity transform kernel: each output lane is exactly twice the matching
// input lane (the factor-of-2 scale is folded in via a self-add).  The block
// spans 8 register rows of two __m128i each, laid out at indices
// j + 8 * row for j in {0, 1}.  `bit` and `col_num` are part of the common
// fwd_transform_1d_sse4_1 signature but are unused here.
static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
  (void)bit;
  (void)col_num;
  for (int row = 0; row < 8; ++row) {
    for (int j = 0; j < 2; ++j) {
      const int idx = j + 8 * row;
      out[idx] = _mm_add_epi32(in[idx], in[idx]);
    }
  }
}
#endif
void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) { … }
static inline void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { … }
static inline void load_buffer_16x16(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift) { … }
static inline void load_buffer_8x16(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift) { … }
static inline void load_buffer_8x4(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift) { … }
static inline void load_buffer_16x4(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift) { … }
static inline void load_buffer_4x8(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift) { … }
#if !CONFIG_REALTIME_ONLY
// Loads a 4x16 block as two stacked 4x8 halves.  For a vertical flip the two
// halves swap places here, while load_buffer_4x8 (which also receives flipud)
// handles the row order inside each half.
static inline void load_buffer_4x16(const int16_t *input, __m128i *out,
                                    const int stride, const int flipud,
                                    const int fliplr, const int shift) {
  const int16_t *upper_half = input;
  const int16_t *lower_half = input + 8 * stride;
  if (flipud) {
    const int16_t *swap = upper_half;
    upper_half = lower_half;
    lower_half = swap;
  }
  load_buffer_4x8(upper_half, out, stride, flipud, fliplr, shift);
  load_buffer_4x8(lower_half, out + 8, stride, flipud, fliplr, shift);
}
#endif
static inline void load_buffer_32x8n(const int16_t *input, __m128i *out,
int stride, int flipud, int fliplr,
int shift, const int height) { … }
static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
const int col_num) { … }
static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
const int num_cols) { … }
static void col_txfm_16x16_rounding(__m128i *in, int shift) { … }
static void col_txfm_8x16_rounding(__m128i *in, int shift) { … }
static void write_buffer_16x16(const __m128i *in, int32_t *output) { … }
static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { … }
void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
static inline void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { … }
static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = …;
#if !CONFIG_REALTIME_ONLY
// Row-transform kernels for the 32x8 / 8x32 paths, indexed by TX_TYPE
// (entry names below follow the av1 TX_TYPE enum ordering — confirm against
// the enum if it changes).  Only DCT_DCT and IDTX have kernels; a NULL entry
// means that transform type is not expected for these block sizes.
static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = {
  fdct8x8_sse4_1,   // DCT_DCT
  NULL,             // ADST_DCT
  NULL,             // DCT_ADST
  NULL,             // ADST_ADST
  NULL,             // FLIPADST_DCT
  NULL,             // DCT_FLIPADST
  NULL,             // FLIPADST_FLIPADST
  NULL,             // ADST_FLIPADST
  NULL,             // FLIPADST_ADST
  idtx32x8_sse4_1,  // IDTX
  NULL,             // V_DCT
  NULL,             // H_DCT
  NULL,             // V_ADST
  NULL,             // H_ADST
  NULL,             // V_FLIPADST
  NULL,             // H_FLIPADST
};
#endif
static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = …;
static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = …;
void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
#if !CONFIG_REALTIME_ONLY
// 2-D forward transform for a 4x16 high-bitdepth block: length-16 column
// transform, intermediate rounding, transpose, then length-4 row transform,
// with results written to `coeff`.
void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff,
                                int stride, TX_TYPE tx_type, int bd) {
  __m128i in[16];  // 4x16 = 64 int32 values = 16 SSE registers
  __m128i *outcoeff128 = (__m128i *)coeff;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];  // per-stage shifts
  const int txw_idx = get_txw_idx(TX_4X16);
  const int txh_idx = get_txh_idx(TX_4X16);
  const int txfm_size_col = tx_size_wide[TX_4X16];  // 4
  const int txfm_size_row = tx_size_high[TX_4X16];  // 16
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // Column transform over the whole block; flips are applied at load time.
  load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]);
  col_txfm(in, outcoeff128, bitcol, 1);
  // NOTE(review): reuses the 8x8 rounding helper; 4x16 holds the same 64
  // coefficients as 8x8, so presumably it covers the full buffer — confirm
  // against col_txfm_8x8_rounding's register count.
  col_txfm_8x8_rounding(outcoeff128, -shift[1]);
  transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
  // Row transform: one 4-coefficient register group per iteration.
  for (int i = 0; i < 4; i++) {
    __m128i tmp[4];
    row_txfm(in + i, tmp, bitrow, txfm_size_row >> 2);
    store_output_w4(coeff + i * 4, tmp, txfm_size_row, txfm_size_col);
  }
  (void)bd;  // bit depth does not alter the transform arithmetic here
}
#endif
void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) { … }
#if !CONFIG_REALTIME_ONLY
// 2-D forward transform for an 8x32 high-bitdepth block.  No flip handling
// is done here: both loads pass flipud = fliplr = 0 (only non-flip transform
// types reach this size — TODO confirm against the kernel tables).
void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff,
                                int stride, TX_TYPE tx_type, int bd) {
  __m128i in[64];  // 8x32 = 256 int32 values = 64 SSE registers
  __m128i *outcoef128 = (__m128i *)coeff;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
  const int txw_idx = get_txw_idx(TX_8X32);
  const int txh_idx = get_txh_idx(TX_8X32);
  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[TX_8X32];  // 8
  const int txfm_size_row = tx_size_high[TX_8X32];  // 32
  const int num_col = txfm_size_col >> 2;  // 4-lane column groups per row
  // Load the block as two 8x16 halves: top half, then bottom half.
  load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
  load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
                   stride, 0, 0, shift[0]);
  // Length-32 column transform, one 4-lane column group at a time.
  for (int i = 0; i < num_col; i++) {
    col_txfm((in + i), (in + i), bitcol, num_col);
  }
  col_txfm_16x16_rounding(in, -shift[1]);
  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
  // Length-8 row transform, advancing two registers per call.
  for (int i = 0; i < txfm_size_col; i += 2) {
    row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col);
  }
  (void)bd;
}
// 2-D forward transform for a 32x8 high-bitdepth block — the mirror of the
// 8x32 path.  Note the kernel tables are deliberately swapped: the length-8
// column transform comes from the "row" 32x8 table and the length-32 row
// transform from the "col" 8x32 table.
void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff,
                                int stride, TX_TYPE tx_type, int bd) {
  __m128i in[64];  // 32x8 = 256 int32 values = 64 SSE registers
  __m128i *outcoef128 = (__m128i *)coeff;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
  const int txw_idx = get_txw_idx(TX_32X8);
  const int txh_idx = get_txh_idx(TX_32X8);
  const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type];
  const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[TX_32X8];  // 32
  const int txfm_size_row = tx_size_high[TX_32X8];  // 8
  // NOTE(review): named num_col but derived from the row size (8 >> 2 = 2);
  // it is the register-group count used by the row-transform loop below.
  const int num_col = txfm_size_row >> 2;
  // Flips are not applied here (flipud = fliplr = 0).
  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
  // Length-8 column transform, advancing two registers per call.
  for (int i = 0; i < txfm_size_row; i += 2) {
    col_txfm((in + i), (in + i), bitcol, txfm_size_row);
  }
  col_txfm_16x16_rounding(&in[0], -shift[1]);
  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
  // Length-32 row transform, one 4-lane group at a time.
  for (int i = 0; i < num_col; i++) {
    row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col);
  }
  (void)bd;
}
#endif
void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) { … }
void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) { … }
#if !CONFIG_REALTIME_ONLY
// 2-D forward transform for a 16x64 high-bitdepth block: length-64 column
// DCT, rounding, transpose of the retained top 32 rows, then a length-16
// row DCT over those rows.
void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
  __m128i in[256];  // 16x64 = 1024 int32 values = 256 SSE registers
  __m128i *outcoeff128 = (__m128i *)coeff;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
  const int txw_idx = get_txw_idx(TX_16X64);
  const int txh_idx = get_txh_idx(TX_16X64);
  const int txfm_size_col = tx_size_wide[TX_16X64];  // 16
  const int txfm_size_row = tx_size_high[TX_16X64];  // 64
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int num_col = txfm_size_col >> 2;  // 4 registers per 16-wide row
  // Load 4 input rows per outer iteration.  Each load_buffer_4x4 call is
  // given internal stride num_col (4), so it appears to read 16 contiguous
  // pixels of a single input row as four 4-lane groups — TODO confirm
  // against load_buffer_4x4's body.
  for (int i = 0; i < txfm_size_row; i += num_col) {
    load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
                    ud_flip, lr_flip, shift[0]);
  }
  // Length-64 column transform, one 4-lane column group at a time.
  for (int i = 0; i < num_col; i++) {
    av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col);
  }
  // Round all 256 registers in four 64-register chunks.
  col_txfm_16x16_rounding(outcoeff128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
  // Only 32 rows are transposed and row-transformed: presumably the
  // length-64 DCT retains just the 32 low-frequency outputs — confirm
  // against av1_fdct64_sse4_1.
  transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
  fdct16x16_sse4_1(in, outcoeff128, bitrow, 8);
  (void)bd;
}
// 2-D forward transform for a 64x16 high-bitdepth block: length-16 column
// DCT over 64 columns, rounding, transpose, length-64 row DCT, then zeroing
// of the unwritten half of the coefficient buffer.
void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
  __m128i in[256];  // 64x16 = 1024 int32 values = 256 SSE registers
  __m128i *outcoeff128 = (__m128i *)coeff;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
  const int txw_idx = get_txw_idx(TX_64X16);
  const int txh_idx = get_txh_idx(TX_64X16);
  const int txfm_size_col = tx_size_wide[TX_64X16];  // 64
  const int txfm_size_row = tx_size_high[TX_64X16];  // 16
  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // Load each 64-pixel input row as four 16-pixel chunks.  load_buffer_4x4
  // is given internal stride 4, so each call appears to read 16 contiguous
  // pixels of the row as four 4-lane groups — TODO confirm against
  // load_buffer_4x4's body.  in[] ends up row-major, 16 registers per row.
  for (int i = 0; i < txfm_size_row; i++) {
    load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
    load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
                    ud_flip, lr_flip, shift[0]);
  }
  // Length-16 column transform across all 16 register-wide column groups.
  fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row);
  // Round all 256 registers in four 64-register chunks.
  col_txfm_16x16_rounding(outcoeff128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
  col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
  transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
  // Length-64 row transform, one 4-lane group at a time.
  for (int i = 0; i < 4; i++) {
    av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitrow, 4, 4);
  }
  // Zero the second half of the coefficient buffer (coeff[512..1023]);
  // presumably the length-64 row transform retains only 32 outputs per row
  // and downstream code expects the remainder cleared — confirm.
  memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff));
  (void)bd;
}
#endif