// av1_fwd_txfm_sse2.c

/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"

// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible).

static void fdct4x4_new_sse2(const __m128i *input, __m128i *output,
                             int8_t cos_bit) { … }

static void fdct8x4_new_sse2(const __m128i *input, __m128i *output,
                             int8_t cos_bit) { … }

static void fdct4x8_new_sse2(const __m128i *input, __m128i *output,
                             int8_t cos_bit) { … }

static void fdct8x16_new_sse2(const __m128i *input, __m128i *output,
                              int8_t cos_bit) { … }

void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
                           int8_t cos_bit) { … }

void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
                           int8_t cos_bit) { … }

static void fadst4x4_new_sse2(const __m128i *input, __m128i *output,
                              int8_t cos_bit) { … }

static void fadst4x8_new_sse2(const __m128i *input, __m128i *output,
                              int8_t cos_bit) { … }

static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
                              int8_t cos_bit) { … }

static void fadst8x16_new_sse2(const __m128i *input, __m128i *output,
                               int8_t cos_bit) { … }

static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = …;

static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = …;

static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = …;

static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = …;

static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = …;

static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = …;

static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = …;

static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = …;

static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = …;

static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = …;

static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = …;

void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) { … }

void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) { … }

// Include the top-level function only on 32-bit x86, to support Valgrind.
// Normal use requires SSE4.1, so av1_lowbd_fwd_txfm_sse4_1 is used instead of
// this function. However, 32-bit Valgrind does not support SSE4.1, so we
// provide an SSE2 fallback to improve performance.
#if AOM_ARCH_X86
// SSE2 forward 2D transform kernels, looked up by txfm_param->tx_size in
// av1_lowbd_fwd_txfm_sse2(). A NULL entry means no SSE2 kernel is registered
// for that size; the dispatcher then calls av1_lowbd_fwd_txfm_c() instead.
// The table is read-only, so it is declared const to prevent accidental writes.
// NOTE(review): entry order is assumed to match the TX_SIZE enum order, as the
// per-entry size comments suggest -- confirm against the enum definition.
static const FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
  av1_lowbd_fwd_txfm2d_4x4_sse2,    // 4x4 transform
  av1_lowbd_fwd_txfm2d_8x8_sse2,    // 8x8 transform
  av1_lowbd_fwd_txfm2d_16x16_sse2,  // 16x16 transform
  av1_lowbd_fwd_txfm2d_32x32_sse2,  // 32x32 transform
  NULL,                             // 64x64 transform
  av1_lowbd_fwd_txfm2d_4x8_sse2,    // 4x8 transform
  av1_lowbd_fwd_txfm2d_8x4_sse2,    // 8x4 transform
  av1_lowbd_fwd_txfm2d_8x16_sse2,   // 8x16 transform
  av1_lowbd_fwd_txfm2d_16x8_sse2,   // 16x8 transform
  av1_lowbd_fwd_txfm2d_16x32_sse2,  // 16x32 transform
  av1_lowbd_fwd_txfm2d_32x16_sse2,  // 32x16 transform
  NULL,                             // 32x64 transform
  NULL,                             // 64x32 transform
  av1_lowbd_fwd_txfm2d_4x16_sse2,   // 4x16 transform
  av1_lowbd_fwd_txfm2d_16x4_sse2,   // 16x4 transform
  av1_lowbd_fwd_txfm2d_8x32_sse2,   // 8x32 transform
  av1_lowbd_fwd_txfm2d_32x8_sse2,   // 32x8 transform
  av1_lowbd_fwd_txfm2d_16x64_sse2,  // 16x64 transform
  av1_lowbd_fwd_txfm2d_64x16_sse2,  // 64x16 transform
};

// Top-level low-bitdepth forward transform entry point for the SSE2 path.
//
// Looks up the SSE2 kernel registered for txfm_param->tx_size and runs it on
// src_diff (row stride diff_stride), writing the coefficients to coeff. The
// C implementation is used instead when either:
//   - no SSE2 kernel is registered for the size (NULL table entry), or
//   - the block is a lossless TX_4X4 block.
void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff,
                             int diff_stride, TxfmParam *txfm_param) {
  const FwdTxfm2dFunc sse2_kernel = fwd_txfm2d_func_ls[txfm_param->tx_size];
  const int use_c_fallback =
      (sse2_kernel == NULL) ||
      (txfm_param->lossless && txfm_param->tx_size == TX_4X4);

  if (use_c_fallback) {
    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
    return;
  }

  sse2_kernel(src_diff, coeff, diff_stride, txfm_param->tx_type,
              txfm_param->bd);
}
#endif  // AOM_ARCH_X86
// chromium/third_party/libaom/source/libaom/av1/encoder/x86/av1_fwd_txfm_sse2.c