// chromium/third_party/libaom/source/libaom/av1/common/x86/av1_inv_txfm_avx2.c

/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_config.h"

#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/common/x86/av1_inv_txfm_avx2.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"

// TODO([email protected]): move this to header file

// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
// Each entry is the previous one times sqrt(2), in 2^12 fixed point:
// NewSqrt2 = round(2^12 * sqrt(2)) = 5793; even powers are exact (2*4096, ...).
// NOTE(review): the initializer was missing in this copy ("=;"); restored per
// the comment above and the NewSqrt2 fixed-point convention of av1_txfm —
// confirm against upstream.
static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
                                          4 * 5793 };

// Stage 5 of the 16-point inverse DCT, in place on x1 (per its name/signature).
// NOTE(review): body is empty in this copy — implementation elided; restore
// from upstream before use.
static inline void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {}

// Stage 6 of the 16-point inverse DCT, in place on x (per its name/signature).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {}

// Final (stage 7) step of the 16-point inverse DCT: presumably writes results
// from x1 into output. NOTE(review): body is empty in this copy — elided.
static inline void idct16_stage7_avx2(__m256i *output, __m256i *x1) {}

// Full 16-point inverse DCT (per its name); 1D functions here process 16
// pixels at a time (see file comment at the w16 table).
// NOTE(review): body is empty in this copy — implementation elided.
static void idct16_avx2(const __m256i *input, __m256i *output) {}

// idct16 fast path — "low8" presumably means only the first 8 input
// coefficients are nonzero (confirm). NOTE(review): body empty in this copy.
static void idct16_low8_avx2(const __m256i *input, __m256i *output) {}

// idct16 fast path — "low1" presumably means only the DC coefficient is
// nonzero (confirm). NOTE(review): body empty in this copy — elided.
static void idct16_low1_avx2(const __m256i *input, __m256i *output) {}

// Stage 3 of the 16-point inverse ADST, in place on x (per its name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void iadst16_stage3_avx2(__m256i *x) {}

// Stage 4 of the 16-point inverse ADST, in place on x (per its name/signature).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {}

// Stage 5 of the 16-point inverse ADST, in place on x (per its name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void iadst16_stage5_avx2(__m256i *x) {}

// Stage 6 of the 16-point inverse ADST, in place on x (per its name/signature).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {}

// Stage 7 of the 16-point inverse ADST, in place on x (per its name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void iadst16_stage7_avx2(__m256i *x) {}

// Stage 8 of the 16-point inverse ADST, in place on x1 (per name/signature).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {}

// Final (stage 9) step of the 16-point inverse ADST: presumably writes results
// from x1 into output. NOTE(review): body is empty in this copy — elided.
static inline void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {}

// Full 16-point inverse ADST (per its name).
// NOTE(review): body is empty in this copy — implementation elided.
static void iadst16_avx2(const __m256i *input, __m256i *output) {}

// iadst16 fast path — "low8" presumably means only the first 8 input
// coefficients are nonzero (confirm). NOTE(review): body empty in this copy.
static void iadst16_low8_avx2(const __m256i *input, __m256i *output) {}

// iadst16 fast path — "low1" presumably means only the first coefficient is
// nonzero (confirm). NOTE(review): body empty in this copy — elided.
static void iadst16_low1_avx2(const __m256i *input, __m256i *output) {}

// Stage 3 of the 32-point inverse DCT for the high 16 lanes of x (per name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct32_high16_stage3_avx2(__m256i *x) {}

// Stage 4 of the 32-point inverse DCT for the high 16 lanes of x (per name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {}

// Stage 5 of the 32-point inverse DCT for the high 24 lanes of x (per name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {}

// Stage 6 of the 32-point inverse DCT for the high 28 lanes of x (per name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {}

// Stage 7 of the 32-point inverse DCT, in place on x (per its name/signature).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {}

// Stage 8 of the 32-point inverse DCT, in place on x (per its name/signature).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {}

// Final (stage 9) step of the 32-point inverse DCT: presumably writes results
// from x into output. NOTE(review): body is empty in this copy — elided.
static inline void idct32_stage9_avx2(__m256i *output, __m256i *x) {}

// idct32 fast path — "low1" presumably means only the DC coefficient is
// nonzero (confirm). NOTE(review): body empty in this copy — elided.
static void idct32_low1_avx2(const __m256i *input, __m256i *output) {}

// idct32 fast path — "low8" presumably means only the first 8 input
// coefficients are nonzero (confirm). NOTE(review): body empty in this copy.
static void idct32_low8_avx2(const __m256i *input, __m256i *output) {}

// idct32 fast path — "low16" presumably means only the first 16 input
// coefficients are nonzero (confirm). NOTE(review): body empty in this copy.
static void idct32_low16_avx2(const __m256i *input, __m256i *output) {}

// Full 32-point inverse DCT (per its name).
// NOTE(review): body is empty in this copy — implementation elided.
static void idct32_avx2(const __m256i *input, __m256i *output) {}

// Stage 4 of the 64-point inverse DCT for the high 32 lanes of x (per name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {}

// Stage 5 of the 64-point inverse DCT for the high 48 lanes of x (per name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {}

// Stage 6 of the 64-point inverse DCT for the high 32 lanes of x (per name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {}

// Stage 6 of the 64-point inverse DCT for the high 48 lanes of x (per name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {}

// Stage 7 of the 64-point inverse DCT for the high 48 lanes of x (per name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {}

// Stage 8 of the 64-point inverse DCT for the high 48 lanes of x (per name).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {}

// Stage 9 of the 64-point inverse DCT, in place on x (per its name/signature).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {}

// Stage 10 of the 64-point inverse DCT, in place on x (per its name/signature).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {}

// Final (stage 11) step of the 64-point inverse DCT: presumably writes results
// from x into output. NOTE(review): body is empty in this copy — elided.
static inline void idct64_stage11_avx2(__m256i *output, __m256i *x) {}

// idct64 fast path — "low1" presumably means only the DC coefficient is
// nonzero (confirm). NOTE(review): body empty in this copy — elided.
static void idct64_low1_avx2(const __m256i *input, __m256i *output) {}

// idct64 fast path — "low8" presumably means only the first 8 input
// coefficients are nonzero (confirm). NOTE(review): body empty in this copy.
static void idct64_low8_avx2(const __m256i *input, __m256i *output) {}

// idct64 fast path — "low16" presumably means only the first 16 input
// coefficients are nonzero (confirm). NOTE(review): body empty in this copy.
static void idct64_low16_avx2(const __m256i *input, __m256i *output) {}

// idct64 fast path — "low32" presumably means only the first 32 input
// coefficients are nonzero (confirm). NOTE(review): body empty in this copy.
static void idct64_low32_avx2(const __m256i *input, __m256i *output) {}

// Function-pointer type for the 1-D inverse-transform kernels above (matches
// the (const __m256i *, __m256i *) signature of idct16_avx2 etc., and the
// element type of lowbd_txfm_all_1d_zeros_w16_arr below).
// NOTE(review): this line was a bare "transform_1d_avx2;" in this copy — the
// typedef itself had been stripped; restored from the kernel signatures.
typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output);

// 1D functions process 16 pixels at one time.
static const transform_1d_avx2
    lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] =;

// only process w >= 16 h >= 16
// 2-D inverse transform + add for non-identity tx types; only handles
// w >= 16 and h >= 16 (see comment above).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void lowbd_inv_txfm2d_add_no_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {}

// Row pass of the inverse identity transform on a 16-wide strip (per name);
// presumably scales rows per txw_idx/rect_type and shifts — confirm.
// NOTE(review): body is empty in this copy — implementation elided.
static inline void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
                                           int stride, int shift, int height,
                                           int txw_idx, int rect_type) {}

// Column pass of the inverse identity transform on a 16-wide strip; presumably
// adds the result into the 8-bit output buffer — confirm.
// NOTE(review): body is empty in this copy — implementation elided.
static inline void iidentity_col_16xn_avx2(uint8_t *output, int stride,
                                           __m256i *buf, int shift, int height,
                                           int txh_idx) {}

// 2-D inverse transform + add for identity-in-both-directions (IDTX) types.
// NOTE(review): body is empty in this copy — implementation elided.
static inline void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
                                                  uint8_t *output, int stride,
                                                  TX_SIZE tx_size,
                                                  int32_t eob) {}

// 2-D inverse transform + add for tx types with identity in the horizontal
// direction (per name). NOTE(review): body is empty in this copy — elided.
static inline void lowbd_inv_txfm2d_add_h_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {}

// 2-D inverse transform + add for tx types with identity in the vertical
// direction (per name). NOTE(review): body is empty in this copy — elided.
static inline void lowbd_inv_txfm2d_add_v_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {}

static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] =;

// Loads 32-bit coefficients from `in` (with the given stride) into __m128i
// lanes in `out`. NOTE(review): body is empty in this copy — elided.
static inline void load_buffer_avx2(const int32_t *in, int stride,
                                    __m128i *out) {}

// Round-shifts `in` by `bit` and transposes into `out`; lr_flip is an output
// flag (presumably left/right flip for flipped tx types — confirm).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void round_and_transpose_avx2(const __m128i *const in,
                                            __m128i *const out, int bit,
                                            int *lr_flip) {}

// Round-shifts `in` by `bit` and writes/adds into the 8-bit output buffer,
// honoring the vertical-flip flag `flipud` (per name/signature).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit,
                                                       uint8_t *output,
                                                       int stride, int flipud) {}

// AVX2 implementation has the advantage when combined multiple operations
// together.
// 8x8 2-D inverse transform + add for non-identity tx types (AVX2 combines
// multiple operations — see comment above).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void lowbd_inv_txfm2d_8x8_no_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {}

// AVX2 implementation of 8x8 inverse transform. Observed that coding AVX2 for
// tx_type with identity in either of the direction has no advantage.
// 8x8 entry point; identity-direction tx types gain nothing from AVX2 here
// (see comment above), so presumably they are dispatched elsewhere — confirm.
// NOTE(review): body is empty in this copy — implementation elided.
static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output,
                                          int stride, TX_TYPE tx_type,
                                          TX_SIZE tx_size, int eob) {}

// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64
// Dispatcher for the large sizes listed in the comment above
// (32x32, 32x64, 64x32, 64x64, 16x32, 32x16, 64x16, 16x64).
// NOTE(review): body is empty in this copy — implementation elided.
static inline void lowbd_inv_txfm2d_add_universe_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {}

// Public RTCD entry: 2-D inverse transform of `input`, added into the 8-bit
// `output` buffer (low bit-depth path).
// NOTE(review): body is empty in this copy — implementation elided.
void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
                                   int eob) {}

// Public RTCD entry: inverse transform of dequantized coefficients added into
// `dst`, with tx parameters taken from `txfm_param`.
// NOTE(review): body is empty in this copy — implementation elided.
void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                           const TxfmParam *txfm_param) {}