highbd_inv_txfm_avx2.c | Explore in Territory

/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/idct.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "aom_dsp/x86/txfm_common_avx2.h"

// Note:
//  Total 32x4 registers to represent 32x32 block coefficients.
//  For high bit depth, each coefficient is 4-byte.
//  Each __m256i register holds 8 coefficients.
//  So each "row" we needs 4 register. Totally 32 rows
//  Register layout:
//   v0,   v1,   v2,   v3,
//   v4,   v5,   v6,   v7,
//   ... ...
//   v124, v125, v126, v127

static inline __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { … }

static inline void round_shift_4x4_avx2(__m256i *in, int shift) { … }

static inline void round_shift_8x8_avx2(__m256i *in, int shift) { … }

static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out,
                                    const __m256i *clamp_lo,
                                    const __m256i *clamp_hi, int size) { … }

static inline __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
                                                 __m256i res0, __m256i res1,
                                                 const int bd) { … }

static inline void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
                                                 int stride, int flipud,
                                                 int height, const int bd) { … }
static inline __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
                                                const int bd) { … }

static inline void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output,
                                                int stride, int flipud,
                                                int height, const int bd) { … }
static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
                           __m256i *out1, const __m256i *clamp_lo,
                           const __m256i *clamp_hi, int shift) { … }

static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { … }

static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) { … }

static inline void load_buffer_32bit_input(const int32_t *in, int stride,
                                           __m256i *out, int out_size) { … }

static inline __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
                                      const __m256i *rounding, int bit) { … }

static inline __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
                                    const __m256i *w1, const __m256i *n1,
                                    const __m256i *rounding, int bit) { … }

static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
                        __m256i *out1, const __m256i *clamp_lo,
                        const __m256i *clamp_hi) { … }

static inline void idct32_stage4_avx2(
    __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
    const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
    const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
    const __m256i *rounding, int bit) { … }

static inline void idct32_stage5_avx2(
    __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
    const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
    const __m256i *clamp_hi, const __m256i *rounding, int bit) { … }

static inline void idct32_stage6_avx2(
    __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
    const __m256i *rounding, int bit) { … }

static inline void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
                                      const __m256i *cospi32,
                                      const __m256i *clamp_lo,
                                      const __m256i *clamp_hi,
                                      const __m256i *rounding, int bit) { … }

static inline void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
                                      const __m256i *cospi32,
                                      const __m256i *clamp_lo,
                                      const __m256i *clamp_hi,
                                      const __m256i *rounding, int bit) { … }

static inline void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
                                      const int do_cols, const int bd,
                                      const int out_shift,
                                      const __m256i *clamp_lo,
                                      const __m256i *clamp_hi) { … }

static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) { … }

static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) { … }

static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) { … }

static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
                        int out_shift) { … }
static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) { … }

static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) { … }

static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
                        int out_shift) { … }

static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) { … }

static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) { … }

static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                         int bd, int out_shift) { … }
static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) { … }
static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                         int bd, int out_shift) { … }
static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                               int bd, int out_shift) { … }

static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                          int bd, int out_shift) { … }
static inline void idct64_stage8_avx2(
    __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
    const __m256i *rnding, int bit) { … }

static inline void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
                                      const __m256i *cospi32,
                                      const __m256i *clamp_lo,
                                      const __m256i *clamp_hi,
                                      const __m256i *rnding, int bit) { … }

static inline void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
                                       const __m256i *cospi32,
                                       const __m256i *clamp_lo,
                                       const __m256i *clamp_hi,
                                       const __m256i *rnding, int bit) { … }

static inline void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
                                       int bd, int out_shift,
                                       const __m256i *clamp_lo,
                                       const __m256i *clamp_hi) { … }

static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) { … }
static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) { … }
static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) { … }
static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
                        int out_shift) { … }
transform_1d_avx2;

static const transform_1d_avx2
    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = …;

static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
                                                   uint16_t *output, int stride,
                                                   TX_TYPE tx_type,
                                                   TX_SIZE tx_size, int eob,
                                                   const int bd) { … }

static void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
                                                    uint8_t *output, int stride,
                                                    TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) { … }
void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
                                  int stride, const TxfmParam *txfm_param) { … }
chromium/third_party/libaom/source/libaom/av1/common/x86/highbd_inv_txfm_avx2.c