warp_plane_sse4.c | Explore in Territory

/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
   * Each coefficient is stored in 8 bits instead of 16 bits
   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7

     This is done in order to avoid overflow: Since the tap with the largest
     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
     convolve functions.

     Instead, we use the summation order
     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
     The rearrangement of coefficients in this table is so that we can get the
     coefficients into the correct order more quickly.
*/
/* clang-format off */
// Table shape: (WARPEDPIXEL_PREC_SHIFTS * 3 + 1) rows of 8 int8_t taps each.
// Unlike the shuffle masks in this file it is not declared 'static'.
// NOTE(review): the first DECLARE_ALIGNED argument is presumably the byte
// alignment, so each 8-tap row can be fetched with one 64-bit load - confirm.
DECLARE_ALIGNED(8, const int8_t,
                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = …;
/* clang-format on */

// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
// in an SSE register into two sequences:
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
// 16-entry byte table, file-local.
// NOTE(review): presumably the selector for the "0, 2, 2, 4, ..." sequence
// described above; the initializer is not visible here - confirm.
DECLARE_ALIGNED(16, static const uint8_t,
                even_mask[16]) = …;

// 16-entry byte table, file-local.
// NOTE(review): presumably the selector for the "1, 3, 3, 5, ..." sequence
// from the shuffle-mask comment preceding even_mask; the initializer is not
// visible here - confirm.
DECLARE_ALIGNED(16, static const uint8_t,
                odd_mask[16]) = …;

// 16-entry byte table, file-local.
// NOTE(review): going by the name, presumably a byte-shuffle control used on
// the alpha == 0 horizontal path to replicate filter taps 0 and 1 across a
// register; the initializer is not visible here - confirm.
DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask01[16]) = …;

// 16-entry byte table, file-local.
// NOTE(review): going by the name, presumably a byte-shuffle control used on
// the alpha == 0 horizontal path to replicate filter taps 2 and 3 across a
// register; the initializer is not visible here - confirm.
DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask23[16]) = …;

// 16-entry byte table, file-local.
// NOTE(review): going by the name, presumably a byte-shuffle control used on
// the alpha == 0 horizontal path to replicate filter taps 4 and 5 across a
// register; the initializer is not visible here - confirm.
DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask45[16]) = …;

// 16-entry byte table, file-local.
// NOTE(review): going by the name, presumably a byte-shuffle control used on
// the alpha == 0 horizontal path to replicate filter taps 6 and 7 across a
// register; the initializer is not visible here - confirm.
DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask67[16]) = …;

// 16-entry byte table, file-local.
// NOTE(review): going by the name, presumably the first of four byte-shuffle
// controls used on the gamma == 0 vertical path to replicate part of a single
// filter across a register; the initializer is not visible here - confirm.
DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask0[16]) = …;

// 16-entry byte table, file-local.
// NOTE(review): going by the name, presumably the second of four byte-shuffle
// controls used on the gamma == 0 vertical path to replicate part of a single
// filter across a register; the initializer is not visible here - confirm.
DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask1[16]) = …;

// 16-entry byte table, file-local.
// NOTE(review): going by the name, presumably the third of four byte-shuffle
// controls used on the gamma == 0 vertical path to replicate part of a single
// filter across a register; the initializer is not visible here - confirm.
DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask2[16]) = …;

// 16-entry byte table, file-local.
// NOTE(review): going by the name, presumably the fourth of four byte-shuffle
// controls used on the gamma == 0 vertical path to replicate part of a single
// filter across a register; the initializer is not visible here - confirm.
DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask3[16]) = …;

// Horizontal filtering step for one row of source pixels.
//   src                - 128-bit register holding the source bytes, by value.
//   tmp                - output array of __m128i intermediate rows.
//   coeff              - array of __m128i filter coefficients.
//   offset_bits_horiz,
//   reduce_bits_horiz  - rounding offset / shift amounts for this stage.
//   k                  - row index used to select the slot in tmp.
// NOTE(review): implementation not visible here. Presumably splits src with
// even_mask / odd_mask, multiply-accumulates against coeff, rounds, and writes
// one entry of tmp - confirm the exact tmp index and coeff array length.
static inline void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz, int k) { … }

// Builds the horizontal filter coefficients for one row (general alpha).
//   alpha - per-pixel horizontal step of the filter position.
//   sx    - filter position for the first pixel of the row.
//   coeff - output array of __m128i coefficient registers.
// NOTE(review): implementation not visible here. Presumably loads one
// av1_filter_8bit row per output pixel (positions sx, sx + alpha, ...) and
// interleaves them into coeff - confirm how many coeff entries are written.
static inline void prepare_horizontal_filter_coeff(int alpha, int sx,
                                                   __m128i *coeff) { … }

// Variant of the horizontal coefficient setup that takes no alpha argument.
//   sx    - filter position shared by every pixel of the row.
//   coeff - output array of __m128i coefficient registers.
// NOTE(review): implementation not visible here. Presumably loads a single
// av1_filter_8bit row and replicates it with the shuffle_alpha0_mask* tables,
// since with alpha == 0 every pixel uses the same filter - confirm.
static inline void prepare_horizontal_filter_coeff_alpha0(int sx,
                                                          __m128i *coeff) { … }

// Filters one source row horizontally for the general (alpha != 0) case.
//   src                - 128-bit register holding the source bytes.
//   tmp                - output array of intermediate rows.
//   sx, alpha          - starting filter position and its per-pixel step.
//   k                  - row index forwarded to the filtering step.
//   offset_bits_horiz,
//   reduce_bits_horiz  - rounding offset / shift amounts for this stage.
// NOTE(review): implementation not visible here. Presumably a thin wrapper
// that prepares coefficients from (alpha, sx) and then calls
// filter_src_pixels - confirm.
static inline void horizontal_filter(__m128i src, __m128i *tmp, int sx,
                                     int alpha, int k,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz) { … }

// Horizontal pass over the rows feeding one output block, general case.
//   ref, stride        - source plane (read-only) and its row stride.
//   tmp                - output array of intermediate rows.
//   ix4, iy4           - integer source position of the block.
//   sx4                - fractional horizontal filter position of the block.
//   alpha, beta        - filter-position steps (per pixel / per row).
//   p_height, i        - prediction height and the block's row offset.
//   height             - source plane height.
//   offset_bits_horiz,
//   reduce_bits_horiz  - rounding offset / shift amounts for this stage.
// NOTE(review): implementation not visible here. Presumably loops over the
// rows needed by the vertical filter, clamps the row index to
// [0, height - 1], loads 16 bytes per row and filters them - confirm the loop
// bounds and clamping.
static inline void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
                                          int stride, int32_t ix4, int32_t iy4,
                                          int32_t sx4, int alpha, int beta,
                                          int p_height, int height, int i,
                                          const int offset_bits_horiz,
                                          const int reduce_bits_horiz) { … }

// Horizontal pass specialised by name for alpha == 0. The parameter list is
// identical to warp_horizontal_filter so the variants are interchangeable at
// the call site.
// NOTE(review): implementation not visible here. Presumably alpha is unused
// and the per-row coefficients come from
// prepare_horizontal_filter_coeff_alpha0 - confirm.
static inline void warp_horizontal_filter_alpha0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) { … }

// Horizontal pass specialised by name for beta == 0. The parameter list is
// identical to warp_horizontal_filter so the variants are interchangeable at
// the call site.
// NOTE(review): implementation not visible here. Presumably beta is unused
// and, since the filter position does not change from row to row, the
// coefficients are prepared once outside the row loop - confirm.
static inline void warp_horizontal_filter_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) { … }

// Horizontal pass specialised by name for alpha == 0 and beta == 0. The
// parameter list is identical to warp_horizontal_filter so the variants are
// interchangeable at the call site.
// NOTE(review): implementation not visible here. Presumably both alpha and
// beta are unused and a single replicated filter is prepared once for all
// rows - confirm.
static inline void warp_horizontal_filter_alpha0_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) { … }

// Initialises the constants used when storing vertical-filter output.
//   conv_params       - convolution parameters (non-const pointer).
//   round_bits,
//   offset_bits       - rounding / offset bit counts used to derive constants.
//   res_sub_const,
//   round_bits_const,
//   wt                - outputs: three __m128i constants written by this call.
// NOTE(review): implementation not visible here. Presumably wt packs the
// compound-prediction weights read from conv_params - confirm which
// conv_params fields are read.
static inline void unpack_weights_and_set_round_const(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) { … }

// Builds the vertical filter coefficients for one output row (general gamma).
//   gamma  - per-pixel step of the vertical filter position.
//   sy     - vertical filter position for the first pixel of the row.
//   coeffs - output array of __m128i coefficient registers.
// NOTE(review): implementation not visible here. Presumably loads one filter
// per output column (positions sy, sy + gamma, ...) and transposes them into
// coeffs - confirm the source table and the number of entries written.
static inline void prepare_vertical_filter_coeffs(int gamma, int sy,
                                                  __m128i *coeffs) { … }

// Variant of the vertical coefficient setup that takes no gamma argument.
//   sy     - vertical filter position shared by every pixel of the row.
//   coeffs - output array of __m128i coefficient registers.
// NOTE(review): implementation not visible here. Presumably loads a single
// filter and replicates it with the shuffle_gamma0_mask* tables, since with
// gamma == 0 every column uses the same filter - confirm.
static inline void prepare_vertical_filter_coeffs_gamma0(int sy,
                                                         __m128i *coeffs) { … }

// Vertical filtering step for one output row.
//   tmp     - array of horizontally filtered intermediate rows.
//   coeffs  - array of vertical filter coefficient registers.
//   res_lo,
//   res_hi  - outputs: two __m128i result registers.
//   k       - row index selecting which tmp rows are combined.
// NOTE(review): implementation not visible here. Presumably res_lo / res_hi
// hold the 32-bit sums for the low and high four pixels respectively -
// confirm the tmp index range read for a given k.
static inline void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
                                              __m128i *res_lo, __m128i *res_hi,
                                              int k) { … }

// Rounds the vertical-filter sums for one row and stores them.
//   res_lo, res_hi     - filter sums for the row (pointers, non-const).
//   res_add_const,
//   wt, res_sub_const,
//   round_bits_const   - precomputed rounding / weighting constants.
//   pred, p_stride     - 8-bit destination plane and its row stride.
//   conv_params        - convolution parameters.
//   i, j, k            - block row offset, block column offset and row index
//                        within the block.
//   reduce_bits_vert,
//   round_bits         - shift amounts for the rounding stages.
//   p_width            - prediction width.
// NOTE(review): implementation not visible here. Presumably chooses between
// writing a 16-bit compound buffer reachable from conv_params and writing
// 8-bit pixels to pred, and stores 4 or 8 pixels depending on p_width -
// confirm.
static inline void store_vertical_filter_output(
    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
    const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) { … }

// Vertical pass for one output block, general case.
//   pred, p_stride     - destination plane and its row stride.
//   tmp                - intermediate rows from the horizontal pass.
//   conv_params        - convolution parameters.
//   gamma, delta       - vertical filter-position steps (per pixel / per row).
//   p_height, p_width  - prediction dimensions.
//   i, j               - row / column offset of the block.
//   sy4                - fractional vertical filter position of the block.
//   reduce_bits_vert,
//   round_bits,
//   offset_bits        - rounding parameters.
//   res_add_const      - precomputed additive rounding constant.
// NOTE(review): implementation not visible here. Presumably loops over the
// block's output rows, prepares coefficients per row, filters tmp and stores
// the result - confirm the loop bounds.
static inline void warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) { … }

// Vertical pass specialised by name for gamma == 0. The parameter list is
// identical to warp_vertical_filter so the variants are interchangeable at
// the call site.
// NOTE(review): implementation not visible here. Presumably gamma is unused
// and per-row coefficients come from prepare_vertical_filter_coeffs_gamma0 -
// confirm.
static inline void warp_vertical_filter_gamma0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) { … }

// Vertical pass specialised by name for delta == 0. The parameter list is
// identical to warp_vertical_filter so the variants are interchangeable at
// the call site.
// NOTE(review): implementation not visible here. Presumably delta is unused
// and, since the filter position does not change from row to row, the
// coefficients are prepared once outside the row loop - confirm.
static inline void warp_vertical_filter_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) { … }

// Vertical pass specialised by name for gamma == 0 and delta == 0. The
// parameter list is identical to warp_vertical_filter so the variants are
// interchangeable at the call site.
// NOTE(review): implementation not visible here. Presumably both gamma and
// delta are unused and a single replicated filter is prepared once for all
// rows - confirm.
static inline void warp_vertical_filter_gamma0_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) { … }

// Entry point for the vertical pass of one block. Takes the same parameter
// list as the four warp_vertical_filter* variants.
// NOTE(review): implementation not visible here. Presumably dispatches to
// one of those variants depending on whether gamma and/or delta are zero,
// forwarding every argument unchanged - confirm.
static inline void prepare_warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) { … }

// Entry point for the horizontal pass of one block. Takes the same parameter
// list as the four warp_horizontal_filter* variants.
// NOTE(review): implementation not visible here. Presumably dispatches to
// one of those variants depending on whether alpha and/or beta are zero,
// forwarding every argument unchanged - confirm.
static inline void prepare_warp_horizontal_filter(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) { … }

// SSE4.1 implementation of the affine warp for 8-bit pixels; the only
// externally visible function in this file.
//   mat                 - warp model parameters (read-only int32_t array).
//   ref, width, height,
//   stride              - source plane, its dimensions and row stride.
//   pred, p_stride      - destination plane and its row stride.
//   p_col, p_row        - position of the prediction block.
//   p_width, p_height   - size of the prediction block.
//   subsampling_x,
//   subsampling_y       - chroma subsampling flags for this plane.
//   conv_params         - convolution / compound-prediction parameters.
//   alpha, beta,
//   gamma, delta        - shear parameters that step the filter positions.
// NOTE(review): implementation not visible here. Presumably walks the
// prediction area in 8x8 blocks, handles source columns that fall outside the
// frame, and runs the horizontal pass followed by the vertical pass through
// the prepare_warp_*_filter helpers - confirm block size, edge handling and
// how many entries of mat are read.
void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
                            int height, int stride, uint8_t *pred, int p_col,
                            int p_row, int p_width, int p_height, int p_stride,
                            int subsampling_x, int subsampling_y,
                            ConvolveParams *conv_params, int16_t alpha,
                            int16_t beta, int16_t gamma, int16_t delta) { … }
chromium/third_party/libaom/source/libaom/av1/common/x86/warp_plane_sse4.c