/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/restoration.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"

// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to
// 32-bit precision and return them in an AVX2 register.
static __m256i yy256_load_extend_8_32(const void *p) { … }

// Load 8 halfwords from the possibly-misaligned pointer p, extend each
// halfword to 32-bit precision and return them in an AVX2 register.
static __m256i yy256_load_extend_16_32(const void *p) { … }

// Compute the scan of an AVX2 register holding 8 32-bit integers. If the
// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ...,
// x0+x1+...+x7.
//
// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers
// (assumed small enough to be able to add them without overflow).
//
// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a.
//
// x   = [h g f e][d c b a]
// x01 = [g f e 0][c b a 0]
// x02 = [g+h f+g e+f e][c+d b+c a+b a]
// x03 = [e+f e 0 0][a+b a 0 0]
// x04 = [e->h e->g e->f e][a->d a->c a->b a]
// s   = a->d
// s01 = [a->d a->d a->d a->d]
// s02 = [a->d a->d a->d a->d][0 0 0 0]
// ret = [a->h a->g a->f a->e][a->d a->c a->b a]
//
// (An illustrative sketch of these steps appears after calc_ab below.)
static __m256i scan_32(__m256i x) { … }

// Compute two integral images from src. B sums elements; A sums their
// squares. The images are offset by one pixel, so will have width and height
// equal to width + 1, height + 1 and the first row and column will be zero.
//
// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple
// of 8.
static void *memset_zero_avx(int32_t *dest, const __m256i *zero,
                             size_t count) { … }

static void integral_images(const uint8_t *src, int src_stride, int width,
                            int height, int32_t *A, int32_t *B,
                            int buf_stride) { … }

// Compute two integral images from src. B sums elements; A sums their
// squares.
//
// A and B should be aligned to 32 bytes. buf_stride should be a multiple
// of 8.
static void integral_images_highbd(const uint16_t *src, int src_stride,
                                   int width, int height, int32_t *A,
                                   int32_t *B, int buf_stride) { … }

// Compute 8 values of boxsum from the given integral image. ii should point
// at the middle of the box (for the first value). r is the box radius.
static inline __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) { … }

static __m256i round_for_shift(unsigned shift) { … }

static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) { … }

// Assumes that C, D are integral images for the original buffer which has
// been extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ
// pixels on the sides. A, B, C, D point at logical position (0, 0).
static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
                    int width, int height, int buf_stride, int bit_depth,
                    int sgr_params_idx, int radius_idx) { … }
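// For illustration only: a minimal sketch of the lane-wise prefix scan that
// the scan_32 comment above describes, written with standard AVX2 intrinsics.
// The name scan_32_sketch is hypothetical and the body is not necessarily
// identical to the elided scan_32 implementation; it simply follows the
// x01..x04 and s..s02 steps spelled out in that comment.
static inline __m256i scan_32_sketch(__m256i x) {
  // Per-lane partial sums: shift within each 128-bit lane by one and then two
  // 32-bit elements, adding as we go (x01..x04 in the comment above).
  const __m256i x01 = _mm256_slli_si256(x, 4);    // [g f e 0][c b a 0]
  const __m256i x02 = _mm256_add_epi32(x, x01);   // [g+h f+g e+f e][c+d b+c a+b a]
  const __m256i x03 = _mm256_slli_si256(x02, 8);  // [e+f e 0 0][a+b a 0 0]
  const __m256i x04 = _mm256_add_epi32(x02, x03); // [e->h e->g e->f e][a->d a->c a->b a]
  // Broadcast the low lane's total (a->d) into the high lane only, then add
  // it so the high lane becomes a cumulative sum over all eight elements.
  const int32_t s = _mm256_extract_epi32(x04, 3);
  const __m128i s01 = _mm_set1_epi32(s);
  const __m256i s02 = _mm256_inserti128_si256(_mm256_setzero_si256(), s01, 1);
  return _mm256_add_epi32(x04, s02);
}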
// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
// where the outer four corners have weight 3 and all other pixels have weight
// 4.
//
// Pixels are indexed as follows:
// xtl xt  xtr
// xl  x   xr
// xbl xb  xbr
//
// buf points to x
//
// fours = xl + xt + xr + xb + x
// threes = xtl + xtr + xbr + xbl
// cross_sum = 4 * fours + 3 * threes
//           = 4 * (fours + threes) - threes
//           = (fours + threes) << 2 - threes
static inline __m256i cross_sum(const int32_t *buf, int stride) { … }

// The final filter for self-guided restoration. Computes a weighted average
// across A, B with "cross sums" (see cross_sum implementation above).
static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
                         const int32_t *B, int buf_stride, const void *dgd8,
                         int dgd_stride, int width, int height,
                         int highbd) { … }

// Assumes that C, D are integral images for the original buffer which has
// been extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ
// pixels on the sides. A, B, C, D point at logical position (0, 0).
static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
                         const int32_t *D, int width, int height,
                         int buf_stride, int bit_depth, int sgr_params_idx,
                         int radius_idx) { … }

// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xtl xt  xtr
//  -  buf  -
// xbl xb  xbr
//
// Pixels are weighted like this:
//  5   6   5
//  0   0   0
//  5   6   5
//
// fives = xtl + xtr + xbl + xbr
// sixes = xt + xb
// cross_sum = 6 * sixes + 5 * fives
//           = 5 * (fives + sixes) + sixes
//           = 4 * (fives + sixes) + (fives + sixes) + sixes
//           = (fives + sixes) << 2 + (fives + sixes) + sixes
static inline __m256i cross_sum_fast_even_row(const int32_t *buf,
                                              int stride) { … }

// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xl x xr
//
// Pixels are weighted like this:
// 5 6 5
//
// buf points to x
//
// fives = xl + xr
// sixes = x
// cross_sum = 5 * fives + 6 * sixes
//           = 4 * (fives + sixes) + (fives + sixes) + sixes
//           = (fives + sixes) << 2 + (fives + sixes) + sixes
//
// (An illustrative sketch of this computation appears at the end of this
// file.)
static inline __m256i cross_sum_fast_odd_row(const int32_t *buf) { … }

// The final filter for the self-guided restoration. Computes a
// weighted average across A, B with "cross sums" (see cross_sum_...
// implementations above).
static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
                              const int32_t *B, int buf_stride,
                              const void *dgd8, int dgd_stride, int width,
                              int height, int highbd) { … }

int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
                                    int dgd_stride, int32_t *flt0,
                                    int32_t *flt1, int flt_stride,
                                    int sgr_params_idx, int bit_depth,
                                    int highbd) { … }

int av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
                                          int height, int stride, int eps,
                                          const int *xqd, uint8_t *dst8,
                                          int dst_stride, int32_t *tmpbuf,
                                          int bit_depth, int highbd) { … }
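// For illustration only: a minimal sketch of the odd-row cross sum described
// above, using plain unaligned AVX2 loads. The name
// cross_sum_fast_odd_row_sketch is hypothetical and the body is not
// necessarily identical to the elided implementation; it just evaluates
// 5 * fives + 6 * sixes via (fives + sixes) << 2 + (fives + sixes) + sixes.
static inline __m256i cross_sum_fast_odd_row_sketch(const int32_t *buf) {
  // buf points at x; xl and xr are the same row shifted by one element.
  const __m256i xl = _mm256_loadu_si256((const __m256i *)(buf - 1));
  const __m256i x = _mm256_loadu_si256((const __m256i *)(buf));
  const __m256i xr = _mm256_loadu_si256((const __m256i *)(buf + 1));

  const __m256i fives = _mm256_add_epi32(xl, xr);
  const __m256i sixes = x;
  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);

  // (fives + sixes) << 2 + (fives + sixes) + sixes == 5 * fives + 6 * sixes
  return _mm256_add_epi32(
      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
                       fives_plus_sixes),
      sixes);
}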