/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/restoration.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"

// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to
// 32-bit precision and return them in an AVX2 register.
static __m256i yy256_load_extend_8_32(const void *p) { … }

// Load 8 halfwords from the possibly-misaligned pointer p, extend each
// halfword to 32-bit precision and return them in an AVX2 register.
static __m256i yy256_load_extend_16_32(const void *p) { … }

// Compute the scan of an AVX2 register holding 8 32-bit integers. If the
// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ...,
// x0+x1+...+x7.
//
// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers
// (assumed small enough to be able to add them without overflow).
//
// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a.
//
// x   = [h g f e][d c b a]
// x01 = [g f e 0][c b a 0]
// x02 = [g+h f+g e+f e][c+d b+c a+b a]
// x03 = [e+f e 0 0][a+b a 0 0]
// x04 = [e->h e->g e->f e][a->d a->c a->b a]
// s   = a->d
// s01 = [a->d a->d a->d a->d]
// s02 = [a->d a->d a->d a->d][0 0 0 0]
// ret = [a->h a->g a->f a->e][a->d a->c a->b a]
//
// (An illustrative sketch of these steps appears after calc_ab below.)
static __m256i scan_32(__m256i x) { … }

// Compute two integral images from src. B sums elements; A sums their
// squares. The images are offset by one pixel, so will have width and height
// equal to width + 1, height + 1 and the first row and column will be zero.
//
// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple
// of 8.
static void *memset_zero_avx(int32_t *dest, const __m256i *zero,
                             size_t count) { … }

static void integral_images(const uint8_t *src, int src_stride, int width,
                            int height, int32_t *A, int32_t *B,
                            int buf_stride) { … }

// Compute two integral images from src. B sums elements; A sums their
// squares.
//
// A and B should be aligned to 32 bytes. buf_stride should be a multiple
// of 8.
static void integral_images_highbd(const uint16_t *src, int src_stride,
                                   int width, int height, int32_t *A,
                                   int32_t *B, int buf_stride) { … }

// Compute 8 values of boxsum from the given integral image. ii should point
// at the middle of the box (for the first value). r is the box radius.
static inline __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) { … }

static __m256i round_for_shift(unsigned shift) { … }

static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) { … }

// Assumes that C, D are integral images for the original buffer which has
// been extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ
// pixels on the sides. A, B, C, D point at logical position (0, 0).
static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
                    int width, int height, int buf_stride, int bit_depth,
                    int sgr_params_idx, int radius_idx) { … }
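// For illustration only: a minimal sketch of the lane-wise prefix scan that
// the scan_32 comment above describes, written with standard AVX2 intrinsics.
// The name scan_32_sketch is hypothetical and the body is not necessarily
// identical to the elided scan_32 implementation; it simply follows the
// x01..x04 and s..s02 steps spelled out in that comment.
static inline __m256i scan_32_sketch(__m256i x) {
  // Per-lane partial sums: shift within each 128-bit lane by one and then two
  // 32-bit elements, adding as we go (x01..x04 in the comment above).
  const __m256i x01 = _mm256_slli_si256(x, 4);    // [g f e 0][c b a 0]
  const __m256i x02 = _mm256_add_epi32(x, x01);   // [g+h f+g e+f e][c+d b+c a+b a]
  const __m256i x03 = _mm256_slli_si256(x02, 8);  // [e+f e 0 0][a+b a 0 0]
  const __m256i x04 = _mm256_add_epi32(x02, x03); // [e->h e->g e->f e][a->d a->c a->b a]
  // Broadcast the low lane's total (a->d) into the high lane only, then add
  // it so the high lane becomes a cumulative sum over all eight elements.
  const int32_t s = _mm256_extract_epi32(x04, 3);
  const __m128i s01 = _mm_set1_epi32(s);
  const __m256i s02 = _mm256_inserti128_si256(_mm256_setzero_si256(), s01, 1);
  return _mm256_add_epi32(x04, s02);
}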
// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
// where the outer four corners have weight 3 and all other pixels have weight
// 4.
//
// Pixels are indexed as follows:
// xtl xt  xtr
// xl  x   xr
// xbl xb  xbr
//
// buf points to x
//
// fours = xl + xt + xr + xb + x
// threes = xtl + xtr + xbr + xbl
// cross_sum = 4 * fours + 3 * threes
//           = 4 * (fours + threes) - threes
//           = (fours + threes) << 2 - threes
static inline __m256i cross_sum(const int32_t *buf, int stride) { … }

// The final filter for self-guided restoration. Computes a weighted average
// across A, B with "cross sums" (see cross_sum implementation above).
static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
                         const int32_t *B, int buf_stride, const void *dgd8,
                         int dgd_stride, int width, int height,
                         int highbd) { … }

// Assumes that C, D are integral images for the original buffer which has
// been extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ
// pixels on the sides. A, B, C, D point at logical position (0, 0).
static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
                         const int32_t *D, int width, int height,
                         int buf_stride, int bit_depth, int sgr_params_idx,
                         int radius_idx) { … }

// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xtl xt  xtr
//  -  buf  -
// xbl xb  xbr
//
// Pixels are weighted like this:
//  5   6   5
//  0   0   0
//  5   6   5
//
// fives = xtl + xtr + xbl + xbr
// sixes = xt + xb
// cross_sum = 6 * sixes + 5 * fives
//           = 5 * (fives + sixes) + sixes
//           = 4 * (fives + sixes) + (fives + sixes) + sixes
//           = (fives + sixes) << 2 + (fives + sixes) + sixes
static inline __m256i cross_sum_fast_even_row(const int32_t *buf,
                                              int stride) { … }

// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xl x xr
//
// Pixels are weighted like this:
// 5 6 5
//
// buf points to x
//
// fives = xl + xr
// sixes = x
// cross_sum = 5 * fives + 6 * sixes
//           = 4 * (fives + sixes) + (fives + sixes) + sixes
//           = (fives + sixes) << 2 + (fives + sixes) + sixes
//
// (An illustrative sketch of this computation appears at the end of this
// file.)
static inline __m256i cross_sum_fast_odd_row(const int32_t *buf) { … }

// The final filter for the self-guided restoration. Computes a
// weighted average across A, B with "cross sums" (see cross_sum_...
// implementations above).
static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
                              const int32_t *B, int buf_stride,
                              const void *dgd8, int dgd_stride, int width,
                              int height, int highbd) { … }

int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
                                    int dgd_stride, int32_t *flt0,
                                    int32_t *flt1, int flt_stride,
                                    int sgr_params_idx, int bit_depth,
                                    int highbd) { … }

int av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
                                          int height, int stride, int eps,
                                          const int *xqd, uint8_t *dst8,
                                          int dst_stride, int32_t *tmpbuf,
                                          int bit_depth, int highbd) { … }
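// For illustration only: a minimal sketch of the odd-row cross sum described
// above, using plain unaligned AVX2 loads. The name
// cross_sum_fast_odd_row_sketch is hypothetical and the body is not
// necessarily identical to the elided implementation; it just evaluates
// 5 * fives + 6 * sixes via (fives + sixes) << 2 + (fives + sixes) + sixes.
static inline __m256i cross_sum_fast_odd_row_sketch(const int32_t *buf) {
  // buf points at x; xl and xr are the same row shifted by one element.
  const __m256i xl = _mm256_loadu_si256((const __m256i *)(buf - 1));
  const __m256i x = _mm256_loadu_si256((const __m256i *)(buf));
  const __m256i xr = _mm256_loadu_si256((const __m256i *)(buf + 1));

  const __m256i fives = _mm256_add_epi32(xl, xr);
  const __m256i sixes = x;
  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);

  // (fives + sixes) << 2 + (fives + sixes) + sixes == 5 * fives + 6 * sixes
  return _mm256_add_epi32(
      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
                       fives_plus_sixes),
      sixes);
}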