dec_sse2.c | Explore in Territory

// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of some decoding functions (idct, loop filtering).
//
// Author: [email protected] (Somnath Banerjee)
//         [email protected] (Christian Duvivier)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_SSE2)

// The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
// one it seems => disable it by default. Uncomment the following to enable:
#if !defined(USE_TRANSFORM_AC3)
#define USE_TRANSFORM_AC3 …
#endif

#include <emmintrin.h>
#include "src/dsp/common_sse2.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/utils.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { … }

#if (USE_TRANSFORM_AC3 == 1)

// Sparse inverse transform that reads only in[0], in[1] and in[4], and adds
// the resulting 4x4 residual to the pixels at 'dst' (rows are BPS bytes
// apart). Results are saturated to [0, 255] before being written back.
static void TransformAC3(const int16_t* in, uint8_t* dst) {
  // Contributions derived from in[1]: they vary along a row (per column).
  const int h_mul2 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
  const int h_mul1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
  // Contributions derived from in[4]: constant along a row, vary per row.
  const __m128i v_mul2 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
  const __m128i v_mul1 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
  // in[0] plus 4, the rounding bias for the '>> 3' applied further down.
  const __m128i dc = _mm_set1_epi16(in[0] + 4);
  // Only the four low 16b lanes carry pixels; the upper four stay at 'dc'.
  const __m128i horiz =
      _mm_set_epi16(0, 0, 0, 0, -h_mul1, -h_mul2, h_mul2, h_mul1);
  const __m128i base = _mm_adds_epi16(dc, horiz);
  const __m128i zero = _mm_setzero_si128();
  __m128i rows[4];
  int y;

  // Per-row residual before scaling: +mul1, +mul2, -mul2, -mul1.
  rows[0] = _mm_adds_epi16(base, v_mul1);
  rows[1] = _mm_adds_epi16(base, v_mul2);
  rows[2] = _mm_subs_epi16(base, v_mul2);
  rows[3] = _mm_subs_epi16(base, v_mul1);

  // Load each row of 4 pixels, widen to 16b, add the scaled residual and
  // pack back to 8b with unsigned saturation.
  for (y = 0; y < 4; ++y) {
    const __m128i px8 = _mm_cvtsi32_si128(WebPMemToInt32(dst + y * BPS));
    const __m128i px16 = _mm_unpacklo_epi8(px8, zero);
    const __m128i sum = _mm_adds_epi16(px16, _mm_srai_epi16(rows[y], 3));
    rows[y] = _mm_packus_epi16(sum, sum);
  }

  // Write the results back only once every row has been read.
  for (y = 0; y < 4; ++y) {
    WebPInt32ToMem(dst + y * BPS, _mm_cvtsi128_si32(rows[y]));
  }
}

#endif   // USE_TRANSFORM_AC3

//------------------------------------------------------------------------------
// Loop Filter (Paragraph 15)

// Compute abs(p - q) = subs(p - q) OR subs(q - p)
#define MM_ABS(p, q) …

// Shift each byte of "x" by 3 bits while preserving the sign bit.
static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) { … }

#define FLIP_SIGN_BIT2(a, b) …

#define FLIP_SIGN_BIT4(a, b, c, d) …

// input/output is uint8_t
static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
                                       const __m128i* const p0,
                                       const __m128i* const q0,
                                       const __m128i* const q1,
                                       int hev_thresh, __m128i* const not_hev) { … }

// input pixels are int8_t
static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1,
                                          const __m128i* const p0,
                                          const __m128i* const q0,
                                          const __m128i* const q1,
                                          __m128i* const delta) { … }

// input and output are int8_t
static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0,
                                            __m128i* const q0,
                                            const __m128i* const fl) { … }

// Updates values of 2 pixels at MB edge during complex filtering.
// Update operations:
// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
                                           const __m128i* const a0_lo,
                                           const __m128i* const a0_hi) { … }

// input pixels are uint8_t
static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
                                         const __m128i* const p0,
                                         const __m128i* const q0,
                                         const __m128i* const q1,
                                         int thresh, __m128i* const mask) { … }

//------------------------------------------------------------------------------
// Edge filtering functions

// Applies filter on 2 pixels (p0 and q0)
static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
                                       __m128i* const q0, __m128i* const q1,
                                       int thresh) { … }

// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
                                       __m128i* const q0, __m128i* const q1,
                                       const __m128i* const mask,
                                       int hev_thresh) { … }

// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
                                       __m128i* const p0, __m128i* const q0,
                                       __m128i* const q1, __m128i* const q2,
                                       const __m128i* const mask,
                                       int hev_thresh) { … }

// reads 8 rows across a vertical edge.
static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
                                     __m128i* const p, __m128i* const q) { … }

static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0,
                                      const uint8_t* const r8,
                                      int stride,
                                      __m128i* const p1, __m128i* const p0,
                                      __m128i* const q0, __m128i* const q1) { … }

static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
                                      uint8_t* dst, int stride) { … }

// Transpose back and store
static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1,
                                       const __m128i* const p0,
                                       const __m128i* const q0,
                                       const __m128i* const q1,
                                       uint8_t* r0, uint8_t* r8,
                                       int stride) { … }

//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

static void SimpleVFilter16_SSE2(uint8_t* p, int stride, int thresh) { … }

static void SimpleHFilter16_SSE2(uint8_t* p, int stride, int thresh) { … }

static void SimpleVFilter16i_SSE2(uint8_t* p, int stride, int thresh) { … }

static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) { … }

//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)

#define MAX_DIFF1(p3, p2, p1, p0, m) …

#define MAX_DIFF2(p3, p2, p1, p0, m) …

#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) …

#define LOADUV_H_EDGE(p, u, v, stride) …

#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) …

#define STOREUV(p, u, v, stride) …

static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1,
                                         const __m128i* const p0,
                                         const __m128i* const q0,
                                         const __m128i* const q1,
                                         int thresh, int ithresh,
                                         __m128i* const mask) { … }

// on macroblock edges
static void VFilter16_SSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) { … }

static void HFilter16_SSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) { … }

// on three inner edges
static void VFilter16i_SSE2(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) { … }

static void HFilter16i_SSE2(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) { … }

// 8-pixels wide variant, for chroma filtering
static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) { … }

static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) { … }

static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) { … }

static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) { … }

//------------------------------------------------------------------------------
// 4x4 predictions

#define DST …
#define AVG3 …

// We use the following 8b-arithmetic tricks:
//     (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
//   where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
// and:
//     (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
//   where: AB = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
//   and ab = a ^ b, bc = b ^ c, lsb = (AB^BC)&1

static void VE4_SSE2(uint8_t* dst) { … }

static void LD4_SSE2(uint8_t* dst) { … }

static void VR4_SSE2(uint8_t* dst) { … }

static void VL4_SSE2(uint8_t* dst) { … }

static void RD4_SSE2(uint8_t* dst) { … }

#undef DST
#undef AVG3

//------------------------------------------------------------------------------
// Luma 16x16

static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) { … }

static void TM4_SSE2(uint8_t* dst)   { … }
static void TM8uv_SSE2(uint8_t* dst) { … }
static void TM16_SSE2(uint8_t* dst)  { … }

static void VE16_SSE2(uint8_t* dst) { … }

static void HE16_SSE2(uint8_t* dst) { … }

static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) { … }

static void DC16_SSE2(uint8_t* dst) { … }

static void DC16NoTop_SSE2(uint8_t* dst) { … }

static void DC16NoLeft_SSE2(uint8_t* dst) { … }

static void DC16NoTopLeft_SSE2(uint8_t* dst) { … }

//------------------------------------------------------------------------------
// Chroma

static void VE8uv_SSE2(uint8_t* dst) { … }

// helper for chroma-DC predictions
static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { … }

static void DC8uv_SSE2(uint8_t* dst) { … }

static void DC8uvNoLeft_SSE2(uint8_t* dst) { … }

static void DC8uvNoTop_SSE2(uint8_t* dst) { … }

static void DC8uvNoTopLeft_SSE2(uint8_t* dst) { … }

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) { … }

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8DspInitSSE2)

#endif  // WEBP_USE_SSE2
godot/thirdparty/libwebp/src/dsp/dec_sse2.c