#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include "src/dsp/neon.h"
#include "src/dec/vp8i_dec.h"
#if !defined(WORK_AROUND_GCC)
static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
int stride) {
const uint8x8_t zero = vdup_n_u8(0);
uint8x8x4_t out;
INIT_VECTOR4(out, zero, zero, zero, zero);
out = vld4_lane_u8(src + 0 * stride, out, 0);
out = vld4_lane_u8(src + 1 * stride, out, 1);
out = vld4_lane_u8(src + 2 * stride, out, 2);
out = vld4_lane_u8(src + 3 * stride, out, 3);
out = vld4_lane_u8(src + 4 * stride, out, 4);
out = vld4_lane_u8(src + 5 * stride, out, 5);
out = vld4_lane_u8(src + 6 * stride, out, 6);
out = vld4_lane_u8(src + 7 * stride, out, 7);
return out;
}
static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
uint8x16_t* const p1,
uint8x16_t* const p0,
uint8x16_t* const q0,
uint8x16_t* const q1) {
const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
*p1 = vcombine_u8(row0.val[0], row8.val[0]);
*p0 = vcombine_u8(row0.val[1], row8.val[1]);
*q0 = vcombine_u8(row0.val[2], row8.val[2]);
*q1 = vcombine_u8(row0.val[3], row8.val[3]);
}
#else
#define LOADQ_LANE_32b …
static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
uint8x16_t* const p1,
uint8x16_t* const p0,
uint8x16_t* const q0,
uint8x16_t* const q1) {
const uint32x4_t zero = vdupq_n_u32(0);
uint32x4x4_t in;
INIT_VECTOR4(in, zero, zero, zero, zero);
src -= 2;
LOADQ_LANE_32b(in.val[0], 0);
LOADQ_LANE_32b(in.val[1], 0);
LOADQ_LANE_32b(in.val[2], 0);
LOADQ_LANE_32b(in.val[3], 0);
LOADQ_LANE_32b(in.val[0], 1);
LOADQ_LANE_32b(in.val[1], 1);
LOADQ_LANE_32b(in.val[2], 1);
LOADQ_LANE_32b(in.val[3], 1);
LOADQ_LANE_32b(in.val[0], 2);
LOADQ_LANE_32b(in.val[1], 2);
LOADQ_LANE_32b(in.val[2], 2);
LOADQ_LANE_32b(in.val[3], 2);
LOADQ_LANE_32b(in.val[0], 3);
LOADQ_LANE_32b(in.val[1], 3);
LOADQ_LANE_32b(in.val[2], 3);
LOADQ_LANE_32b(in.val[3], 3);
{
const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
vreinterpretq_u8_u32(in.val[1]));
const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
vreinterpretq_u8_u32(in.val[3]));
const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
vreinterpretq_u16_u8(row23.val[0]));
const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
vreinterpretq_u16_u8(row23.val[1]));
*p1 = vreinterpretq_u8_u16(row02.val[0]);
*p0 = vreinterpretq_u8_u16(row13.val[0]);
*q0 = vreinterpretq_u8_u16(row02.val[1]);
*q1 = vreinterpretq_u8_u16(row13.val[1]);
}
}
#undef LOADQ_LANE_32b
#endif
static WEBP_INLINE void Load8x16_NEON(
const uint8_t* const src, int stride,
uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
uint8x16_t* const q2, uint8x16_t* const q3) {
Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
}
static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
uint8x16_t* const p1,
uint8x16_t* const p0,
uint8x16_t* const q0,
uint8x16_t* const q1) {
*p1 = vld1q_u8(src - 2 * stride);
*p0 = vld1q_u8(src - 1 * stride);
*q0 = vld1q_u8(src + 0 * stride);
*q1 = vld1q_u8(src + 1 * stride);
}
static WEBP_INLINE void Load16x8_NEON(
const uint8_t* const src, int stride,
uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
uint8x16_t* const q2, uint8x16_t* const q3) {
Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);
Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);
}
static WEBP_INLINE void Load8x8x2_NEON(
const uint8_t* const u, const uint8_t* const v, int stride,
uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
uint8x16_t* const q2, uint8x16_t* const q3) {
*p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
*p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
*p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
*p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
*q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
*q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
*q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
*q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
}
#if !defined(WORK_AROUND_GCC)
#define LOAD_UV_8 …
static WEBP_INLINE void Load8x8x2T_NEON(
const uint8_t* const u, const uint8_t* const v, int stride,
uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
uint8x16_t* const q2, uint8x16_t* const q3) {
const uint8x16_t row0 = LOAD_UV_8(0);
const uint8x16_t row1 = LOAD_UV_8(1);
const uint8x16_t row2 = LOAD_UV_8(2);
const uint8x16_t row3 = LOAD_UV_8(3);
const uint8x16_t row4 = LOAD_UV_8(4);
const uint8x16_t row5 = LOAD_UV_8(5);
const uint8x16_t row6 = LOAD_UV_8(6);
const uint8x16_t row7 = LOAD_UV_8(7);
const uint8x16x2_t row01 = vtrnq_u8(row0, row1);
const uint8x16x2_t row23 = vtrnq_u8(row2, row3);
const uint8x16x2_t row45 = vtrnq_u8(row4, row5);
const uint8x16x2_t row67 = vtrnq_u8(row6, row7);
const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
vreinterpretq_u16_u8(row23.val[0]));
const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
vreinterpretq_u16_u8(row23.val[1]));
const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
vreinterpretq_u16_u8(row67.val[0]));
const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
vreinterpretq_u16_u8(row67.val[1]));
const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
vreinterpretq_u32_u16(row46.val[0]));
const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
vreinterpretq_u32_u16(row46.val[1]));
const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
vreinterpretq_u32_u16(row57.val[0]));
const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
vreinterpretq_u32_u16(row57.val[1]));
*p3 = vreinterpretq_u8_u32(row04.val[0]);
*p2 = vreinterpretq_u8_u32(row15.val[0]);
*p1 = vreinterpretq_u8_u32(row26.val[0]);
*p0 = vreinterpretq_u8_u32(row37.val[0]);
*q0 = vreinterpretq_u8_u32(row04.val[1]);
*q1 = vreinterpretq_u8_u32(row15.val[1]);
*q2 = vreinterpretq_u8_u32(row26.val[1]);
*q3 = vreinterpretq_u8_u32(row37.val[1]);
}
#undef LOAD_UV_8
#endif
static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
uint8_t* const dst, int stride) {
vst2_lane_u8(dst + 0 * stride, v, 0);
vst2_lane_u8(dst + 1 * stride, v, 1);
vst2_lane_u8(dst + 2 * stride, v, 2);
vst2_lane_u8(dst + 3 * stride, v, 3);
vst2_lane_u8(dst + 4 * stride, v, 4);
vst2_lane_u8(dst + 5 * stride, v, 5);
vst2_lane_u8(dst + 6 * stride, v, 6);
vst2_lane_u8(dst + 7 * stride, v, 7);
}
static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
uint8_t* const dst, int stride) {
uint8x8x2_t lo, hi;
lo.val[0] = vget_low_u8(p0);
lo.val[1] = vget_low_u8(q0);
hi.val[0] = vget_high_u8(p0);
hi.val[1] = vget_high_u8(q0);
Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
}
#if !defined(WORK_AROUND_GCC)
static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
uint8_t* const dst, int stride) {
vst4_lane_u8(dst + 0 * stride, v, 0);
vst4_lane_u8(dst + 1 * stride, v, 1);
vst4_lane_u8(dst + 2 * stride, v, 2);
vst4_lane_u8(dst + 3 * stride, v, 3);
vst4_lane_u8(dst + 4 * stride, v, 4);
vst4_lane_u8(dst + 5 * stride, v, 5);
vst4_lane_u8(dst + 6 * stride, v, 6);
vst4_lane_u8(dst + 7 * stride, v, 7);
}
static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
uint8_t* const dst, int stride) {
uint8x8x4_t lo, hi;
INIT_VECTOR4(lo,
vget_low_u8(p1), vget_low_u8(p0),
vget_low_u8(q0), vget_low_u8(q1));
INIT_VECTOR4(hi,
vget_high_u8(p1), vget_high_u8(p0),
vget_high_u8(q0), vget_high_u8(q1));
Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
}
#endif
static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
uint8_t* const dst, int stride) {
vst1q_u8(dst - stride, p0);
vst1q_u8(dst, q0);
}
static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
uint8_t* const dst, int stride) {
Store16x2_NEON(p1, p0, dst - stride, stride);
Store16x2_NEON(q0, q1, dst + stride, stride);
}
static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
const uint8x16_t q0,
uint8_t* const u, uint8_t* const v,
int stride) {
vst1_u8(u - stride, vget_low_u8(p0));
vst1_u8(u, vget_low_u8(q0));
vst1_u8(v - stride, vget_high_u8(p0));
vst1_u8(v, vget_high_u8(q0));
}
static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
const uint8x16_t p0,
const uint8x16_t q0,
const uint8x16_t q1,
uint8_t* const u, uint8_t* const v,
int stride) {
Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
}
#if !defined(WORK_AROUND_GCC)
#define STORE6_LANE …
static WEBP_INLINE void Store6x8x2_NEON(
const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
uint8_t* u, uint8_t* v, int stride) {
uint8x8x3_t u0, u1, v0, v1;
INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
STORE6_LANE(u, u0, u1, 0);
STORE6_LANE(u, u0, u1, 1);
STORE6_LANE(u, u0, u1, 2);
STORE6_LANE(u, u0, u1, 3);
STORE6_LANE(u, u0, u1, 4);
STORE6_LANE(u, u0, u1, 5);
STORE6_LANE(u, u0, u1, 6);
STORE6_LANE(u, u0, u1, 7);
STORE6_LANE(v, v0, v1, 0);
STORE6_LANE(v, v0, v1, 1);
STORE6_LANE(v, v0, v1, 2);
STORE6_LANE(v, v0, v1, 3);
STORE6_LANE(v, v0, v1, 4);
STORE6_LANE(v, v0, v1, 5);
STORE6_LANE(v, v0, v1, 6);
STORE6_LANE(v, v0, v1, 7);
}
#undef STORE6_LANE
static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
const uint8x16_t p0,
const uint8x16_t q0,
const uint8x16_t q1,
uint8_t* const u, uint8_t* const v,
int stride) {
uint8x8x4_t u0, v0;
INIT_VECTOR4(u0,
vget_low_u8(p1), vget_low_u8(p0),
vget_low_u8(q0), vget_low_u8(q1));
INIT_VECTOR4(v0,
vget_high_u8(p1), vget_high_u8(p0),
vget_high_u8(q0), vget_high_u8(q1));
vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
}
#endif
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
return vreinterpretq_s16_u16(vmovl_u8(v));
}
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
const int16x8_t dst01,
const int16x8_t dst23) {
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
const int16x8_t row23,
uint8_t* const dst) {
uint32x2_t dst01 = vdup_n_u32(0);
uint32x2_t dst23 = vdup_n_u32(0);
dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
{
const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
SaturateAndStore4x4_NEON(dst, out01, out23);
}
}
static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
int thresh) {
const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);
const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);
const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);
const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);
const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
return mask;
}
static int8x16_t FlipSign_NEON(const uint8x16_t v) {
const uint8x16_t sign_bit = vdupq_n_u8(0x80);
return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
}
static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
const int8x16_t sign_bit = vdupq_n_s8(0x80);
return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
}
static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
const int8x16_t q0, const int8x16_t q1) {
const int8x16_t q0_p0 = vqsubq_s8(q0, p0);
const int8x16_t p1_q1 = vqsubq_s8(p1, q1);
const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);
const int8x16_t s2 = vqaddq_s8(q0_p0, s1);
const int8x16_t s3 = vqaddq_s8(q0_p0, s2);
return s3;
}
static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
const int8x16_t q0_p0 = vqsubq_s8(q0, p0);
const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);
const int8x16_t s2 = vqaddq_s8(q0_p0, s1);
return s2;
}
static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
const int8x16_t delta,
int8x16_t* const op0,
int8x16_t* const oq0) {
const int8x16_t kCst3 = vdupq_n_s8(0x03);
const int8x16_t kCst4 = vdupq_n_s8(0x04);
const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
*op0 = vqaddq_s8(p0s, delta3);
*oq0 = vqsubq_s8(q0s, delta4);
}
#if defined(WEBP_USE_INTRINSICS)
static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
const int8x16_t delta,
uint8x16_t* const op0, uint8x16_t* const oq0) {
const int8x16_t kCst3 = vdupq_n_s8(0x03);
const int8x16_t kCst4 = vdupq_n_s8(0x04);
const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
*op0 = FlipSignBack_NEON(sp0);
*oq0 = FlipSignBack_NEON(sq0);
}
static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
const uint8x16_t mask,
uint8x16_t* const op0, uint8x16_t* const oq0) {
const int8x16_t p1s = FlipSign_NEON(p1);
const int8x16_t p0s = FlipSign_NEON(p0);
const int8x16_t q0s = FlipSign_NEON(q0);
const int8x16_t q1s = FlipSign_NEON(q1);
const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
}
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
uint8x16_t p1, p0, q0, q1, op0, oq0;
Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
{
const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
}
Store16x2_NEON(op0, oq0, p, stride);
}
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
uint8x16_t p1, p0, q0, q1, oq0, op0;
Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
{
const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
}
Store2x16_NEON(op0, oq0, p, stride);
}
#else
#define LOAD8x4 …
#define STORE8x2 …
#define QRegs …
#define FLIP_SIGN_BIT2 … \
#define FLIP_SIGN_BIT4 … \
#define NEEDS_FILTER …
#define GET_BASE_DELTA …
#define DO_SIMPLE_FILTER …
#define DO_FILTER2 …
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
__asm__ volatile (
"sub %[p], %[p], %[stride], lsl #1 \n"
"vld1.u8 {q1}, [%[p]], %[stride] \n"
"vld1.u8 {q2}, [%[p]], %[stride] \n"
"vld1.u8 {q3}, [%[p]], %[stride] \n"
"vld1.u8 {q12}, [%[p]] \n"
DO_FILTER2(q1, q2, q3, q12, %[thresh])
"sub %[p], %[p], %[stride], lsl #1 \n"
"vst1.u8 {q2}, [%[p]], %[stride] \n"
"vst1.u8 {q3}, [%[p]] \n"
: [p] "+r"(p)
: [stride] "r"(stride), [thresh] "r"(thresh)
: "memory", QRegs
);
}
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
__asm__ volatile (
"sub r4, %[p], #2 \n"
"lsl r6, %[stride], #1 \n"
"add r5, r4, %[stride] \n"
LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
"vswp d3, d24 \n"
"vswp d5, d26 \n"
"vswp q2, q12 \n"
DO_FILTER2(q1, q2, q12, q13, %[thresh])
"sub %[p], %[p], #1 \n"
"vswp d5, d24 \n"
STORE8x2(d4, d5, [%[p]], %[stride])
STORE8x2(d24, d25, [%[p]], %[stride])
: [p] "+r"(p)
: [stride] "r"(stride), [thresh] "r"(thresh)
: "memory", "r4", "r5", "r6", QRegs
);
}
#undef LOAD8x4
#undef STORE8x2
#endif
static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
uint32_t k;
for (k = 3; k != 0; --k) {
p += 4 * stride;
SimpleVFilter16_NEON(p, stride, thresh);
}
}
static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
uint32_t k;
for (k = 3; k != 0; --k) {
p += 4;
SimpleHFilter16_NEON(p, stride, thresh);
}
}
static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
int hev_thresh) {
const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);
const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);
const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
return mask;
}
static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
const uint8x16_t q2, const uint8x16_t q3,
int ithresh, int thresh) {
const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);
const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);
const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);
const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);
const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);
const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);
const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
const uint8x16_t max12 = vmaxq_u8(max1, max2);
const uint8x16_t max123 = vmaxq_u8(max12, max3);
const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
const uint8x16_t mask = vandq_u8(mask1, mask2);
return mask;
}
static void ApplyFilter4_NEON(
const int8x16_t p1, const int8x16_t p0,
const int8x16_t q0, const int8x16_t q1,
const int8x16_t delta0,
uint8x16_t* const op1, uint8x16_t* const op0,
uint8x16_t* const oq0, uint8x16_t* const oq1) {
const int8x16_t kCst3 = vdupq_n_s8(0x03);
const int8x16_t kCst4 = vdupq_n_s8(0x04);
const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
const int8x16_t a1 = vshrq_n_s8(delta1, 3);
const int8x16_t a2 = vshrq_n_s8(delta2, 3);
const int8x16_t a3 = vrshrq_n_s8(a1, 1);
*op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2));
*oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));
*op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3));
*oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3));
}
static void DoFilter4_NEON(
const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
const uint8x16_t mask, const uint8x16_t hev_mask,
uint8x16_t* const op1, uint8x16_t* const op0,
uint8x16_t* const oq0, uint8x16_t* const oq1) {
const int8x16_t p1s = FlipSign_NEON(p1);
int8x16_t p0s = FlipSign_NEON(p0);
int8x16_t q0s = FlipSign_NEON(q0);
const int8x16_t q1s = FlipSign_NEON(q1);
const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
{
const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
const int8x16_t simple_lf_delta =
vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
}
{
const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
const int8x16_t complex_lf_delta =
vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
}
}
static void ApplyFilter6_NEON(
const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
const int8x16_t delta,
uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
const int8x8_t delta_lo = vget_low_s8(delta);
const int8x8_t delta_hi = vget_high_s8(delta);
const int8x8_t kCst9 = vdup_n_s8(9);
const int16x8_t kCstm1 = vdupq_n_s16(-1);
const int8x8_t kCst18 = vdup_n_s8(18);
const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);
const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);
const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);
const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);
const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);
const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
*op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));
*oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));
*oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));
*op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));
*oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));
*op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));
}
static void DoFilter6_NEON(
const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
const uint8x16_t mask, const uint8x16_t hev_mask,
uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
const int8x16_t p2s = FlipSign_NEON(p2);
const int8x16_t p1s = FlipSign_NEON(p1);
int8x16_t p0s = FlipSign_NEON(p0);
int8x16_t q0s = FlipSign_NEON(q0);
const int8x16_t q1s = FlipSign_NEON(q1);
const int8x16_t q2s = FlipSign_NEON(q2);
const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
{
const int8x16_t simple_lf_delta =
vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
}
{
const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
const int8x16_t complex_lf_delta =
vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
op2, op1, op0, oq0, oq1, oq2);
}
}
static void VFilter16_NEON(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
ithresh, thresh);
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
&op2, &op1, &op0, &oq0, &oq1, &oq2);
Store16x2_NEON(op2, op1, p - 2 * stride, stride);
Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
}
}
static void HFilter16_NEON(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
ithresh, thresh);
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
&op2, &op1, &op0, &oq0, &oq1, &oq2);
Store2x16_NEON(op2, op1, p - 2, stride);
Store2x16_NEON(op0, oq0, p + 0, stride);
Store2x16_NEON(oq1, oq2, p + 2, stride);
}
}
static void VFilter16i_NEON(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
uint32_t k;
uint8x16_t p3, p2, p1, p0;
Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
for (k = 3; k != 0; --k) {
uint8x16_t q0, q1, q2, q3;
p += 4 * stride;
Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
{
const uint8x16_t mask =
NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
Store16x4_NEON(p1, p0, p3, p2, p, stride);
p1 = q2;
p0 = q3;
}
}
}
#if !defined(WORK_AROUND_GCC)
static void HFilter16i_NEON(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
uint32_t k;
uint8x16_t p3, p2, p1, p0;
Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
for (k = 3; k != 0; --k) {
uint8x16_t q0, q1, q2, q3;
p += 4;
Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
{
const uint8x16_t mask =
NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
Store4x16_NEON(p1, p0, p3, p2, p, stride);
p1 = q2;
p0 = q3;
}
}
}
#endif
static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
ithresh, thresh);
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
&op2, &op1, &op0, &oq0, &oq1, &oq2);
Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
}
}
static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
u += 4 * stride;
v += 4 * stride;
Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
ithresh, thresh);
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op1, op0, oq0, oq1;
DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
}
}
#if !defined(WORK_AROUND_GCC)
static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
ithresh, thresh);
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
&op2, &op1, &op0, &oq0, &oq1, &oq2);
Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
}
}
static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
u += 4;
v += 4;
Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
ithresh, thresh);
const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op1, op0, oq0, oq1;
DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
}
}
#endif
static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
static const int16_t kC2 =
WEBP_TRANSFORM_AC3_C2 / 2;
#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
const int16x8_t in1,
int16x8x2_t* const out) {
const int16x8x2_t tmp0 = vzipq_s16(in0, in1);
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
const int16x8_t B1 =
vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
vget_low_s16(rows->val[1]));
const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
vget_low_s16(rows->val[1]));
const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
const int16x8_t D0 = vcombine_s16(a, b);
const int16x8_t D1 = vcombine_s16(d, c);
const int16x8_t E0 = vqaddq_s16(D0, D1);
const int16x8_t E_tmp = vqsubq_s16(D0, D1);
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
Transpose8x2_NEON(E0, E1, rows);
}
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
int16x8x2_t rows;
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
TransformPass_NEON(&rows);
TransformPass_NEON(&rows);
Add4x4_NEON(rows.val[0], rows.val[1], dst);
}
#else
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
const int kBPS = BPS;
const int16_t constants[4] = { kC1, kC2, 0, 0 };
__asm__ volatile (
"vld1.16 {q1, q2}, [%[in]] \n"
"vld1.16 {d0}, [%[constants]] \n"
"vswp d3, d4 \n"
"vqdmulh.s16 q8, q2, d0[0] \n"
"vqdmulh.s16 q9, q2, d0[1] \n"
"vqadd.s16 d22, d2, d3 \n"
"vqsub.s16 d23, d2, d3 \n"
"vshr.s16 q8, q8, #1 \n"
"vqadd.s16 q8, q2, q8 \n"
"vqsub.s16 d20, d18, d17 \n"
"vqadd.s16 d21, d19, d16 \n"
"vqadd.s16 d2, d22, d21 \n"
"vqadd.s16 d3, d23, d20 \n"
"vqsub.s16 d4, d23, d20 \n"
"vqsub.s16 d5, d22, d21 \n"
"vzip.16 q1, q2 \n"
"vzip.16 q1, q2 \n"
"vswp d3, d4 \n"
"vqdmulh.s16 q8, q2, d0[0] \n"
"vqdmulh.s16 q9, q2, d0[1] \n"
"vqadd.s16 d22, d2, d3 \n"
"vqsub.s16 d23, d2, d3 \n"
"vshr.s16 q8, q8, #1 \n"
"vqadd.s16 q8, q2, q8 \n"
"vqsub.s16 d20, d18, d17 \n"
"vqadd.s16 d21, d19, d16 \n"
"vqadd.s16 d2, d22, d21 \n"
"vqadd.s16 d3, d23, d20 \n"
"vqsub.s16 d4, d23, d20 \n"
"vqsub.s16 d5, d22, d21 \n"
"vld1.32 d6[0], [%[dst]], %[kBPS] \n"
"vld1.32 d6[1], [%[dst]], %[kBPS] \n"
"vld1.32 d7[0], [%[dst]], %[kBPS] \n"
"vld1.32 d7[1], [%[dst]], %[kBPS] \n"
"sub %[dst], %[dst], %[kBPS], lsl #2 \n"
"vrshr.s16 d2, d2, #3 \n"
"vrshr.s16 d3, d3, #3 \n"
"vrshr.s16 d4, d4, #3 \n"
"vrshr.s16 d5, d5, #3 \n"
"vzip.16 q1, q2 \n"
"vzip.16 q1, q2 \n"
"vmovl.u8 q8, d6 \n"
"vmovl.u8 q9, d7 \n"
"vqadd.s16 q1, q1, q8 \n"
"vqadd.s16 q2, q2, q9 \n"
"vqmovun.s16 d0, q1 \n"
"vqmovun.s16 d1, q2 \n"
"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
"vst1.32 d1[1], [%[dst]] \n"
: [in] "+r"(in), [dst] "+r"(dst)
: [kBPS] "r"(kBPS), [constants] "r"(constants)
: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"
);
}
#endif
static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne_NEON(in, dst);
if (do_two) {
TransformOne_NEON(in + 16, dst + 4);
}
}
static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
const int16x8_t DC = vdupq_n_s16(in[0]);
Add4x4_NEON(DC, DC, dst);
}
#define STORE_WHT …
static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
int32x4x4_t tmp;
{
const int16x4_t in00_03 = vld1_s16(in + 0);
const int16x4_t in04_07 = vld1_s16(in + 4);
const int16x4_t in08_11 = vld1_s16(in + 8);
const int16x4_t in12_15 = vld1_s16(in + 12);
const int32x4_t a0 = vaddl_s16(in00_03, in12_15);
const int32x4_t a1 = vaddl_s16(in04_07, in08_11);
const int32x4_t a2 = vsubl_s16(in04_07, in08_11);
const int32x4_t a3 = vsubl_s16(in00_03, in12_15);
tmp.val[0] = vaddq_s32(a0, a1);
tmp.val[1] = vaddq_s32(a3, a2);
tmp.val[2] = vsubq_s32(a0, a1);
tmp.val[3] = vsubq_s32(a3, a2);
tmp = Transpose4x4_NEON(tmp);
}
{
const int32x4_t kCst3 = vdupq_n_s32(3);
const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);
const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
tmp.val[0] = vaddq_s32(a0, a1);
tmp.val[1] = vaddq_s32(a3, a2);
tmp.val[2] = vsubq_s32(a0, a1);
tmp.val[3] = vsubq_s32(a3, a2);
tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
STORE_WHT(out, 0, tmp);
STORE_WHT(out, 1, tmp);
STORE_WHT(out, 2, tmp);
STORE_WHT(out, 3, tmp);
}
}
#undef STORE_WHT
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
const int16x4_t A = vld1_dup_s16(in);
const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 |
(uint64_t)( c1 & 0xffff) << 16 |
(uint64_t)(-c1 & 0xffff) << 32 |
(uint64_t)(-d1 & 0xffff) << 48;
const int16x4_t CD = vcreate_s16(cd);
const int16x4_t B = vqadd_s16(A, CD);
const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
Add4x4_NEON(m0_m1, m2_m3, dst);
}
static void DC4_NEON(uint8_t* dst) {
const uint8x8_t A = vld1_u8(dst - BPS);
const uint16x4_t p0 = vpaddl_u8(A);
const uint16x4_t p1 = vpadd_u16(p0, p0);
const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
const uint16x8_t s0 = vaddl_u8(L0, L1);
const uint16x8_t s1 = vaddl_u8(L2, L3);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
const uint8x8_t dc = vdup_lane_u8(dc0, 0);
int i;
for (i = 0; i < 4; ++i) {
vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
}
}
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);
const uint8x8_t T = vld1_u8(dst - BPS);
const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));
int y;
for (y = 0; y < size; y += 4) {
const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
const int16x8_t r0 = vaddq_s16(L0, d);
const int16x8_t r1 = vaddq_s16(L1, d);
const int16x8_t r2 = vaddq_s16(L2, d);
const int16x8_t r3 = vaddq_s16(L3, d);
const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
if (size == 4) {
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
} else {
vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
}
dst += 4 * BPS;
}
}
static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }
static void VE4_NEON(uint8_t* dst) {
const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));
const uint64x1_t A1 = vshr_n_u64(A0, 8);
const uint64x1_t A2 = vshr_n_u64(A0, 16);
const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
int i;
for (i = 0; i < 4; ++i) {
vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
}
}
static void RD4_NEON(uint8_t* dst) {
const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
const uint32_t I = dst[-1 + 0 * BPS];
const uint32_t J = dst[-1 + 1 * BPS];
const uint32_t K = dst[-1 + 2 * BPS];
const uint32_t L = dst[-1 + 3 * BPS];
const uint64x1_t LKJI____ =
vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
const uint8_t D = vget_lane_u8(XABCD_u8, 4);
const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}
static void LD4_NEON(uint8_t* dst) {
const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}
static void VE8uv_NEON(uint8_t* dst) {
const uint8x8_t top = vld1_u8(dst - BPS);
int j;
for (j = 0; j < 8; ++j) {
vst1_u8(dst + j * BPS, top);
}
}
static void HE8uv_NEON(uint8_t* dst) {
int j;
for (j = 0; j < 8; ++j) {
const uint8x8_t left = vld1_dup_u8(dst - 1);
vst1_u8(dst, left);
dst += BPS;
}
}
static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
uint16x8_t sum_top;
uint16x8_t sum_left;
uint8x8_t dc0;
if (do_top) {
const uint8x8_t A = vld1_u8(dst - BPS);
#if WEBP_AARCH64
const uint16_t p2 = vaddlv_u8(A);
sum_top = vdupq_n_u16(p2);
#else
const uint16x4_t p0 = vpaddl_u8(A);
const uint16x4_t p1 = vpadd_u16(p0, p0);
const uint16x4_t p2 = vpadd_u16(p1, p1);
sum_top = vcombine_u16(p2, p2);
#endif
}
if (do_left) {
const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
const uint16x8_t s0 = vaddl_u8(L0, L1);
const uint16x8_t s1 = vaddl_u8(L2, L3);
const uint16x8_t s2 = vaddl_u8(L4, L5);
const uint16x8_t s3 = vaddl_u8(L6, L7);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t s23 = vaddq_u16(s2, s3);
sum_left = vaddq_u16(s01, s23);
}
if (do_top && do_left) {
const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
dc0 = vrshrn_n_u16(sum, 4);
} else if (do_top) {
dc0 = vrshrn_n_u16(sum_top, 3);
} else if (do_left) {
dc0 = vrshrn_n_u16(sum_left, 3);
} else {
dc0 = vdup_n_u8(0x80);
}
{
const uint8x8_t dc = vdup_lane_u8(dc0, 0);
int i;
for (i = 0; i < 8; ++i) {
vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
}
}
}
static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }
static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }
static void VE16_NEON(uint8_t* dst) {
const uint8x16_t top = vld1q_u8(dst - BPS);
int j;
for (j = 0; j < 16; ++j) {
vst1q_u8(dst + j * BPS, top);
}
}
static void HE16_NEON(uint8_t* dst) {
int j;
for (j = 0; j < 16; ++j) {
const uint8x16_t left = vld1q_dup_u8(dst - 1);
vst1q_u8(dst, left);
dst += BPS;
}
}
static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
uint16x8_t sum_top;
uint16x8_t sum_left;
uint8x8_t dc0;
if (do_top) {
const uint8x16_t A = vld1q_u8(dst - BPS);
#if WEBP_AARCH64
const uint16_t p3 = vaddlvq_u8(A);
sum_top = vdupq_n_u16(p3);
#else
const uint16x8_t p0 = vpaddlq_u8(A);
const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
const uint16x4_t p2 = vpadd_u16(p1, p1);
const uint16x4_t p3 = vpadd_u16(p2, p2);
sum_top = vcombine_u16(p3, p3);
#endif
}
if (do_left) {
int i;
sum_left = vdupq_n_u16(0);
for (i = 0; i < 16; i += 8) {
const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
const uint16x8_t s0 = vaddl_u8(L0, L1);
const uint16x8_t s1 = vaddl_u8(L2, L3);
const uint16x8_t s2 = vaddl_u8(L4, L5);
const uint16x8_t s3 = vaddl_u8(L6, L7);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t s23 = vaddq_u16(s2, s3);
const uint16x8_t sum = vaddq_u16(s01, s23);
sum_left = vaddq_u16(sum_left, sum);
}
}
if (do_top && do_left) {
const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
dc0 = vrshrn_n_u16(sum, 5);
} else if (do_top) {
dc0 = vrshrn_n_u16(sum_top, 4);
} else if (do_left) {
dc0 = vrshrn_n_u16(sum_left, 4);
} else {
dc0 = vdup_n_u8(0x80);
}
{
const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
int i;
for (i = 0; i < 16; ++i) {
vst1q_u8(dst + i * BPS, dc);
}
}
}
static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }
static void TM16_NEON(uint8_t* dst) {
const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);
const uint8x16_t T = vld1q_u8(dst - BPS);
const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
int y;
for (y = 0; y < 16; y += 4) {
const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
const int16x8_t r0_lo = vaddq_s16(L0, d_lo);
const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
vst1q_u8(dst + 0 * BPS, row0);
vst1q_u8(dst + 1 * BPS, row1);
vst1q_u8(dst + 2 * BPS, row2);
vst1q_u8(dst + 3 * BPS, row3);
dst += 4 * BPS;
}
}
extern void VP8DspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
VP8Transform = TransformTwo_NEON;
VP8TransformAC3 = TransformAC3_NEON;
VP8TransformDC = TransformDC_NEON;
VP8TransformWHT = TransformWHT_NEON;
VP8VFilter16 = VFilter16_NEON;
VP8VFilter16i = VFilter16i_NEON;
VP8HFilter16 = HFilter16_NEON;
#if !defined(WORK_AROUND_GCC)
VP8HFilter16i = HFilter16i_NEON;
#endif
VP8VFilter8 = VFilter8_NEON;
VP8VFilter8i = VFilter8i_NEON;
#if !defined(WORK_AROUND_GCC)
VP8HFilter8 = HFilter8_NEON;
VP8HFilter8i = HFilter8i_NEON;
#endif
VP8SimpleVFilter16 = SimpleVFilter16_NEON;
VP8SimpleHFilter16 = SimpleHFilter16_NEON;
VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
VP8PredLuma4[0] = DC4_NEON;
VP8PredLuma4[1] = TM4_NEON;
VP8PredLuma4[2] = VE4_NEON;
VP8PredLuma4[4] = RD4_NEON;
VP8PredLuma4[6] = LD4_NEON;
VP8PredLuma16[0] = DC16TopLeft_NEON;
VP8PredLuma16[1] = TM16_NEON;
VP8PredLuma16[2] = VE16_NEON;
VP8PredLuma16[3] = HE16_NEON;
VP8PredLuma16[4] = DC16NoTop_NEON;
VP8PredLuma16[5] = DC16NoLeft_NEON;
VP8PredLuma16[6] = DC16NoTopLeft_NEON;
VP8PredChroma8[0] = DC8uv_NEON;
VP8PredChroma8[1] = TM8uv_NEON;
VP8PredChroma8[2] = VE8uv_NEON;
VP8PredChroma8[3] = HE8uv_NEON;
VP8PredChroma8[4] = DC8uvNoTop_NEON;
VP8PredChroma8[5] = DC8uvNoLeft_NEON;
VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
}
#else
WEBP_DSP_INIT_STUB(VP8DspInitNEON)
#endif