#ifndef SkBitmapProcState_opts_DEFINED
#define SkBitmapProcState_opts_DEFINED
#include "src/base/SkMSAN.h"
#include "src/base/SkVx.h"
#include "src/core/SkBitmapProcState.h"
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
#include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
#include <arm_neon.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
#include <lasxintrin.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
#include <lsxintrin.h>
#endif
namespace SK_OPTS_NS {
// Splits one packed entry into two coordinates (*v0, *v1) and an interpolation weight (*w).
// The body is not visible in this view. Presumably the layout is the one the SSSE3 path in
// this file unpacks by hand: v0 = packed >> 18, v1 = packed & 0x3fff, w = (packed >> 14) & 0xf.
// TODO(review): confirm against the definition.
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) { … }
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
// SSSE3 bilinear sampler for a 32-bit source where every output pixel shares the same two rows.
//   s      - supplies fPixmap (pixel base address, rowBytes) and fAlphaScale (asserted <= 256).
//   xy     - xy[0] packs y0, y1 and the y weight; each following entry packs x0, x1 and an
//            x weight for one output pixel.
//   count  - number of output pixels (asserted > 0).
//   colors - receives `count` filtered pixels.
inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
const uint32_t* xy, int count, uint32_t* colors) {
SkASSERT(count > 0 && colors != nullptr);
SkASSERT(s.fBilerp);
SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
SkASSERT(s.fAlphaScale <= 256);
// Horizontally blends two pixel pairs at once: (A0,A1) and (B0,B1).
// interlaced_x_weights is expected to hold {left,right} weight byte pairs.  The result has
// eight 16-bit lanes: A0*wl + A1*wr per channel in the low four, B0*wl + B1*wr in the high four.
auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
uint32_t B0, uint32_t B1,
__m128i interlaced_x_weights) {
// Interleave the bytes of each left/right pair so matching channels sit next to each other,
// which is the operand layout _mm_maddubs_epi16() multiplies and pair-sums.
__m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));
// Pair A goes in the low 64 bits, pair B in the high 64 bits.
return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
interlaced_x_weights);
};
// Full 2x2 blend for two output pixels (A and B).  For each pixel, inputs 0/1 are the
// top-row left/right samples and inputs 2/3 are the bottom-row left/right samples.
// Captures `s` by reference to read fAlphaScale.
auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
uint32_t A2, uint32_t A3,
uint32_t B0, uint32_t B1,
uint32_t B2, uint32_t B3,
__m128i interlaced_x_weights,
int wy) {
// Blend each row horizontally first; lanes are now scaled by the x weights.
__m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
// Vertical blend: top*(16-wy) + bot*wy, computed as 16*top + (bot-top)*wy.
__m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
_mm_mullo_epi16(_mm_sub_epi16(bot, top),
_mm_set1_epi16(wy)));
// Remove the combined weight scale (>> 8, i.e. divide by 256).
px = _mm_srli_epi16(px, 8);
// Apply the alpha scale only when it is below 256; at 256 the multiply is skipped.
if (s.fAlphaScale < 256) {
px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
}
return px;
};
// The first packed entry carries the y coordinates and weight shared by all pixels.
int y0, y1, wy;
decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
// Row start addresses are computed in bytes (rowBytes stride), then viewed as 32-bit pixels.
auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
// Main loop: four output pixels per iteration.
while (count >= 4) {
int x0[4],
x1[4];
__m128i wx;
// Unpack four packed entries at once: bits above 18 -> x0, low 14 bits -> x1,
// and the four bits starting at bit 14 -> x weight.
__m128i packed = _mm_loadu_si128((const __m128i*)xy);
_mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
_mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));
// Replicate each pixel's weight byte across its four channel bytes as the right-hand
// weight; the left-hand weight is sixteen minus that.
__m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
// Interleave left/right weights to match the pixel interleaving done in interpolate_in_x:
// the low half covers pixels A and B, the high half covers pixels C and D.
__m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);
// Indices of the four pixels within x0[] / x1[].
enum { A,B,C,D };
__m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
row1[x0[A]], row1[x1[A]],
row0[x0[B]], row0[x1[B]],
row1[x0[B]], row1[x1[B]],
interlaced_x_weights_AB, wy);
__m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
row1[x0[C]], row1[x1[C]],
row0[x0[D]], row0[x1[D]],
row1[x0[D]], row1[x1[D]],
interlaced_x_weights_CD, wy);
// Narrow the sixteen 16-bit channels to bytes (unsigned saturation) and store four pixels.
_mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
xy += 4;
colors += 4;
count -= 4;
}
// Tail: one pixel per iteration, reusing the two-pixel helper with the B slots zeroed.
while (count --> 0) {
int x0, x1, wx;
decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
__m128i wr = _mm_set1_epi8(wx),
wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
__m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);
__m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
row1[x0], row1[x1],
0, 0,
0, 0,
interlaced_x_weights, wy);
// Only the low 32 bits (pixel A) are kept.
*colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
}
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
// Variant compiled for x86 builds that have SSE2 but not SSSE3 (per the enclosing #elif).
// It has the same parameter list as the SSSE3 version; the body is not visible in this view.
inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
const uint32_t* xy, int count, uint32_t* colors) { … }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
// LoongArch LASX bilinear sampler for a 32-bit source where every output pixel shares the
// same two rows.  Produces one output pixel per loop iteration.
//   s      - supplies fPixmap (pixel base address, rowBytes) and fAlphaScale (asserted <= 256).
//   xy     - xy[0] packs y0, y1 and the y weight; each following entry packs x0, x1 and an
//            x weight for one output pixel.
//   count  - number of output pixels (asserted > 0).
//   colors - receives `count` filtered pixels.
inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
const uint32_t* xy, int count, uint32_t* colors) {
SkASSERT(count > 0 && colors != nullptr);
SkASSERT(s.fBilerp);
SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
SkASSERT(s.fAlphaScale <= 256);
// The first packed entry carries the y coordinates and weight shared by all pixels.
int y0, y1, wy;
decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
// Row start addresses are computed in bytes (rowBytes stride), then viewed as 32-bit pixels.
auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
// Vertical weights as 16-bit lanes: wy in the low four lanes (lines up with the bottom-row
// pixel below) and 16-wy in the next four (lines up with the top-row pixel).
__m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy));
while (count --> 0) {
int x0, x1, wx;
decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
// Load the 2x2 neighborhood, one pixel in the low word of each vector:
//    | tl  tr |
//    | bl  br |
const __m256i zeros = __lasx_xvldi(0);
const __m256i tl = __lasx_xvinsgr2vr_w(zeros, row0[x0], 0),
tr = __lasx_xvinsgr2vr_w(zeros, row0[x1], 0),
bl = __lasx_xvinsgr2vr_w(zeros, row1[x0], 0),
br = __lasx_xvinsgr2vr_w(zeros, row1[x1], 0);
// Group the left column as L = [bl, tl] and the right column as R = [br, tr], with each
// channel byte widened to 16 bits by interleaving with zero bytes.
__m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)),
R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br));
// Horizontal blend: L*(16-wx) + R*wx, computed as 16*L + (R-L)*wx.
__m256i inner = __lasx_xvadd_h(__lasx_xvslli_h(L, 4),
__lasx_xvmul_h(__lasx_xvsub_h(R,L),
__lasx_xvreplgr2vr_h(wx)));
// Apply the vertical weights, then add the two 64-bit halves (bottom + top contributions).
__m256i sum_in_x = __lasx_xvmul_h(inner, allY);
__m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8));
// Remove the combined weight scale (>> 8, i.e. divide by 256).
sum = __lasx_xvsrli_h(sum, 8);
// Apply the alpha scale only when it is below 256; at 256 the multiply is skipped.
if (s.fAlphaScale < 256) {
sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale));
sum = __lasx_xvsrli_h(sum, 8);
}
// Saturate, keep the low byte of each 16-bit lane, and write the low 32-bit word as the pixel.
*colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0),
__lasx_xvsat_hu(sum, 8)), 0);
}
}
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
// LoongArch LSX bilinear sampler for a 32-bit source where every output pixel shares the
// same two rows.  Produces one output pixel per loop iteration.
//   s      - supplies fPixmap (pixel base address, rowBytes) and fAlphaScale (asserted <= 256).
//   xy     - xy[0] packs y0, y1 and the y weight; each following entry packs x0, x1 and an
//            x weight for one output pixel.
//   count  - number of output pixels (asserted > 0).
//   colors - receives `count` filtered pixels.
inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
const uint32_t* xy, int count, uint32_t* colors) {
SkASSERT(count > 0 && colors != nullptr);
SkASSERT(s.fBilerp);
SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
SkASSERT(s.fAlphaScale <= 256);
// The first packed entry carries the y coordinates and weight shared by all pixels.
int y0, y1, wy;
decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
// Row start addresses are computed in bytes (rowBytes stride), then viewed as 32-bit pixels.
auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
// Vertical weights as 16-bit lanes: wy in the low four lanes (lines up with the bottom-row
// pixel below) and 16-wy in the high four (lines up with the top-row pixel).
__m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy));
while (count --> 0) {
int x0, x1, wx;
decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
// Load the 2x2 neighborhood, one pixel in the low word of each vector:
//    | tl  tr |
//    | bl  br |
const __m128i zeros = __lsx_vldi(0);
const __m128i tl = __lsx_vinsgr2vr_w(zeros, row0[x0], 0),
tr = __lsx_vinsgr2vr_w(zeros, row0[x1], 0),
bl = __lsx_vinsgr2vr_w(zeros, row1[x0], 0),
br = __lsx_vinsgr2vr_w(zeros, row1[x1], 0);
// Group the left column as L = [bl, tl] and the right column as R = [br, tr], with each
// channel byte widened to 16 bits by interleaving with zero bytes.
__m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)),
R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br));
// Horizontal blend: L*(16-wx) + R*wx, computed as 16*L + (R-L)*wx.
__m128i inner = __lsx_vadd_h(__lsx_vslli_h(L, 4),
__lsx_vmul_h(__lsx_vsub_h(R,L),
__lsx_vreplgr2vr_h(wx)));
// Apply the vertical weights, then add the two 64-bit halves (bottom + top contributions).
__m128i sum_in_x = __lsx_vmul_h(inner, allY);
__m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8));
// Remove the combined weight scale (>> 8, i.e. divide by 256).
sum = __lsx_vsrli_h(sum, 8);
// Apply the alpha scale only when it is below 256; at 256 the multiply is skipped.
if (s.fAlphaScale < 256) {
sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale));
sum = __lsx_vsrli_h(sum, 8);
}
// Saturate, keep the low byte of each 16-bit lane, and write the low 32-bit word as the pixel.
*colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0),
__lsx_vsat_hu(sum, 8)), 0);
}
}
#else
#if defined(SK_ARM_HAS_NEON)
static void filter_and_scale_by_alpha(unsigned x, unsigned y,
SkPMColor a00, SkPMColor a01,
SkPMColor a10, SkPMColor a11,
SkPMColor *dst,
uint16_t scale) {
uint8x8_t vy, vconst16_8, v16_y, vres;
uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
uint32x2_t va0, va1;
uint16x8_t tmp1, tmp2;
vy = vdup_n_u8(y);
vconst16_8 = vmov_n_u8(16);
v16_y = vsub_u8(vconst16_8, vy);
va0 = vdup_n_u32(a00);
va1 = vdup_n_u32(a10);
va0 = vset_lane_u32(a01, va0, 1);
va1 = vset_lane_u32(a11, va1, 1);
tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y);
tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);
vx = vdup_n_u16(x);
vconst16_16 = vmov_n_u16(16);
v16_x = vsub_u16(vconst16_16, vx);
tmp = vmul_u16(vget_high_u16(tmp1), vx);
tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);
tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x);
tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x);
if (scale < 256) {
vscale = vdup_n_u16(scale);
tmp = vshr_n_u16(tmp, 8);
tmp = vmul_u16(tmp, vscale);
}
vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8);
vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
}
#else
// Portable bilinear blend of the 2x2 neighborhood
//     | a00 a01 |
//     | a10 a11 |
// with 4-bit weights x (toward the right column) and y (toward the bottom row).
// Two channels are processed per 32-bit multiply by masking alternate bytes.
// When alphaScale < 256 the result is additionally multiplied by alphaScale/256.
// Writes exactly one pixel to *dstColor.
static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                      SkPMColor a00, SkPMColor a01,
                                      SkPMColor a10, SkPMColor a11,
                                      SkPMColor* dstColor,
                                      unsigned alphaScale) {
    SkASSERT((unsigned)x <= 0xF);
    SkASSERT((unsigned)y <= 0xF);
    SkASSERT(alphaScale <= 256);

    // Selects bytes 0 and 2, leaving 8 bits of headroom above each for the weighted sums.
    const uint32_t kAlternateBytes = 0xFF00FF;

    // Corner weights; together they total 256.
    const uint32_t weight11 = x * y;
    const uint32_t weight00 = 256 - 16*y - 16*x + weight11;
    const uint32_t weight01 = 16*x - weight11;
    const uint32_t weight10 = 16*y - weight11;

    uint32_t evenBytes = 0;   // running sums for bytes 0 and 2
    uint32_t oddBytes  = 0;   // running sums for bytes 1 and 3
    auto accumulate = [&](SkPMColor pixel, uint32_t weight) {
        evenBytes += (pixel & kAlternateBytes) * weight;
        oddBytes  += ((pixel >> 8) & kAlternateBytes) * weight;
    };
    accumulate(a00, weight00);
    accumulate(a01, weight01);
    accumulate(a10, weight10);
    accumulate(a11, weight11);

    if (alphaScale < 256) {
        evenBytes = ((evenBytes >> 8) & kAlternateBytes) * alphaScale;
        oddBytes  = ((oddBytes  >> 8) & kAlternateBytes) * alphaScale;
    }

    // The even sums need a >> 8 to land on bytes 0/2; the odd sums already sit on bytes 1/3.
    *dstColor = ((evenBytes >> 8) & kAlternateBytes) | (oddBytes & ~kAlternateBytes);
}
#endif
// Generic bilinear sampler for a 4-byte-per-pixel source where every output pixel shares the
// same two rows.  The per-pixel math is delegated to filter_and_scale_by_alpha().
//   s      - supplies fPixmap (pixel base address, rowBytes) and fAlphaScale (asserted <= 256).
//   xy     - xy[0] packs y0, y1 and the y weight; each following entry packs x0, x1 and an
//            x weight for one output pixel.
//   count  - number of output pixels (asserted > 0).
//   colors - receives `count` filtered pixels.
inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                             const uint32_t* xy, int count, SkPMColor* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fBilerp);
    SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
    SkASSERT(s.fAlphaScale <= 256);

    // Leading entry: the two row indices and the vertical weight shared by every pixel.
    int topIndex, bottomIndex, weightY;
    decode_packed_coordinates_and_weight(xy[0], &topIndex, &bottomIndex, &weightY);
    const uint32_t* packedX = xy + 1;

    // Row addresses are formed in bytes, then viewed as 32-bit pixels.
    const char* base   = (const char*)s.fPixmap.addr();
    const auto  stride = s.fPixmap.rowBytes();
    const uint32_t* topRow    = (const uint32_t*)(base + topIndex    * stride);
    const uint32_t* bottomRow = (const uint32_t*)(base + bottomIndex * stride);

    for (int i = 0; i < count; ++i) {
        int left, right, weightX;
        decode_packed_coordinates_and_weight(packedX[i], &left, &right, &weightX);

        filter_and_scale_by_alpha(weightX, weightY,
                                  topRow[left],    topRow[right],
                                  bottomRow[left], bottomRow[right],
                                  &colors[i],
                                  s.fAlphaScale);
    }
}
#endif
#if defined(SK_ARM_HAS_NEON)
// NEON-build bilinear sampler for a 4-byte-per-pixel source where both x and y vary per pixel.
// The per-pixel math is delegated to filter_and_scale_by_alpha().
//   s      - supplies fPixmap (pixel base address, rowBytes) and fAlphaScale (asserted <= 256).
//   xy     - two packed entries per output pixel: first (y0, y1, y weight), then
//            (x0, x1, x weight).
//   count  - number of output pixels (asserted > 0).
//   colors - receives `count` filtered pixels.
inline
void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
                               const uint32_t* xy, int count, SkPMColor* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fBilerp);
    SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
    SkASSERT(s.fAlphaScale <= 256);

    const char*  pixels = (const char*)s.fPixmap.addr();
    const size_t stride = s.fPixmap.rowBytes();

    for (int i = 0; i < count; ++i) {
        // Each output pixel consumes a y entry followed by an x entry.
        int topIndex, bottomIndex, weightY;
        decode_packed_coordinates_and_weight(xy[2*i], &topIndex, &bottomIndex, &weightY);

        int left, right, weightX;
        decode_packed_coordinates_and_weight(xy[2*i + 1], &left, &right, &weightX);

        // Row addresses are formed in bytes, then viewed as 32-bit pixels.
        const uint32_t* topRow    = (const uint32_t*)(pixels + topIndex*stride);
        const uint32_t* bottomRow = (const uint32_t*)(pixels + bottomIndex*stride);

        filter_and_scale_by_alpha(weightX, weightY,
                                  topRow[left],    topRow[right],
                                  bottomRow[left], bottomRow[right],
                                  &colors[i],
                                  s.fAlphaScale);
    }
}
#else
// In builds without NEON, S32_alpha_D32_filter_DXDY is a constexpr function pointer with the
// same parameter list as the NEON routine, rather than a function defined here.
// Its initializer is not visible in this view.
constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
const uint32_t*, int, SkPMColor*) = …;
#endif
}
namespace sktests {
// Non-static counterpart, in namespace sktests, of the SK_OPTS_NS helper with the same name and
// signature.  The body is not visible in this view; presumably it exposes the same decoding
// to test code.  TODO(review): confirm.
template <typename U32, typename Out>
void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) { … }
}
#endif