#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
#ifndef MESHOPTIMIZER_NO_SIMD
#if defined(__AVX__) || defined(__SSSE3__)
#define SIMD_SSE
#endif
#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
#undef SIMD_SSE
#define SIMD_AVX
#endif
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#define SIMD_FALLBACK
#endif
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
#define SIMD_SSE
#define SIMD_FALLBACK
#define SIMD_TARGET …
#endif
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define SIMD_NEON
#endif
#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define SIMD_NEON
#endif
#if defined(__wasm_simd128__)
#define SIMD_WASM
#undef SIMD_NEON
#undef SIMD_SSE
#undef SIMD_AVX
#endif
#ifndef SIMD_TARGET
#define SIMD_TARGET
#endif
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
#define SIMD_LATENCYOPT
#endif
#endif
#ifdef SIMD_SSE
#include <tmmintrin.h>
#endif
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <cpuid.h>
#endif
#endif
#ifdef SIMD_AVX
#include <immintrin.h>
#endif
#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif
#ifdef SIMD_WASM
#include <wasm_simd128.h>
#endif
#ifndef TRACE
#define TRACE …
#endif
#if TRACE
#include <stdio.h>
#endif
#ifdef SIMD_WASM
#define wasmx_splat_v32x4 …
#define wasmx_unpacklo_v8x16 …
#define wasmx_unpackhi_v8x16 …
#define wasmx_unpacklo_v16x8 …
#define wasmx_unpackhi_v16x8 …
#define wasmx_unpacklo_v64x2 …
#define wasmx_unpackhi_v64x2 …
#endif
namespace meshopt
{
const unsigned char kVertexHeader = …;
static int gEncodeVertexVersion = …;
const size_t kVertexBlockSizeBytes = …;
const size_t kVertexBlockMaxSize = …;
const size_t kByteGroupSize = …;
const size_t kByteGroupDecodeLimit = …;
const size_t kTailMaxSize = …;
static size_t getVertexBlockSize(size_t vertex_size)
{ … }
inline unsigned char zigzag8(unsigned char v)
{ … }
inline unsigned char unzigzag8(unsigned char v)
{ … }
#if TRACE
// Counters gathered only in TRACE builds (this struct is compiled under #if TRACE).
// NOTE(review): the code that updates these fields is not visible from here; the
// per-field notes below are inferred from names and array sizes only - confirm
// against the encoder before relying on them.
struct Stats
{
// presumably the total number of encoded bytes attributed to this entry
size_t size;
// presumably the number of bytes spent on headers
size_t header;
// four counters; presumably one per byte-group bit mode - TODO confirm
size_t bitg[4];
// eight counters; meaning cannot be determined from this view
size_t bitc[8];
};
static Stats* bytestats = NULL;
static Stats vertexstats[256];
#endif
static bool encodeBytesGroupZero(const unsigned char* buffer)
{ … }
static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
{ … }
static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{ … }
static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
{ … }
static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{ … }
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
{ … }
static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{ … }
static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{ … }
#endif
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];
#ifdef __wasm__
__attribute__((cold))
#endif
static bool
decodeBytesGroupBuildTables()
{ … }
static bool gDecodeBytesGroupInitialized = …;
#endif
#ifdef SIMD_SSE
SIMD_TARGET
static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{ … }
SIMD_TARGET
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{ … }
#endif
#ifdef SIMD_AVX
// Per-mode constants for the AVX-512 decodeBytesGroupSimd, looked up from bitslog2 (1 or 2):
//   [bitslog2 - 1] : sentinel code replicated into every byte (3 for 2-bit codes, 15 for 4-bit codes);
//                    it doubles as the AND mask that isolates one code per byte
//   [bitslog2 + 1] : per-byte bit offsets used as the control operand of _mm_multishift_epi64_epi8,
//                    listed so that the highest-order code of each selector byte comes first
static const __m128i decodeBytesGroupConfig[] = {
// sentinel / mask for 2-bit codes
_mm_set1_epi8(3),
// sentinel / mask for 4-bit codes
_mm_set1_epi8(15),
// bit offsets for 2-bit codes (step of 2 bits, four codes per selector byte)
_mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
// bit offsets for 4-bit codes (step of 4 bits, two codes per selector byte)
_mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
};
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
switch (bitslog2)
{
case 0:
{
__m128i result = _mm_setzero_si128();
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
return data;
}
case 1:
case 2:
{
const unsigned char* skip = data + (bitslog2 << 2);
__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
__m128i selw = _mm_shuffle_epi32(selb, 0x44);
__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);
__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
return skip + _mm_popcnt_u32(mask16);
}
case 3:
{
__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
return data + 16;
}
default:
assert(!"Unexpected bit length");
return data;
}
}
#endif
#ifdef SIMD_NEON
// Produces 16 bytes by table lookup: the low 8 lanes pick bytes of rest0 using
// the index row kDecodeBytesGroupShuffle[mask0], the high 8 lanes pick bytes of
// rest1 using the row kDecodeBytesGroupShuffle[mask1].
static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
	const uint8x8_t lo = vtbl1_u8(rest0, vld1_u8(kDecodeBytesGroupShuffle[mask0]));
	const uint8x8_t hi = vtbl1_u8(rest1, vld1_u8(kDecodeBytesGroupShuffle[mask1]));

	return vcombine_u8(lo, hi);
}
// Condenses a 16-byte lane mask into two 8-bit masks: mask0 for the low 64-bit
// half and mask1 for the high half.
// Assuming every byte of `mask` is 0x00 or 0xff (as a byte-wise compare
// produces), the multiply moves one bit per byte into the top byte of the
// product, so bit k of the result is set when byte k (bits 8k..8k+7 of the
// 64-bit lane value) is 0xff.
static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
	const uint64_t gather = 0x000103070f1f3f80ull;

	const uint64x2_t halves = vreinterpretq_u64_u8(mask);
	const uint64_t lo = vgetq_lane_u64(halves, 0);
	const uint64_t hi = vgetq_lane_u64(halves, 1);

	mask0 = uint8_t((lo * gather) >> 56);
	mask1 = uint8_t((hi * gather) >> 56);
}
// Decodes one group of 16 bytes into `buffer` (NEON path) and returns the input
// pointer advanced past the bytes this group consumed.
//
//   bitslog2 == 0 : the group is all zeros; nothing is read from `data`
//   bitslog2 == 1 : 4 selector bytes hold sixteen 2-bit codes
//   bitslog2 == 2 : 8 selector bytes hold sixteen 4-bit codes
//   bitslog2 == 3 : 16 literal bytes are copied through
//
// In modes 1 and 2 a code equal to the all-ones sentinel (3 or 15) means the
// output byte comes from the literal bytes stored after the selectors; any
// other code is the output byte itself.
// NOTE(review): the 8-byte loads below read a fixed width regardless of how many
// bytes the group actually consumes - presumably callers guarantee that much
// readable input; confirm against the caller.
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
switch (bitslog2)
{
case 0:
{
uint8x16_t result = vdupq_n_u8(0);
vst1q_u8(buffer, result);
return data;
}
case 1:
{
#ifdef SIMD_LATENCYOPT
// Scalar count of sentinel codes, computed straight from the selector bits so the
// returned pointer does not depend on the table lookups below.
unsigned int data32;
memcpy(&data32, data, 4);
// the low bit of each 2-bit field is now set only when both bits of that field were set (code == 3)
data32 &= data32 >> 1;
// spread the 16 flag bits so that each one lands on the low bit of a distinct nibble of data64
unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
// sum the per-nibble flags into the top nibble; a count of 16 would wrap to 0
// NOTE(review): presumably a group with 16 sentinels is never encoded in this mode - confirm
int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif
// expand the first 4 loaded bytes into sixteen 2-bit codes, highest-order code of each byte first
uint8x8_t sel2 = vld1_u8(data);
uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));
// lanes whose code is the sentinel take their value from the literal bytes
uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
unsigned char mask0, mask1;
neonMoveMask(mask, mask0, mask1);
// literals for the high half start after those used by the low half
// (kDecodeBytesGroupCount[m] is presumably the number of set bits in m - table is built elsewhere)
uint8x8_t rest0 = vld1_u8(data + 4);
uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);
uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
vst1q_u8(buffer, result);
#ifdef SIMD_LATENCYOPT
return data + 4 + datacnt;
#else
return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
}
case 2:
{
#ifdef SIMD_LATENCYOPT
// Same scalar sentinel count as in case 1, for 4-bit codes.
unsigned long long data64;
memcpy(&data64, data, 8);
// after both steps the low bit of each nibble is set only when all four bits were set (code == 15)
data64 &= data64 >> 1;
data64 &= data64 >> 2;
// sum the per-nibble flags into the top nibble; a count of 16 would wrap to 0
// NOTE(review): presumably a group with 16 sentinels is never encoded in this mode - confirm
int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif
// expand 8 selector bytes into sixteen 4-bit codes, high nibble of each byte first
uint8x8_t sel4 = vld1_u8(data);
uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
// lanes whose code is the sentinel take their value from the literal bytes
uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
unsigned char mask0, mask1;
neonMoveMask(mask, mask0, mask1);
// literals for the high half start after those used by the low half
uint8x8_t rest0 = vld1_u8(data + 8);
uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);
uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
vst1q_u8(buffer, result);
#ifdef SIMD_LATENCYOPT
return data + 8 + datacnt;
#else
return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
}
case 3:
{
uint8x16_t result = vld1q_u8(data);
vst1q_u8(buffer, result);
return data + 16;
}
default:
// any other bitslog2 is a caller error; with asserts disabled the input pointer is returned unchanged
assert(!"Unexpected bit length");
return data;
}
}
#endif
#ifdef SIMD_WASM
// Builds the 16-lane index vector used to swizzle literal bytes into place.
//
// The low 8 lanes are the index row kDecodeBytesGroupShuffle[mask0]. The high
// 8 lanes are the row kDecodeBytesGroupShuffle[mask1] with
// kDecodeBytesGroupCount[mask0] added to every index, so that a single swizzle
// over one 16-byte literal vector serves both halves: indices for the high
// half skip the literals already claimed by the low half.
//
// Loads are sized to exactly what is needed (one 8-byte row, one count byte).
// A full 16-byte load would read past the end of kDecodeBytesGroupShuffle for
// the last row and past the end of kDecodeBytesGroupCount for mask0 > 240.
SIMD_TARGET
static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	// one table row per half; the upper 64 bits are zeroed and discarded by the final unpack
	v128_t sm0 = wasm_v128_load64_zero(kDecodeBytesGroupShuffle[mask0]);
	v128_t sm1 = wasm_v128_load64_zero(kDecodeBytesGroupShuffle[mask1]);

	// broadcast the low half's literal count into every lane
	v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]);

	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);

	return wasmx_unpacklo_v64x2(sm0, sm1r);
}
// Condenses a 16-byte lane mask into two 8-bit masks: mask0 for the low 64-bit
// lane and mask1 for the high lane.
// Assuming every byte of `mask` is 0x00 or 0xff (as a byte-wise compare
// produces), the multiply moves one bit per byte into the top byte of the
// product, so bit k of the result is set when byte k (bits 8k..8k+7 of the
// 64-bit lane value) is 0xff.
SIMD_TARGET
static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
	const uint64_t gather = 0x000103070f1f3f80ull;

	const uint64_t lo = wasm_i64x2_extract_lane(mask, 0);
	const uint64_t hi = wasm_i64x2_extract_lane(mask, 1);

	mask0 = uint8_t((lo * gather) >> 56);
	mask1 = uint8_t((hi * gather) >> 56);
}
// Decodes one group of 16 bytes into `buffer` (WebAssembly SIMD path) and
// returns the input pointer advanced past the bytes this group consumed.
//
//   bitslog2 == 0 : the group is all zeros; nothing is read from `data`
//   bitslog2 == 1 : selectors hold 2-bit codes; literals start at data + 4
//   bitslog2 == 2 : selectors hold 4-bit codes; literals start at data + 8
//   bitslog2 == 3 : 16 literal bytes are copied through
//
// In modes 1 and 2 a code equal to the all-ones sentinel (3 or 15) means the
// output byte comes from the literal bytes stored after the selectors; any
// other code is the output byte itself.
// NOTE(review): modes 1 and 2 issue full 16-byte loads at `data` and at the
// literal offset regardless of how many bytes are consumed - presumably callers
// guarantee that much readable input; confirm against the caller.
SIMD_TARGET
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
switch (bitslog2)
{
case 0:
{
v128_t result = wasm_i8x16_splat(0);
wasm_v128_store(buffer, result);
return data;
}
case 1:
{
v128_t sel2 = wasm_v128_load(data);
v128_t rest = wasm_v128_load(data + 4);
// spread the packed selectors to one code per byte lane; stray bits left by the
// 16-bit shifts are cleared by the AND with 3
// (wasmx_unpacklo_v8x16 is defined earlier in the file; presumably it interleaves
// the low bytes of its two operands)
v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
// lanes whose code is the sentinel take their value from the literal bytes
v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
unsigned char mask0, mask1;
wasmMoveMask(mask, mask0, mask1);
v128_t shuf = decodeShuffleMask(mask0, mask1);
// swizzled literals where mask is set, the code itself elsewhere
v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
wasm_v128_store(buffer, result);
// skip the selectors plus the literals used by each half
// (kDecodeBytesGroupCount[m] is presumably the number of set bits in m - table is built elsewhere)
return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
}
case 2:
{
v128_t sel4 = wasm_v128_load(data);
v128_t rest = wasm_v128_load(data + 8);
// spread the packed selectors to one 4-bit code per byte lane; stray bits left by
// the 16-bit shift are cleared by the AND with 15
v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
// lanes whose code is the sentinel take their value from the literal bytes
v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
unsigned char mask0, mask1;
wasmMoveMask(mask, mask0, mask1);
v128_t shuf = decodeShuffleMask(mask0, mask1);
// swizzled literals where mask is set, the code itself elsewhere
v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
wasm_v128_store(buffer, result);
// skip the selectors plus the literals used by each half
return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
}
case 3:
{
v128_t result = wasm_v128_load(data);
wasm_v128_store(buffer, result);
return data + 16;
}
default:
// any other bitslog2 is a caller error; with asserts disabled the input pointer is returned unchanged
assert(!"Unexpected bit length");
return data;
}
}
#endif
#if defined(SIMD_SSE) || defined(SIMD_AVX)
SIMD_TARGET
static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{ … }
SIMD_TARGET
static __m128i unzigzag8(__m128i v)
{ … }
#endif
#ifdef SIMD_NEON
// In-place byte transpose of four 16-byte rows.
// On return the rows hold 4-byte groups {x0[i], x1[i], x2[i], x3[i]} of the
// original inputs: x0 gets the groups for i = 0..3, x1 for i = 4..7,
// x2 for i = 8..11 and x3 for i = 12..15.
static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
	// stage 1: interleave bytes of rows (0,1) and of rows (2,3)
	const uint8x16x2_t pair01 = vzipq_u8(x0, x1);
	const uint8x16x2_t pair23 = vzipq_u8(x2, x3);

	// stage 2: interleave the resulting byte pairs as 16-bit units
	const uint16x8_t lo01 = vreinterpretq_u16_u8(pair01.val[0]);
	const uint16x8_t lo23 = vreinterpretq_u16_u8(pair23.val[0]);
	const uint16x8_t hi01 = vreinterpretq_u16_u8(pair01.val[1]);
	const uint16x8_t hi23 = vreinterpretq_u16_u8(pair23.val[1]);

	const uint16x8x2_t lower = vzipq_u16(lo01, lo23);
	const uint16x8x2_t upper = vzipq_u16(hi01, hi23);

	x0 = vreinterpretq_u8_u16(lower.val[0]);
	x1 = vreinterpretq_u8_u16(lower.val[1]);
	x2 = vreinterpretq_u8_u16(upper.val[0]);
	x3 = vreinterpretq_u8_u16(upper.val[1]);
}
// Per-byte computation of (-(v & 1)) ^ (v >> 1): the lowest bit of each byte
// becomes an all-ones or all-zeros mask that is XORed with the remaining seven
// bits shifted down by one.
static uint8x16_t unzigzag8(uint8x16_t v)
{
	const uint8x16_t lowbit = vandq_u8(v, vdupq_n_u8(1));

	// 0x00 when the low bit is clear, 0xff when it is set
	const uint8x16_t flip = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(lowbit)));
	const uint8x16_t half = vshrq_n_u8(v, 1);

	return veorq_u8(flip, half);
}
#endif
#ifdef SIMD_WASM
// In-place transpose of four 16-byte rows in two interleave stages: bytes of
// row pairs (0,1) and (2,3) first, then the results as 16-bit units.
// NOTE(review): the wasmx_unpack* macros are defined earlier in the file and
// are presumably low/high interleaves at the named lane width - confirm.
SIMD_TARGET
static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
	// stage 1: 8-bit interleave within each row pair
	v128_t lo01 = wasmx_unpacklo_v8x16(x0, x1);
	v128_t hi01 = wasmx_unpackhi_v8x16(x0, x1);
	v128_t lo23 = wasmx_unpacklo_v8x16(x2, x3);
	v128_t hi23 = wasmx_unpackhi_v8x16(x2, x3);

	// stage 2: 16-bit interleave across the two pairs
	x0 = wasmx_unpacklo_v16x8(lo01, lo23);
	x1 = wasmx_unpackhi_v16x8(lo01, lo23);
	x2 = wasmx_unpacklo_v16x8(hi01, hi23);
	x3 = wasmx_unpackhi_v16x8(hi01, hi23);
}
// Per-byte computation of (-(v & 1)) ^ (v >> 1): the lowest bit of each byte
// becomes an all-ones or all-zeros mask that is XORed with the remaining seven
// bits shifted down by one (unsigned shift).
SIMD_TARGET
static v128_t unzigzag8(v128_t v)
{
	v128_t lowbit = wasm_v128_and(v, wasm_i8x16_splat(1));

	// 0x00 when the low bit is clear, 0xff when it is set
	v128_t flip = wasm_i8x16_neg(lowbit);
	v128_t half = wasm_u8x16_shr(v, 1);

	return wasm_v128_xor(flip, half);
}
#endif
#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
SIMD_TARGET
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{ … }
SIMD_TARGET
static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{ … }
#endif
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
static unsigned int getCpuFeatures()
{ … }
static unsigned int cpuid = …;
#endif
}
size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
{ … }
size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
{ … }
void meshopt_encodeVertexVersion(int version)
{ … }
int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{ … }
#undef SIMD_NEON
#undef SIMD_SSE
#undef SIMD_AVX
#undef SIMD_WASM
#undef SIMD_FALLBACK
#undef SIMD_TARGET
#undef SIMD_LATENCYOPT