#include "folly/external/fast-crc32/sse_crc32c_v8s3x3.h"
#include <stdint.h>
#define CRC_EXPORT …
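// When FOLLY_ENABLE_SSE42_CRC32C_V8S3X3 is not defined, only a stub is
// compiled; the SSE4.2/PCLMUL implementation below is omitted.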
#if !defined(FOLLY_ENABLE_SSE42_CRC32C_V8S3X3)
#include <stdlib.h>
namespace folly::detail {
CRC_EXPORT uint32_t sse_crc32c_v8s3x3(const uint8_t*, size_t, uint32_t) { … }
} // namespace folly::detail
#else
#include <nmmintrin.h>
#include <wmmintrin.h>
#if defined(_MSC_VER)
#define CRC_AINLINE …
#define CRC_ALIGN …
#else
#define CRC_AINLINE …
#define CRC_ALIGN …
#endif
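// As used below, clmul_lo/clmul_hi perform a carry-less multiply (PCLMULQDQ)
// of the low / high 64-bit lanes of their two vector operands.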
#define clmul_lo …
#define clmul_hi …
namespace folly::detail {
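// Carry-less multiply of two 32-bit values; the product occupies the low
// 64-bit lane of the result.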
CRC_AINLINE __m128i clmul_scalar(uint32_t a, uint32_t b) {
  return _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0);
}
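// Compute x^n mod P (bit-reflected CRC-32C polynomial), in O(log n) time.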
static uint32_t xnmodp(uint64_t n) {
  uint64_t stack = ~(uint64_t)1;
  uint32_t acc, low;
  for (; n > 191; n = (n >> 1) - 16) {
    stack = (stack << 1) + (n & 1);
  }
  stack = ~stack;
  acc = ((uint32_t)0x80000000) >> (n & 31);
  for (n >>= 5; n; --n) {
    acc = _mm_crc32_u32(acc, 0);
  }
  while ((low = stack & 1), stack >>= 1) {
    __m128i x = _mm_cvtsi32_si128(acc);
    uint64_t y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0));
    acc = _mm_crc32_u64(0, y << low);
  }
  return acc;
}
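// Multiply a partial CRC by x^(8*nbytes) mod P, i.e. shift it forward past
// nbytes of data so that independently computed streams can be combined.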
CRC_AINLINE __m128i crc_shift(uint32_t crc, size_t nbytes) {
  return clmul_scalar(crc, xnmodp(nbytes * 8 - 33));
}
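// CRC-32C of [buf, buf + len), seeded with crc0. Large inputs are processed
// 200 bytes per step: 8 x 16 bytes are folded with PCLMULQDQ while three
// independent 24-byte streams go through the scalar CRC32 instruction, and
// the partial CRCs are recombined with crc_shift at the end (the "v8s3x3"
// layout: 8 vector accumulators, 3 scalar streams, 3 CRC32 ops per stream).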
FOLLY_TARGET_ATTRIBUTE("sse4.2")
CRC_EXPORT uint32_t sse_crc32c_v8s3x3(const uint8_t* buf, size_t len, uint32_t crc0) {
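  // Consume single bytes until buf is 8-byte aligned, then one 64-bit word
  // when buf is 8- but not 16-byte aligned, keeping the 16-byte loads below
  // aligned.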
  for (; len && ((uintptr_t)buf & 7); --len) {
    crc0 = _mm_crc32_u8(crc0, *buf++);
  }
  if (((uintptr_t)buf & 8) && len >= 8) {
    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf);
    buf += 8;
    len -= 8;
  }
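  // Bulk path: needs at least 208 bytes (one 200-byte block plus the 8-byte
  // word that absorbs the combined remainder below).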
  if (len >= 208) {
    size_t blk = (len - 8) / 200;
    size_t klen = blk * 24;
    const uint8_t* buf2 = buf + 0;
    uint32_t crc1 = 0;
    uint32_t crc2 = 0;
    __m128i vc0;
    __m128i vc1;
    uint64_t vc;
    __m128i x0 = _mm_loadu_si128((const __m128i*)buf2), y0;
    __m128i x1 = _mm_loadu_si128((const __m128i*)(buf2 + 16)), y1;
    __m128i x2 = _mm_loadu_si128((const __m128i*)(buf2 + 32)), y2;
    __m128i x3 = _mm_loadu_si128((const __m128i*)(buf2 + 48)), y3;
    __m128i x4 = _mm_loadu_si128((const __m128i*)(buf2 + 64)), y4;
    __m128i x5 = _mm_loadu_si128((const __m128i*)(buf2 + 80)), y5;
    __m128i x6 = _mm_loadu_si128((const __m128i*)(buf2 + 96)), y6;
    __m128i x7 = _mm_loadu_si128((const __m128i*)(buf2 + 112)), y7;
    __m128i k;
    k = _mm_setr_epi32(0x6992cea2, 0, 0x0d3b6092, 0);
    x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
    crc0 = 0;
    buf2 += 128;
    len -= 200;
    buf += blk * 128;
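    // Main loop: each iteration folds 128 bytes into the vector accumulators
    // and feeds 24 bytes to each of the three scalar CRC32 streams.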
    while (len >= 208) {
      y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
      y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
      y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
      y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
      y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
      y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
      y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
      y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
      y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf2)), x0 = _mm_xor_si128(x0, y0);
      y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf2 + 16))), x1 = _mm_xor_si128(x1, y1);
      y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf2 + 32))), x2 = _mm_xor_si128(x2, y2);
      y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf2 + 48))), x3 = _mm_xor_si128(x3, y3);
      y4 = _mm_xor_si128(y4, _mm_loadu_si128((const __m128i*)(buf2 + 64))), x4 = _mm_xor_si128(x4, y4);
      y5 = _mm_xor_si128(y5, _mm_loadu_si128((const __m128i*)(buf2 + 80))), x5 = _mm_xor_si128(x5, y5);
      y6 = _mm_xor_si128(y6, _mm_loadu_si128((const __m128i*)(buf2 + 96))), x6 = _mm_xor_si128(x6, y6);
      y7 = _mm_xor_si128(y7, _mm_loadu_si128((const __m128i*)(buf2 + 112))), x7 = _mm_xor_si128(x7, y7);
      crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf);
      crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen));
      crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2));
      crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 8));
      crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 8));
      crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8));
      crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 16));
      crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 16));
      crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16));
      buf += 24;
      buf2 += 128;
      len -= 200;
    }
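    // Reduce the eight accumulators x0..x7 down to x0.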
    k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
    y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
    y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
    y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
    y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
    y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
    y4 = _mm_xor_si128(y4, x5), x4 = _mm_xor_si128(x4, y4);
    y6 = _mm_xor_si128(y6, x7), x6 = _mm_xor_si128(x6, y6);
    k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
    y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
    y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
    y4 = _mm_xor_si128(y4, x6), x4 = _mm_xor_si128(x4, y4);
    k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
    y0 = _mm_xor_si128(y0, x4), x0 = _mm_xor_si128(x0, y0);
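    // Final 24 bytes of each scalar stream.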
    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf);
    crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen));
    crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2));
    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 8));
    crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 8));
    crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8));
    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 16));
    crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 16));
    crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16));
    buf += 24;
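    // Combine the streams: shift crc0 and crc1 forward over the data that
    // followed them, reduce x0 to a 32-bit value and shift it over all three
    // streams, then fold everything into the final 8-byte word.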
    vc0 = crc_shift(crc0, klen * 2 + 8);
    vc1 = crc_shift(crc1, klen + 8);
    vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0);
    vc ^= _mm_extract_epi64(crc_shift(_mm_crc32_u64(_mm_crc32_u64(0, _mm_extract_epi64(x0, 0)), _mm_extract_epi64(x0, 1)), klen * 3 + 8), 0);
    buf += klen * 2;
    crc0 = crc2;
    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf ^ vc), buf += 8;
    len -= 8;
  }
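  // Remaining full 8-byte words, then any trailing bytes.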
  for (; len >= 8; buf += 8, len -= 8) {
    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf);
  }
  for (; len; --len) {
    crc0 = _mm_crc32_u8(crc0, *buf++);
  }
  return crc0;
}
} // namespace folly::detail
#endif