#ifdef __SSE4_2__
#include <immintrin.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/optimization.h"
#include "absl/base/prefetch.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"
#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
namespace {
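// Copies and CRCs `length` bytes one byte at a time.  For copies this short,
// being nice to the branch predictor matters more than vector width.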
inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
crc32c_t crc) {
uint32_t crc_uint32 = static_cast<uint32_t>(crc);
for (std::size_t i = 0; i < length; i++) {
uint8_t data = *reinterpret_cast<const uint8_t*>(src);
crc_uint32 = CRC32_u8(crc_uint32, data);
*reinterpret_cast<uint8_t*>(dst) = data;
++src;
++dst;
}
return crc32c_t{crc_uint32};
}
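
// Number of 64-bit integer loads needed to cover one V128 vector.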
constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);
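
// Copies and CRCs the last `copy_rounds` V128-sized blocks of several
// interleaved regions at once, advancing *src and *dst as it goes.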
template <size_t vec_regions, size_t int_regions>
ABSL_ATTRIBUTE_NO_SANITIZE_UNDEFINED inline void LargeTailCopy(
crc32c_t* crcs, char** dst, const char** src, size_t region_size,
size_t copy_rounds) {
std::array<V128, vec_regions> data;
std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
while (copy_rounds > 0) {
for (size_t i = 0; i < vec_regions; i++) {
size_t region = i;
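      // Unaligned load from the source, aligned store to the destination
      // (Compute() aligns the destination first), then fold both 64-bit
      // halves of the block into this region's running CRC.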
auto* vsrc = reinterpret_cast<const V128*>(*src + region_size * region);
auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);
data[i] = V128_LoadU(vsrc);
V128_Store(vdst, data[i]);
crcs[region] = crc32c_t{static_cast<uint32_t>(
CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
}
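    // Integer regions: copy via 64-bit loads and stores, CRCing each word on
    // the way.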
for (size_t i = 0; i < int_regions; i++) {
size_t region = vec_regions + i;
auto* usrc =
reinterpret_cast<const uint64_t*>(*src + region_size * region);
auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
for (size_t j = 0; j < kIntLoadsPerVec; j++) {
size_t data_index = i * kIntLoadsPerVec + j;
int_data[data_index] = *(usrc + j);
crcs[region] = crc32c_t{CRC32_u64(static_cast<uint32_t>(crcs[region]),
int_data[data_index])};
*(udst + j) = int_data[data_index];
}
}
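    // Advance both pointers by one block.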
*src += sizeof(V128);
*dst += sizeof(V128);
--copy_rounds;
}
}
}  // namespace
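
// Engine that performs the copy and CRC in `vec_regions + int_regions`
// independent regions, interleaving them to keep the CPU's CRC and load/store
// units busy; the per-region CRCs are concatenated at the end of Compute().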
template <size_t vec_regions, size_t int_regions>
class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
public:
AcceleratedCrcMemcpyEngine() = default;
AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete;
  AcceleratedCrcMemcpyEngine& operator=(const AcceleratedCrcMemcpyEngine&) =
      delete;
crc32c_t Compute(void* __restrict dst, const void* __restrict src,
std::size_t length, crc32c_t initial_crc) const override;
};
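
// A minimal usage sketch (the buffer pointers and length are placeholders):
//   AcceleratedCrcMemcpyEngine<3, 0> engine;
//   crc32c_t crc = engine.Compute(dst, src, length, crc32c_t{0});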
template <size_t vec_regions, size_t int_regions>
ABSL_ATTRIBUTE_NO_SANITIZE_UNDEFINED crc32c_t
AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
void* __restrict dst, const void* __restrict src, std::size_t length,
crc32c_t initial_crc) const {
constexpr std::size_t kRegions = vec_regions + int_regions;
static_assert(kRegions > 0, "Must specify at least one region.");
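  // Mask used to flip between the finalized (inverted) crc32c_t form and the
  // raw state that the CRC32 instructions consume.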
constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
constexpr std::size_t kBlockSize = sizeof(V128);
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
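
  // Number of V128 blocks per cache line.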
constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize;
char* dst_bytes = static_cast<char*>(dst);
const char* src_bytes = static_cast<const char*>(src);
static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0,
"Cache lines are not divided evenly into blocks, may have "
"unintended behavior!");
constexpr size_t kCrcSmallSize = 256;
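  // Prefetch distance for the main loop: two cache lines ahead.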
constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE;
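
  // Small-size path: a plain CRC followed by memcpy.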
if (length < kCrcSmallSize) {
crc32c_t crc =
ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length));
memcpy(dst, src, length);
return crc;
}
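
  // Start work on the CRC: undo the finalization XOR of the incoming CRC (or
  // set up the initial all-ones state).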
initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};
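
  // Align the destination to a block boundary so the main loop can use
  // aligned stores; unaligned loads are cheap by comparison on modern CPUs.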
std::size_t bytes_from_last_aligned =
reinterpret_cast<uintptr_t>(dst) & (kBlockSize - 1);
if (bytes_from_last_aligned != 0) {
std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned;
initial_crc =
ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc);
src_bytes += bytes_for_alignment;
dst_bytes += bytes_for_alignment;
length -= bytes_for_alignment;
}
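
  // The copy and CRC proceed in kRegions independent regions so the CRC unit
  // stays saturated.  Region 0 continues `initial_crc`; the others start from
  // the raw all-ones state.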
crc32c_t crcs[kRegions];
crcs[0] = initial_crc;
for (size_t i = 1; i < kRegions; i++) {
crcs[i] = crc32c_t{kCrcDataXor};
}
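
  // Number of whole copy rounds, the resulting size of each region, and the
  // size of the leftover tail.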
size_t copy_rounds = length / kCopyRoundSize;
const std::size_t region_size = copy_rounds * kBlockSize;
const std::size_t tail_size = length - (kRegions * region_size);
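
  // Holding registers for in-flight data from each region.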
std::array<V128, vec_regions> vec_data;
std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;
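
  // Main loop: each iteration copies and CRCs one cache line per region,
  // prefetching ahead first.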
while (copy_rounds > kBlocksPerCacheLine) {
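    // Prefetch kPrefetchAhead bytes ahead of each region's source (and, on
    // x86, destination) pointer.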
for (size_t i = 0; i < kRegions; i++) {
absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
#endif
}
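    // Load, CRC, and store the data, one block per region at a time.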
for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
for (size_t j = 0; j < vec_regions; j++) {
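        // Rotate which regions use the vector units and which use the integer
        // units, to engage the prefetcher around the vector load/stores and
        // to save issue slots with the integer registers.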
size_t region = (j + i) % kRegions;
auto* vsrc =
reinterpret_cast<const V128*>(src_bytes + region_size * region);
auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);
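        // Unaligned load, fold both 64-bit halves into the CRC, then store
        // aligned.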
vec_data[j] = V128_LoadU(vsrc + i);
crcs[region] = crc32c_t{static_cast<uint32_t>(
CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};
V128_Store(vdst + i, vec_data[j]);
}
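      // The same scheme for the integer regions, via 64-bit loads and stores.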
for (size_t j = 0; j < int_regions; j++) {
size_t region = (j + vec_regions + i) % kRegions;
auto* usrc =
reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
auto* udst =
reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
for (size_t k = 0; k < kIntLoadsPerVec; k++) {
size_t data_index = j * kIntLoadsPerVec + k;
int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
crcs[region] = crc32c_t{CRC32_u64(static_cast<uint32_t>(crcs[region]),
int_data[data_index])};
*(udst + i * kIntLoadsPerVec + k) = int_data[data_index];
}
}
}
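    // Advance past the cache lines just processed.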
src_bytes += kBlockSize * kBlocksPerCacheLine;
dst_bytes += kBlockSize * kBlocksPerCacheLine;
copy_rounds -= kBlocksPerCacheLine;
}
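
  // Copy and CRC the remaining blocks at the tail of each region.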
LargeTailCopy<vec_regions, int_regions>(crcs, &dst_bytes, &src_bytes,
region_size, copy_rounds);
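
  // Advance the pointers to the end of the last region.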
src_bytes += region_size * (kRegions - 1);
dst_bytes += region_size * (kRegions - 1);
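
  // Copy and CRC the whole blocks of the tail through a single region.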
std::size_t tail_blocks = tail_size / kBlockSize;
LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
tail_blocks);
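
  // Copy the final few bytes (fewer than one block) one byte at a time.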
crcs[kRegions - 1] =
ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
crcs[kRegions - 1]);
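
  // With a single region there is nothing to concatenate: finalize and
  // return.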
if (kRegions == 1) {
return crc32c_t{static_cast<uint32_t>(crcs[0]) ^ kCrcDataXor};
}
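
  // Finalize the first kRegions - 1 CRCs: undo the XOR applied before the
  // block copies.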
for (size_t i = 0; i + 1 < kRegions; i++) {
crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
}
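
  // Concatenate the CRCs of the first kRegions - 1 regions in order.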
crc32c_t full_crc = crcs[0];
for (size_t i = 1; i + 1 < kRegions; i++) {
full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
}
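
  // Finalize the last region's CRC, concatenate it (its span also covers the
  // tail), and return the result.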
crcs[kRegions - 1] =
crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
}
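
// Returns the engines to use for temporal and non-temporal CRC-memcpys on the
// detected CPU.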
CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
#ifdef UNDEFINED_BEHAVIOR_SANITIZER
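  // UBSan does not play nicely with the unaligned loads used heavily here, so
  // substitute the fallback engine for the accelerated temporal copies.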
CpuType cpu_type = GetCpuType();
switch (cpu_type) {
case CpuType::kAmdRome:
case CpuType::kAmdNaples:
case CpuType::kAmdMilan:
case CpuType::kAmdGenoa:
case CpuType::kAmdRyzenV3000:
case CpuType::kIntelCascadelakeXeon:
case CpuType::kIntelSkylakeXeon:
case CpuType::kIntelSkylake:
case CpuType::kIntelBroadwell:
case CpuType::kIntelHaswell:
case CpuType::kIntelIvybridge:
return {
new FallbackCrcMemcpyEngine(),
new CrcNonTemporalMemcpyAVXEngine(),
};
case CpuType::kIntelSandybridge:
return {
new FallbackCrcMemcpyEngine(),
new CrcNonTemporalMemcpyEngine(),
};
default:
return {new FallbackCrcMemcpyEngine(),
new FallbackCrcMemcpyEngine()};
}
#else
CpuType cpu_type = GetCpuType();
switch (cpu_type) {
case CpuType::kAmdRome:
case CpuType::kAmdNaples:
case CpuType::kAmdMilan:
case CpuType::kAmdGenoa:
case CpuType::kAmdRyzenV3000:
return {
new AcceleratedCrcMemcpyEngine<1, 2>(),
new CrcNonTemporalMemcpyAVXEngine(),
};
case CpuType::kIntelCascadelakeXeon:
case CpuType::kIntelSkylakeXeon:
case CpuType::kIntelSkylake:
case CpuType::kIntelBroadwell:
case CpuType::kIntelHaswell:
case CpuType::kIntelIvybridge:
return {
new AcceleratedCrcMemcpyEngine<3, 0>(),
new CrcNonTemporalMemcpyAVXEngine(),
};
case CpuType::kIntelSandybridge:
return {
new AcceleratedCrcMemcpyEngine<3, 0>(),
new CrcNonTemporalMemcpyEngine(),
};
default:
return {new FallbackCrcMemcpyEngine(),
new FallbackCrcMemcpyEngine()};
}
#endif
}
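
// Returns an engine with the requested region split for testing, or nullptr
// if that combination is not instantiated here.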
std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
int integer) {
if (vector == 3 && integer == 0) {
return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
} else if (vector == 1 && integer == 2) {
return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
} else if (vector == 1 && integer == 0) {
return std::make_unique<AcceleratedCrcMemcpyEngine<1, 0>>();
}
return nullptr;
}
}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl
#endif  // defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) ||
        // defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)