#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/architectures.h"
#if defined(LIBC_TARGET_ARCH_IS_X86)
#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) || \
    defined(__SSE2__)
#include <immintrin.h>
#endif
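// Stand-in definitions for toolchains (MSVC, SCE) that do not define the
// GCC-style feature macros, so the intrinsic names referenced below still
// compile when the corresponding extension is unavailable.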
#if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm512_cmpneq_epi8_mask …
#endif
#if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm256_movemask_epi8 …
#endif
#if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm_movemask_epi8 …
#endif
namespace LIBC_NAMESPACE_DECL {
namespace x86 {
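// Compile-time CPU feature flags used to select between implementations.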
LIBC_INLINE_VAR constexpr bool K_SSE2 = …;
LIBC_INLINE_VAR constexpr bool K_SSE41 = …;
LIBC_INLINE_VAR constexpr bool K_AVX = …;
LIBC_INLINE_VAR constexpr bool K_AVX2 = …;
LIBC_INLINE_VAR constexpr bool K_AVX512_F = …;
LIBC_INLINE_VAR constexpr bool K_AVX512_BW = …;
struct Memcpy { … };
} // namespace x86
} // namespace LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
namespace generic {
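// 'branchless_head_tail_neq' compares the first and the last sizeof(T) bytes
// of two buffers in a single branchless step. For counts between sizeof(T)
// and 2 * sizeof(T) the two (possibly overlapping) loads cover the whole
// buffer. A minimal scalar sketch of the idea, mirroring the vector
// specializations below (hypothetical uint64_t instantiation):
//
//   uint64_t head = load<uint64_t>(p1, 0) ^ load<uint64_t>(p2, 0);
//   uint64_t tail = load<uint64_t>(p1, count - 8) ^
//                   load<uint64_t>(p2, count - 8);
//   return (head | tail) != 0; // nonzero iff any covered byte differs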
template <typename T>
LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) { … }
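// For the scalar types only one of 'cmp' / 'cmp_neq' needs a definition: when
// 'cmp_is_expensive' is false_type the dispatch calls 'cmp' directly and
// 'cmp_neq' is only declared (never instantiated); for uint64_t the roles are
// reversed. This presumably mirrors the dispatch logic in op_generic.h.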
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type { … };
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) { … }
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) { … }
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) { … }
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);
template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type { … };
template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) { … }
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) { … }
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) { … }
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);
template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type { … };
template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) { … }
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) { … }
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
size_t offset) { … }
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
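// SIMD types such as '__m128i' are typedefs carrying vector_size/aligned
// attributes. When they appear as template arguments those attributes are
// dropped and GCC warns with -Wignored-attributes; the specializations below
// are unaffected, so the warning is silenced for the rest of this block.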
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
return _mm_xor_si128(a, b);
}
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
return _mm_max_epu8(a, b);
}
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15));
}
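// Returns a 16-bit mask with one bit per byte of 'value' that equals the
// corresponding byte of 'max'. '_mm_movemask_epi8' collects byte 0 into bit 0
// (little endian), so the bytes are reversed first; in the resulting mask,
// byte 0 of the vector occupies the most significant bit, i.e. the mask reads
// like a big-endian integer.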
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
return static_cast<uint16_t>(
_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
LIBC_INLINE bool is_zero(__m128i value) {
return _mm_testz_si128(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
return is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
return !is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
size_t count) {
const __m128i head = load_and_xor_m128i(p1, p2, 0);
const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
return !is_zero(_mm_or_si128(head, tail));
}
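// Ordering of two vectors that are known to differ:
//  - 'ge' has bit i set iff a[i] == vmax[i], i.e. a[i] >= b[i];
//  - 'le' has bit i set iff b[i] == vmax[i], i.e. b[i] >= a[i].
// Equal bytes set the bit in both masks. Since the masks are in big-endian
// byte order, the most significant bit where they differ corresponds to the
// first differing byte, and exactly one of the two masks has it set. Hence
// comparing 'ge' and 'le' as unsigned integers (here via subtraction after
// widening to int32_t) yields the memcmp ordering.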
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
#endif // __SSE4_1__
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
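// Plain AVX has no 256-bit integer xor/or ('_mm256_xor_si256' and
// '_mm256_or_si256' require AVX2), so the integer vectors are bit-cast to
// '__m256' and the bitwise float-domain operations are used instead.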
LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
return _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
return _mm256_castps_si256(
_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
return xor_m256i(a, b);
}
LIBC_INLINE bool is_zero(__m256i value) {
return _mm256_testz_si256(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
return is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
return !is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
size_t count) {
const __m256i head = load_and_xor_m256i(p1, p2, 0);
const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
return !is_zero(or_m256i(head, tail));
}
#endif // __AVX__
#if defined(__AVX2__)
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
return _mm256_max_epu8(a, b);
}
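// Same contract as the __m128i version: one bit per byte, with byte 0 of the
// vector in the most significant bit of the 32-bit mask. With AVX512VBMI+VL a
// single cross-lane byte permute performs the reversal; otherwise an in-lane
// shuffle plus a scalar half-swap is used.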
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
const __m256i big_endian_byte_mask =
_mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31),
little_endian_byte_mask);
  return static_cast<uint32_t>(_mm256_movemask_epi8(big_endian_byte_mask));
#else
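  // '_mm256_shuffle_epi8' shuffles within each 128-bit lane independently, so
  // the shuffle below only reverses the bytes inside each half; swapping the
  // two 16-bit halves of the movemask result completes the 32-byte reversal.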
const __m256i half_reversed = _mm256_shuffle_epi8(
little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15));
  const uint32_t half_reversed_scalar =
      static_cast<uint32_t>(_mm256_movemask_epi8(half_reversed));
return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
#endif
}
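// The difference of two 32-bit masks does not fit in an int32_t, so
// 'cmp_neq_uint64_t' (provided by op_generic.h) is used to turn the two
// big-endian masks into a memcmp-style result.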
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX2__
#if defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
return _mm512_max_epu8(a, b);
}
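// With AVX512BW, '_mm512_cmpeq_epi8_mask' already yields a 64-bit mask with
// one bit per byte (byte 0 in bit 0). In the enabled path below, the shuffle
// reverses the bytes within each 64-bit qword and '__builtin_bswap64' then
// reverses the eight 8-bit groups of the mask, which together produces the
// big-endian bit order.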
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
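  // Note: the VBMI variant is intentionally disabled (the leading 'false &&');
  // the shuffle + bswap path below is used instead.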
#if false && defined(__AVX512VBMI__)
const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63);
return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max),
_mm512_permutexvar_epi8(indices, value));
#else
const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7);
return __builtin_bswap64(
_mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices),
_mm512_shuffle_epi8(value, indices)));
#endif
}
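// AVX512BW provides byte-granular mask compares, so equality checks use
// '_mm512_cmpneq_epi8_mask' directly instead of the xor + testz sequence used
// for the smaller vector types.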
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
return _mm512_cmpneq_epi8_mask(a, b) != 0;
}
LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
return _mm512_xor_epi64(a, b);
}
LIBC_INLINE bool is_zero(__m512i value) {
return _mm512_test_epi32_mask(value, value) == 0;
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
size_t count) {
const __m512i head = load_and_xor_m512i(p1, p2, 0);
const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
return !is_zero(_mm512_or_epi64(head, tail));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__
#pragma GCC diagnostic pop
} // namespace generic
} // namespace LIBC_NAMESPACE_DECL
#endif // LIBC_TARGET_ARCH_IS_X86
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H