#include <folly/detail/RangeSse42.h>
#include <cassert>
#include <folly/Portability.h>
#if !FOLLY_SSE_PREREQ(4, 2)
namespace folly {
namespace detail {
size_t qfind_first_byte_of_sse42(
const StringPieceLite haystack, const StringPieceLite needles) { … }
}
}
#else
#include <emmintrin.h>
#include <nmmintrin.h>
#include <smmintrin.h>
#include <cstdint>
#include <limits>
#include <string>
#include <folly/Likely.h>
#include <folly/detail/Sse.h>
namespace folly {
namespace detail {
static constexpr size_t kMinPageSize = 4096;
static_assert(
kMinPageSize >= 16, "kMinPageSize must be at least SSE register size");
template <typename T>
static inline uintptr_t page_for(T* addr) {
return reinterpret_cast<uintptr_t>(addr) / kMinPageSize;
}
static inline size_t nextAlignedIndex(const char* arr) {
auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
return 1 +
((firstPossible + 15) & ~0xF)
- firstPossible;
}
size_t qfind_first_byte_of_needles16(
const StringPieceLite haystack, const StringPieceLite needles) {
assert(haystack.size() > 0u);
assert(needles.size() > 0u);
assert(needles.size() <= 16u);
if ((needles.size() <= 2 && haystack.size() >= 256) ||
(haystack.size() < 16 &&
page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) ||
page_for(needles.end() - 1) != page_for(needles.data() + 15)) {
return detail::qfind_first_byte_of_nosse(haystack, needles);
}
auto arr2 = _mm_loadu_si128_unchecked(
reinterpret_cast<const __m128i*>(needles.data()));
auto arr1 = _mm_loadu_si128_unchecked(
reinterpret_cast<const __m128i*>(haystack.data()));
auto index =
_mm_cmpestri(arr2, int(needles.size()), arr1, int(haystack.size()), 0);
if (index < 16) {
return size_t(index);
}
size_t i = nextAlignedIndex(haystack.data());
for (; i < haystack.size(); i += 16) {
arr1 = _mm_load_si128_unchecked(
reinterpret_cast<const __m128i*>(haystack.data() + i));
index = _mm_cmpestri(
arr2, int(needles.size()), arr1, int(haystack.size() - i), 0);
if (index < 16) {
return i + index;
}
}
return std::string::npos;
}
template <bool HAYSTACK_ALIGNED>
size_t scanHaystackBlock(
const StringPieceLite haystack,
const StringPieceLite needles,
uint64_t blockStartIdx) {
assert(needles.size() > 16u);
assert(
blockStartIdx + 16 <= haystack.size() ||
(page_for(haystack.data() + blockStartIdx) ==
page_for(haystack.data() + blockStartIdx + 15)));
__m128i arr1;
if (HAYSTACK_ALIGNED) {
arr1 = _mm_load_si128_unchecked(
reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
} else {
arr1 = _mm_loadu_si128_unchecked(
reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
}
auto arr2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needles.data()));
auto b =
_mm_cmpestri(arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0);
size_t j = nextAlignedIndex(needles.data());
for (; j < needles.size(); j += 16) {
arr2 = _mm_load_si128_unchecked(
reinterpret_cast<const __m128i*>(needles.data() + j));
auto index = _mm_cmpestri(
arr2,
int(needles.size() - j),
arr1,
int(haystack.size() - blockStartIdx),
0);
b = std::min(index, b);
}
if (b < 16) {
return blockStartIdx + b;
}
return std::string::npos;
}
size_t qfind_first_byte_of_sse42(
const StringPieceLite haystack, const StringPieceLite needles);
size_t qfind_first_byte_of_sse42(
const StringPieceLite haystack, const StringPieceLite needles) {
if (FOLLY_UNLIKELY(needles.empty() || haystack.empty())) {
return std::string::npos;
} else if (needles.size() <= 16) {
return qfind_first_byte_of_needles16(haystack, needles);
}
if (haystack.size() < 16 &&
page_for(haystack.end() - 1) != page_for(haystack.data() + 16)) {
if (haystack.size() <= 2) {
return qfind_first_byte_of_std(haystack, needles);
}
return qfind_first_byte_of_byteset(haystack, needles);
}
auto ret = scanHaystackBlock<false>(haystack, needles, 0);
if (ret != std::string::npos) {
return ret;
}
size_t i = nextAlignedIndex(haystack.data());
for (; i < haystack.size(); i += 16) {
ret = scanHaystackBlock<true>(haystack, needles, i);
if (ret != std::string::npos) {
return ret;
}
}
return std::string::npos;
}
}
}
#endif