folly/folly/detail/test/SplitStringSimdTest.cpp

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/detail/SplitStringSimd.h>
#include <folly/detail/SplitStringSimdImpl.h>

#include <folly/FBString.h>
#include <folly/FBVector.h>
#include <folly/Range.h>
#include <folly/portability/GTest.h>
#include <folly/small_vector.h>

#include <random>

namespace folly {
namespace detail {

// making sure that basic scalar works
TEST(SplitStringSimdTest, ByCharScalarKeepEmpty) {
  using pieces = std::vector<folly::StringPiece>;

  auto run = [](folly::StringPiece s) {
    pieces res;
    splitByCharScalar<false>(',', s, res);
    return res;
  };

  ASSERT_EQ(run(""), (pieces{""}));
  ASSERT_EQ(run("a"), (pieces{"a"}));
  ASSERT_EQ(run(",a"), (pieces{"", "a"}));
  ASSERT_EQ(run("a,aa"), (pieces{"a", "aa"}));
  ASSERT_EQ(run("a,,aa"), (pieces{"a", "", "aa"}));
  ASSERT_EQ(run("aaaa,aaa,aa,a"), (pieces{"aaaa", "aaa", "aa", "a"}));
  ASSERT_EQ(run("aaaa,aaa,aa,a,"), (pieces{"aaaa", "aaa", "aa", "a", ""}));
}

TEST(SplitStringSimdTest, ByCharScalarIgnoreEmpty) {
  using pieces = std::vector<folly::StringPiece>;

  auto run = [](folly::StringPiece s) {
    pieces res;
    splitByCharScalar<true>(',', s, res);
    return res;
  };

  ASSERT_EQ(run(""), (pieces{}));
  ASSERT_EQ(run("a"), (pieces{"a"}));
  ASSERT_EQ(run(",a"), (pieces{"a"}));
  ASSERT_EQ(run("a,aa"), (pieces{"a", "aa"}));
  ASSERT_EQ(run("a,,aa"), (pieces{"a", "aa"}));
  ASSERT_EQ(run("aaaa,aaa,aa,a,"), (pieces{"aaaa", "aaa", "aa", "a"}));
  ASSERT_EQ(run(",,,"), (pieces{}));
}

template <bool ignoreEmpty, typename Container>
void testContainerSV(
    folly::StringPiece s, const std::vector<folly::StringPiece>& expected) {
  Container actual;
  simdSplitByChar(',', s, actual, ignoreEmpty);

  ASSERT_EQ(expected.size(), actual.size());

  for (std::size_t i = 0; i != expected.size(); ++i) {
    ASSERT_EQ(expected[i].data(), actual[i].data()) << s << " : " << i;
    ASSERT_EQ(expected[i].size(), actual[i].size()) << s << " : " << i;
  }
}

// shorter name to get 1 line auto format
template <bool ie>
void testAllContainersOfSVs(
    folly::StringPiece s, const std::vector<folly::StringPiece>& expected) {
  testContainerSV<ie, folly::fbvector<folly::StringPiece>>(s, expected);
  testContainerSV<ie, folly::fbvector<std::string_view>>(s, expected);

  testContainerSV<ie, folly::small_vector<folly::StringPiece, 1>>(s, expected);
  testContainerSV<ie, folly::small_vector<folly::StringPiece, 2>>(s, expected);
  testContainerSV<ie, folly::small_vector<folly::StringPiece, 3>>(s, expected);
  testContainerSV<ie, folly::small_vector<folly::StringPiece, 4>>(s, expected);
  testContainerSV<ie, folly::small_vector<folly::StringPiece, 6>>(s, expected);
  testContainerSV<ie, folly::small_vector<folly::StringPiece, 7>>(s, expected);
  testContainerSV<ie, folly::small_vector<folly::StringPiece, 8>>(s, expected);
  static_assert(
      !SimdSplitByCharIsDefinedFor<
          folly::small_vector<folly::StringPiece, 9>>::value,
      "");

  testContainerSV<ie, folly::small_vector<std::string_view, 1>>(s, expected);
  testContainerSV<ie, folly::small_vector<std::string_view, 2>>(s, expected);
  testContainerSV<ie, folly::small_vector<std::string_view, 3>>(s, expected);
  testContainerSV<ie, folly::small_vector<std::string_view, 4>>(s, expected);
  testContainerSV<ie, folly::small_vector<std::string_view, 6>>(s, expected);
  testContainerSV<ie, folly::small_vector<std::string_view, 7>>(s, expected);
  testContainerSV<ie, folly::small_vector<std::string_view, 8>>(s, expected);
  static_assert(
      !SimdSplitByCharIsDefinedFor<
          folly::small_vector<std::string_view, 9>>::value,
      "");
}

template <bool ignoreEmpty, typename Container>
void testContainersOfStrings(
    folly::StringPiece s, const std::vector<folly::StringPiece>& expected) {
  Container actual;
  simdSplitByChar(',', s, actual, ignoreEmpty);

  ASSERT_EQ(expected.size(), actual.size()) << s;

  for (std::size_t i = 0; i != expected.size(); ++i) {
    ASSERT_EQ(expected[i], actual[i]) << s << " : " << i;
  }
}

template <bool ie>
void testAllContainersOfStrings(
    folly::StringPiece s, const std::vector<folly::StringPiece>& expected) {
  testContainersOfStrings<ie, std::vector<std::string>>(s, expected);
  testContainersOfStrings<ie, folly::fbvector<std::string>>(s, expected);

  testContainersOfStrings<ie, std::vector<folly::fbstring>>(s, expected);
  testContainersOfStrings<ie, folly::fbvector<folly::fbstring>>(s, expected);
}

template <bool ignoreEmpty>
void runTestStringSplitOneType(folly::StringPiece s) {
  std::vector<folly::StringPiece> expected;
  splitByCharScalar<ignoreEmpty>(',', s, expected);

  std::vector<std::vector<folly::StringPiece>> actuals;

  actuals.emplace_back();
  simdSplitByChar(',', s, actuals.back(), ignoreEmpty);

#if FOLLY_X64
  actuals.emplace_back();
  PlatformSimdSplitByChar<simd_detail::SimdCharSse2Platform, ignoreEmpty>{}(
      ',', s, actuals.back());
#if defined(__AVX2__)
  actuals.emplace_back();
  PlatformSimdSplitByChar<simd_detail::SimdCharAvx2Platform, ignoreEmpty>{}(
      ',', s, actuals.back());
#endif
#endif

#if FOLLY_AARCH64
  actuals.emplace_back();
  PlatformSimdSplitByChar<simd_detail::SimdCharAarch64Platform, ignoreEmpty>{}(
      ',', s, actuals.back());
#endif

  for (const auto& actual : actuals) {
    ASSERT_EQ(expected.size(), actual.size()) << s;

    for (std::size_t i = 0; i != expected.size(); ++i) {
      ASSERT_EQ(expected[i].data(), actual[i].data()) << s << " : " << i;
      ASSERT_EQ(expected[i].size(), actual[i].size()) << s << " : " << i;
    }
  }

  testAllContainersOfSVs<ignoreEmpty>(s, expected);
  testAllContainersOfStrings<ignoreEmpty>(s, expected);
}

void runTestStringSplit(folly::StringPiece s) {
  runTestStringSplitOneType<false>(s);
  runTestStringSplitOneType<true>(s);
}

std::string repeat(std::string_view substr, std::size_t n) {
  std::string res;
  while (n--) {
    res.append(substr);
  }
  return res;
}

// We also do fuzzing for covering more cases.
TEST(SplitStringSimd, ByChar) {
  runTestStringSplit("");
  runTestStringSplit(",");
  runTestStringSplit(",,");
  runTestStringSplit(",,,");
  runTestStringSplit(",a,,aa");
  runTestStringSplit("aa,aaa,aaaa");
  runTestStringSplit("aa,aaa,,,aaa,,a");

  runTestStringSplit(repeat("a,", 100));
  runTestStringSplit(repeat("aa,", 100));
  // every char (the bigger cases come from older version)
  runTestStringSplit(repeat(",", 255));
  runTestStringSplit(repeat(",", 512));
  runTestStringSplit(repeat(",", 512 + 16));
  runTestStringSplit(repeat(",", 512 + 32));
  runTestStringSplit(repeat(",", 512 + 64));
  runTestStringSplit(
      repeat(",", std::numeric_limits<std::uint16_t>::max() + 512 + 16));

  // special case: triggered shift right by 32 on uint32
  {
    constexpr std::string_view kTestData = "ong_history_by_pagetype_convr:0,";
    static_assert(kTestData.size() == 32);

    alignas(32) std::array<char, 32> buf;
    std::copy(kTestData.begin(), kTestData.end(), buf.begin());
    runTestStringSplit({buf.data(), buf.size()});
  }
}

TEST(SplitStringSimd, ByCharTestDifferentOffsets) {
  alignas(32) std::array<char, 100> buf;

  std::mt19937 gen;
  std::uniform_int_distribution<> dis(0, 1);
  for (auto& c : buf) {
    c = dis(gen) ? ',' : 'a';
  }

  for (auto f = buf.begin(); f != buf.end(); ++f) {
    for (auto l = f; l != buf.end(); ++l) {
      runTestStringSplit({f, l});
    }
  }
}

} // namespace detail
} // namespace folly