folly/folly/test/ConcurrentSkipListBenchmark.cpp

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/ConcurrentSkipList.h>

#include <map>
#include <memory>
#include <random>
#include <set>
#include <thread>

#include <glog/logging.h>

#include <folly/Benchmark.h>
#include <folly/hash/Hash.h>
#include <folly/portability/GFlags.h>
#include <folly/synchronization/RWSpinLock.h>

DEFINE_int32(num_threads, 12, "num concurrent threads to test");

// In some case, we may want to test worker threads operating on multiple
// lists. For example in search, not all threads are visiting the same posting
// list, but for the ones with some popular terms, they do get multiple
// visitors at the same time.
DEFINE_int32(num_sets, 1, "num of set to operate on");

static const int kInitHeadHeight = 10;
static const int kMaxValue = 0x1000000;

namespace {

using namespace folly;

typedef int ValueType;
typedef ConcurrentSkipList<ValueType> SkipListType;
typedef SkipListType::Accessor SkipListAccessor;
typedef std::set<ValueType> SetType;

static std::vector<ValueType> gData;
static void initData() {
  gData.resize(kMaxValue);
  for (int i = 0; i < kMaxValue; ++i) {
    gData[i] = i;
  }
  std::shuffle(gData.begin(), gData.end(), std::mt19937{});
}

// single thread benchmarks
void BM_IterateOverSet(int iters, int size) {
  SetType a_set;

  BENCHMARK_SUSPEND {
    CHECK_GT(size, 0);
    for (int i = 0; i < size; ++i) {
      a_set.insert(gData[rand() % kMaxValue]);
    }
  }

  [[maybe_unused]] int64_t sum = 0;
  auto iter = a_set.begin();
  for (int i = 0; i < iters; ++i) {
    sum += *iter++;
    if (iter == a_set.end()) {
      iter = a_set.begin();
    }
  }
  BENCHMARK_SUSPEND {
    // VLOG(20) << "sum = " << sum;
  }
}

void BM_IterateSkipList(int iters, int size) {
  BenchmarkSuspender susp;
  CHECK_GT(size, 0);
  auto skipList = SkipListType::create(kInitHeadHeight);
  for (int i = 0; i < size; ++i) {
    skipList.add(rand() % kMaxValue);
  }
  [[maybe_unused]] int64_t sum = 0;
  susp.dismiss();

  auto iter = skipList.begin();
  for (int i = 0; i < iters; ++i) {
    sum += *iter++;
    if (iter == skipList.end()) {
      iter = skipList.begin();
    }
  }

  BENCHMARK_SUSPEND {
    // VLOG(20) << "sum = " << sum;
  }
}

void BM_SetMerge(int iters, int size) {
  BenchmarkSuspender susp;
  SetType a_set;
  SetType b_set;
  for (int i = 0; i < iters; ++i) {
    a_set.insert(rand() % kMaxValue);
  }
  for (int i = 0; i < size; ++i) {
    b_set.insert(rand() % kMaxValue);
  }
  susp.dismiss();

  [[maybe_unused]] int64_t mergedSum = 0;
  FOR_EACH (it, a_set) {
    if (b_set.find(*it) != b_set.end()) {
      mergedSum += *it;
    }
  }
  BENCHMARK_SUSPEND {
    // VLOG(20) << mergedSum;
  }
}

void BM_CSLMergeLookup(int iters, int size) {
  BenchmarkSuspender susp;
  auto skipList = SkipListType::create(kInitHeadHeight);
  auto skipList2 = SkipListType::create(kInitHeadHeight);

  for (int i = 0; i < iters; ++i) {
    skipList.add(rand() % kMaxValue);
  }
  for (int i = 0; i < size; ++i) {
    skipList2.add(rand() % kMaxValue);
  }
  [[maybe_unused]] int64_t mergedSum = 0;
  susp.dismiss();

  SkipListType::Skipper skipper(skipList2);
  FOR_EACH (it, skipList) {
    if (skipper.to(*it)) {
      mergedSum += *it;
    }
  }

  BENCHMARK_SUSPEND {
    // VLOG(20) << mergedSum;
  }
}

// merge by two skippers
void BM_CSLMergeIntersection(int iters, int size) {
  BenchmarkSuspender susp;
  auto skipList = SkipListType::create(kInitHeadHeight);
  auto skipList2 = SkipListType::create(kInitHeadHeight);
  for (int i = 0; i < iters; ++i) {
    skipList.add(rand() % kMaxValue);
  }
  for (int i = 0; i < size; ++i) {
    skipList2.add(rand() % kMaxValue);
  }
  susp.dismiss();

  SkipListType::Skipper s1(skipList);
  SkipListType::Skipper s2(skipList2);

  [[maybe_unused]] int64_t mergedSum = 0;

  while (s1.good() && s2.good()) {
    int v1 = s1.data();
    int v2 = s2.data();
    if (v1 < v2) {
      s1.to(v2);
    } else if (v1 > v2) {
      s2.to(v1);
    } else {
      mergedSum += v1;
      ++s1;
      ++s2;
    }
  }

  BENCHMARK_SUSPEND {
    // VLOG(20) << mergedSum;
  }
}

void BM_SetContainsNotFound(int iters, int size) {
  BenchmarkSuspender susp;
  SetType aset;
  CHECK_LT(size, kMaxValue);
  for (int i = 0; i < size; ++i) {
    aset.insert(2 * i);
  }
  [[maybe_unused]] int64_t sum = 0;
  susp.dismiss();

  for (int i = 0; i < iters; ++i) {
    sum += (aset.end() == aset.find(2 * i + 1));
  }

  BENCHMARK_SUSPEND {
    // VLOG(20) << sum;
  }
}

void BM_SetContainsFound(int iters, int size) {
  BenchmarkSuspender susp;
  SetType aset;
  CHECK_LT(size, kMaxValue);

  for (int i = 0; i < size; ++i) {
    aset.insert(i);
  }

  std::vector<int> values;
  for (int i = 0; i < iters; ++i) {
    values.push_back(rand() % size);
  }
  [[maybe_unused]] int64_t sum = 0;
  susp.dismiss();

  for (int i = 0; i < iters; ++i) {
    sum += (aset.end() == aset.find(values[i]));
  }

  BENCHMARK_SUSPEND {
    // VLOG(20) << sum;
  }
}

void BM_CSLContainsFound(int iters, int size) {
  BenchmarkSuspender susp;
  auto skipList = SkipListType::create(kInitHeadHeight);
  CHECK_LT(size, kMaxValue);

  for (int i = 0; i < size; ++i) {
    skipList.add(i);
  }
  std::vector<int> values;
  for (int i = 0; i < iters; ++i) {
    values.push_back(rand() % size);
  }
  [[maybe_unused]] int64_t sum = 0;
  susp.dismiss();

  for (int i = 0; i < iters; ++i) {
    sum += skipList.contains(values[i]);
  }

  BENCHMARK_SUSPEND {
    // VLOG(20) << sum;
  }
}

void BM_CSLContainsNotFound(int iters, int size) {
  BenchmarkSuspender susp;
  auto skipList = SkipListType::create(kInitHeadHeight);
  CHECK_LT(size, kMaxValue);

  for (int i = 0; i < size; ++i) {
    skipList.add(2 * i);
  }
  [[maybe_unused]] int64_t sum = 0;
  susp.dismiss();

  for (int i = 0; i < iters; ++i) {
    sum += skipList.contains(2 * i + 1);
  }

  BENCHMARK_SUSPEND {
    // VLOG(20) << sum;
  }
}

void BM_AddSet(int iters, int size) {
  BenchmarkSuspender susp;
  SetType aset;
  for (int i = 0; i < size; ++i) {
    aset.insert(gData[i]);
  }
  susp.dismiss();

  for (int i = size; i < size + iters; ++i) {
    aset.insert(gData[i]);
  }
}

void BM_AddSkipList(int iters, int size) {
  BenchmarkSuspender susp;
  auto skipList = SkipListType::create(kInitHeadHeight);
  for (int i = 0; i < size; ++i) {
    skipList.add(gData[i]);
  }
  susp.dismiss();

  for (int i = size; i < size + iters; ++i) {
    skipList.add(gData[i]);
  }
}

BENCHMARK(Accessor, iters) {
  BenchmarkSuspender susp;
  auto skiplist = SkipListType::createInstance(kInitHeadHeight);
  auto sl = skiplist.get();

  susp.dismiss();
  for (size_t i = 0; i < iters; ++i) {
    SkipListAccessor accessor(sl);
  }
}

// a benchmark to estimate the
// low bound of doing a ref counting for an Accessor
BENCHMARK(accessorBasicRefcounting, iters) {
  BenchmarkSuspender susp;
  auto* value = new std::atomic<int32_t>();
  auto* dirty = new std::atomic<int32_t>();
  *value = *dirty = 0;
  folly::MicroSpinLock l;
  l.init();

  susp.dismiss();
  for (size_t i = 0; i < iters; ++i) {
    value->fetch_add(1, std::memory_order_relaxed);
    if (dirty->load(std::memory_order_acquire) != 0) {
      folly::MSLGuard g(l);
    }
    value->fetch_sub(1, std::memory_order_relaxed);
  }

  BENCHMARK_SUSPEND {
    delete dirty;
    delete value;
  }
}

// Data For testing contention benchmark
class ConcurrentAccessData {
 public:
  explicit ConcurrentAccessData(int size)
      : skipList_(SkipListType::create(10)),
        sets_(FLAGS_num_sets),
        locks_(FLAGS_num_sets) {
    for (int i = 0; i < size; ++i) {
      sets_[0].insert(i);
      skipList_.add(i);
    }

    for (int i = 0; i < FLAGS_num_sets; ++i) {
      locks_[i] = new RWSpinLock();
      if (i > 0) {
        sets_[i] = sets_[0];
      }
    }

// This requires knowledge of the C++ library internals. Only use it if we're
// using the GNU C++ library.
#ifdef _GLIBCXX_SYMVER
    // memory usage
    int64_t setMemorySize = sets_[0].size() * sizeof(*sets_[0].begin()._M_node);
    int64_t cslMemorySize = 0;
    for (auto it = skipList_.begin(); it != skipList_.end(); ++it) {
      cslMemorySize += it.nodeSize();
    }

    LOG(INFO) << "size=" << sets_[0].size()
              << "; std::set memory size=" << setMemorySize
              << "; csl memory size=" << cslMemorySize;
#endif

    readValues_.reserve(size);
    deleteValues_.reserve(size);
    writeValues_.reserve(size);
    for (int i = size; i < 2 * size; ++i) {
      readValues_.push_back(2 * i);
      deleteValues_.push_back(2 * i);

      // half new values and half already in the list
      writeValues_.push_back((rand() % 2) + 2 * i);
    }
    std::shuffle(readValues_.begin(), readValues_.end(), std::mt19937{});
    std::shuffle(deleteValues_.begin(), deleteValues_.end(), std::mt19937{});
    std::shuffle(writeValues_.begin(), writeValues_.end(), std::mt19937{});
  }

  ~ConcurrentAccessData() {
    FOR_EACH (lock, locks_)
      delete *lock;
  }

  inline bool skipListFind(int /* idx */, ValueType val) {
    return skipList_.contains(val);
  }
  inline void skipListInsert(int /* idx */, ValueType val) {
    skipList_.add(val);
  }
  inline void skipListErase(int /* idx */, ValueType val) {
    skipList_.remove(val);
  }

  inline bool setFind(int idx, ValueType val) {
    std::shared_lock g(*locks_[idx]);
    return sets_[idx].find(val) == sets_[idx].end();
  }
  inline void setInsert(int idx, ValueType val) {
    std::unique_lock g(*locks_[idx]);
    sets_[idx].insert(val);
  }
  inline void setErase(int idx, ValueType val) {
    std::unique_lock g(*locks_[idx]);
    sets_[idx].erase(val);
  }

  void runSkipList(int id, size_t iters) {
    [[maybe_unused]] int sum = 0;
    for (size_t i = 0; i < iters; ++i) {
      sum += accessSkipList(id, i);
    }
    // VLOG(20) << sum;
  }

  void runSet(size_t id, size_t iters) {
    [[maybe_unused]] int sum = 0;
    for (size_t i = 0; i < iters; ++i) {
      sum += accessSet(id, i);
    }
    // VLOG(20) << sum;
  }

  bool accessSkipList(int64_t id, size_t t) {
    if (t > readValues_.size()) {
      t = t % readValues_.size();
    }
    uint32_t h = folly::hash::twang_32from64(t * id);
    switch (h % 8) {
      case 7: // write
        if ((h & 0x31) == 0) { // 1/4 chance to delete
          skipListErase(0, deleteValues_[t]);
        } else {
          skipListInsert(0, writeValues_[t]);
        }
        return false;
      default:
        return skipListFind(0, readValues_[t]);
    }
  }

  bool accessSet(int64_t id, size_t t) {
    if (t > readValues_.size()) {
      t = t % readValues_.size();
    }
    uint32_t h = folly::hash::twang_32from64(t * id);
    int idx = (h % FLAGS_num_sets);
    switch (h % 8) { // 1/8 chance to write
      case 7: // write
        if ((h & 0x31) == 0) { // 1/32 chance to delete
          setErase(idx, deleteValues_[t]);
        } else {
          setInsert(idx, writeValues_[t]);
        }
        return false;
      default:
        return setFind(idx, readValues_[t]);
    }
  }

 private:
  SkipListType::Accessor skipList_;
  std::vector<SetType> sets_;
  std::vector<RWSpinLock*> locks_;

  std::vector<ValueType> readValues_;
  std::vector<ValueType> writeValues_;
  std::vector<ValueType> deleteValues_;
};

static std::map<int, std::shared_ptr<ConcurrentAccessData>> g_data;

static ConcurrentAccessData* mayInitTestData(int size) {
  auto it = g_data.find(size);
  if (it == g_data.end()) {
    auto ptr = std::make_shared<ConcurrentAccessData>(size);
    g_data[size] = ptr;
    return ptr.get();
  }
  return it->second.get();
}

void BM_ContentionCSL(int iters, int size) {
  BenchmarkSuspender susp;
  auto data = mayInitTestData(size);
  std::vector<std::thread> threads;
  susp.dismiss();

  for (int i = 0; i < FLAGS_num_threads; ++i) {
    threads.push_back(
        std::thread(&ConcurrentAccessData::runSkipList, data, i, iters));
  }
  FOR_EACH (t, threads) {
    (*t).join();
  }
}

void BM_ContentionStdSet(int iters, int size) {
  BenchmarkSuspender susp;
  auto data = mayInitTestData(size);
  std::vector<std::thread> threads;
  susp.dismiss();

  for (int i = 0; i < FLAGS_num_threads; ++i) {
    threads.push_back(
        std::thread(&ConcurrentAccessData::runSet, data, i, iters));
  }
  FOR_EACH (t, threads) {
    (*t).join();
  }
  susp.rehire();
}

// Single-thread benchmarking

BENCHMARK_DRAW_LINE();

BENCHMARK_PARAM(BM_IterateOverSet, 1000)
BENCHMARK_PARAM(BM_IterateSkipList, 1000)
BENCHMARK_DRAW_LINE();
BENCHMARK_PARAM(BM_IterateOverSet, 1000000)
BENCHMARK_PARAM(BM_IterateSkipList, 1000000)
BENCHMARK_DRAW_LINE();

// find with keys in the set
BENCHMARK_PARAM(BM_SetContainsFound, 1000)
BENCHMARK_PARAM(BM_CSLContainsFound, 1000)
BENCHMARK_DRAW_LINE();
BENCHMARK_PARAM(BM_SetContainsFound, 100000)
BENCHMARK_PARAM(BM_CSLContainsFound, 100000)
BENCHMARK_DRAW_LINE();
BENCHMARK_PARAM(BM_SetContainsFound, 1000000)
BENCHMARK_PARAM(BM_CSLContainsFound, 1000000)
BENCHMARK_DRAW_LINE();
BENCHMARK_PARAM(BM_SetContainsFound, 10000000)
BENCHMARK_PARAM(BM_CSLContainsFound, 10000000)
BENCHMARK_DRAW_LINE();

// find with keys not in the set
BENCHMARK_PARAM(BM_SetContainsNotFound, 1000)
BENCHMARK_PARAM(BM_CSLContainsNotFound, 1000)
BENCHMARK_DRAW_LINE();
BENCHMARK_PARAM(BM_SetContainsNotFound, 100000)
BENCHMARK_PARAM(BM_CSLContainsNotFound, 100000)
BENCHMARK_DRAW_LINE();
BENCHMARK_PARAM(BM_SetContainsNotFound, 1000000)
BENCHMARK_PARAM(BM_CSLContainsNotFound, 1000000)
BENCHMARK_DRAW_LINE();

BENCHMARK_PARAM(BM_AddSet, 1000)
BENCHMARK_PARAM(BM_AddSkipList, 1000)
BENCHMARK_DRAW_LINE();

BENCHMARK_PARAM(BM_AddSet, 65536)
BENCHMARK_PARAM(BM_AddSkipList, 65536)
BENCHMARK_DRAW_LINE();

BENCHMARK_PARAM(BM_AddSet, 1000000)
BENCHMARK_PARAM(BM_AddSkipList, 1000000)
BENCHMARK_DRAW_LINE();

BENCHMARK_PARAM(BM_SetMerge, 1000)
BENCHMARK_PARAM(BM_CSLMergeIntersection, 1000)
BENCHMARK_PARAM(BM_CSLMergeLookup, 1000)
BENCHMARK_DRAW_LINE();

BENCHMARK_PARAM(BM_SetMerge, 65536)
BENCHMARK_PARAM(BM_CSLMergeIntersection, 65536)
BENCHMARK_PARAM(BM_CSLMergeLookup, 65536)
BENCHMARK_DRAW_LINE();

BENCHMARK_PARAM(BM_SetMerge, 1000000)
BENCHMARK_PARAM(BM_CSLMergeIntersection, 1000000)
BENCHMARK_PARAM(BM_CSLMergeLookup, 1000000)
BENCHMARK_DRAW_LINE();

// multithreaded benchmarking

BENCHMARK_PARAM(BM_ContentionStdSet, 1024)
BENCHMARK_PARAM(BM_ContentionCSL, 1024)
BENCHMARK_DRAW_LINE();

BENCHMARK_PARAM(BM_ContentionStdSet, 65536)
BENCHMARK_PARAM(BM_ContentionCSL, 65536)
BENCHMARK_DRAW_LINE();

BENCHMARK_PARAM(BM_ContentionStdSet, 1048576)
BENCHMARK_PARAM(BM_ContentionCSL, 1048576)
BENCHMARK_DRAW_LINE();

} // namespace

int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  initData();
  runBenchmarks();
  return 0;
}

#if 0
/*
Benchmark on Intel(R) Xeon(R) CPU X5650 @2.67GHz

==============================================================================
1 thread Benchmark                     Iters   Total t    t/iter iter/sec
------------------------------------------------------------------------------
 +37.0% BM_Accessor                    100000  1.958 ms  19.58 ns  48.71 M
*       BM_AccessorBasicRefcounting    100000  1.429 ms  14.29 ns  66.74 M
------------------------------------------------------------------------------
 + 603% BM_IterateOverSet/1000         100000  1.589 ms  15.89 ns  60.02 M
*       BM_IterateSkipList/1000        100000    226 us   2.26 ns    422 M
------------------------------------------------------------------------------
 + 107% BM_IterateOverSet/976.6k       100000  8.324 ms  83.24 ns  11.46 M
*       BM_IterateSkipList/976.6k      100000  4.016 ms  40.16 ns  23.75 M
------------------------------------------------------------------------------
*       BM_SetContainsFound/1000       100000  7.082 ms  70.82 ns  13.47 M
 +39.9% BM_CSLContainsFound/1000       100000  9.908 ms  99.08 ns  9.625 M
------------------------------------------------------------------------------
*       BM_SetContainsFound/97.66k     100000   23.8 ms    238 ns  4.006 M
 +5.97% BM_CSLContainsFound/97.66k     100000  25.23 ms  252.3 ns  3.781 M
------------------------------------------------------------------------------
 +33.6% BM_SetContainsFound/976.6k     100000   64.3 ms    643 ns  1.483 M
*       BM_CSLContainsFound/976.6k     100000  48.13 ms  481.3 ns  1.981 M
------------------------------------------------------------------------------
 +30.3% BM_SetContainsFound/9.537M     100000  115.1 ms  1.151 us  848.6 k
*       BM_CSLContainsFound/9.537M     100000  88.33 ms  883.3 ns   1.08 M
------------------------------------------------------------------------------
*       BM_SetContainsNotFound/1000    100000  2.081 ms  20.81 ns  45.83 M
 +76.2% BM_CSLContainsNotFound/1000    100000  3.667 ms  36.67 ns  26.01 M
------------------------------------------------------------------------------
*       BM_SetContainsNotFound/97.66k  100000  6.049 ms  60.49 ns  15.77 M
 +32.7% BM_CSLContainsNotFound/97.66k  100000  8.025 ms  80.25 ns  11.88 M
------------------------------------------------------------------------------
*       BM_SetContainsNotFound/976.6k  100000  7.464 ms  74.64 ns  12.78 M
 +12.8% BM_CSLContainsNotFound/976.6k  100000  8.417 ms  84.17 ns  11.33 M
------------------------------------------------------------------------------
*       BM_AddSet/1000                 100000  29.26 ms  292.6 ns  3.259 M
 +70.0% BM_AddSkipList/1000            100000  49.75 ms  497.5 ns  1.917 M
------------------------------------------------------------------------------
*       BM_AddSet/64k                  100000  38.73 ms  387.3 ns  2.462 M
 +55.7% BM_AddSkipList/64k             100000   60.3 ms    603 ns  1.581 M
------------------------------------------------------------------------------
*       BM_AddSet/976.6k               100000  75.71 ms  757.1 ns   1.26 M
 +33.6% BM_AddSkipList/976.6k          100000  101.2 ms  1.012 us  965.3 k
------------------------------------------------------------------------------
 + 716% BM_SetMerge/1000               100000  6.872 ms  68.72 ns  13.88 M
*       BM_CSLMergeIntersection/1000   100000    842 us   8.42 ns  113.3 M
 + 268% BM_CSLMergeLookup/1000         100000    3.1 ms     31 ns  30.76 M
------------------------------------------------------------------------------
 +36.3% BM_SetMerge/64k                100000  14.03 ms  140.3 ns  6.798 M
 +39.4% BM_CSLMergeIntersection/64k    100000  14.35 ms  143.5 ns  6.645 M
*       BM_CSLMergeLookup/64k          100000  10.29 ms  102.9 ns  9.266 M
------------------------------------------------------------------------------
 +10.3% BM_SetMerge/976.6k             100000  46.24 ms  462.4 ns  2.062 M
 +25.1% BM_CSLMergeIntersection/976.6k 100000  52.47 ms  524.7 ns  1.818 M
*       BM_CSLMergeLookup/976.6k       100000  41.94 ms  419.3 ns  2.274 M
------------------------------------------------------------------------------


==============================================================================
Contention benchmark 7/8 find, 3/32 insert, 1/32 erase

 4 threads Benchmark                   Iters   Total t    t/iter iter/sec
------------------------------------------------------------------------------
 + 269% BM_ContentionStdSet/1k         100000  75.66 ms  756.6 ns   1.26 M
*       BM_ContentionCSL/1k            100000  20.47 ms  204.7 ns  4.658 M
------------------------------------------------------------------------------
 + 228% BM_ContentionStdSet/64k        100000  105.6 ms  1.056 us  924.9 k
*       BM_ContentionCSL/64k           100000  32.18 ms  321.8 ns  2.963 M
------------------------------------------------------------------------------
 + 224% BM_ContentionStdSet/1M         100000  117.4 ms  1.174 us  832.2 k
*       BM_ContentionCSL/1M            100000  36.18 ms  361.8 ns  2.636 M
------------------------------------------------------------------------------


12 threads Benchmark                   Iters   Total t    t/iter iter/sec
------------------------------------------------------------------------------
 + 697% BM_ContentionStdSet/1k         100000  455.3 ms  4.553 us  214.5 k
*       BM_ContentionCSL/1k            100000  57.12 ms  571.2 ns   1.67 M
------------------------------------------------------------------------------
 +1257% BM_ContentionStdSet/64k        100000  654.9 ms  6.549 us  149.1 k
*       BM_ContentionCSL/64k           100000  48.24 ms  482.4 ns  1.977 M
------------------------------------------------------------------------------
 +1262% BM_ContentionStdSet/1M         100000  657.3 ms  6.573 us  148.6 k
*       BM_ContentionCSL/1M            100000  48.25 ms  482.5 ns  1.977 M
------------------------------------------------------------------------------

*/
#endif