// SmallLocksBenchmark.cpp

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <algorithm>
#include <array>
#include <cmath>
#include <condition_variable>
#include <iostream>
#include <numeric>
#include <thread>
#include <vector>

#include <fmt/core.h>

#include <folly/Benchmark.h>
#include <folly/SharedMutex.h>
#include <folly/lang/Aligned.h>
#include <folly/synchronization/DistributedMutex.h>
#include <folly/synchronization/FlatCombining.h>
#include <folly/synchronization/SmallLocks.h>

/* "Work cycle" is just an additional nop loop iteration.
 * A smaller number of work cycles will result in more contention,
 * which is what we're trying to measure.  The relative ratio of
 * locked to unlocked cycles will simulate how big critical sections
 * are in production code.
 */
// Number of burn() iterations executed inside the critical section (default
// for runContended(), always used by runFairness()).
DEFINE_int32(work, 100, "Number of work cycles");
// Number of burn() iterations executed by runContended() after each critical
// section, i.e. without holding the lock.
DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
// NOTE(review): nothing in the visible part of this file reads FLAGS_threads;
// confirm whether the flag is still needed.
DEFINE_int32(
    threads,
    std::thread::hardware_concurrency(),
    "Number of threads for fairness test");
// When true, main() runs the fairness tests before the regular benchmarks.
DEFINE_bool(run_fairness, true, "Run fairness benchmarks");

// Executes `n` iterations of an empty loop whose counter is passed to
// folly::doNotOptimizeAway() so the compiler cannot elide the loop.
static void burn(size_t n) {
  size_t spin = 0;
  while (spin < n) {
    folly::doNotOptimizeAway(spin);
    ++spin;
  }
}

namespace {
// Single-use spinning rendezvous point for a fixed number of participants.
class SimpleBarrier {
 public:
  // `count` is the number of wait() calls that must arrive before any of
  // them returns.
  explicit SimpleBarrier(int count) : count_(count) {}

  // Announces the caller's arrival and busy-waits until all `count`
  // participants have arrived.
  void wait() {
    // Burn some cycles before arriving, to try and get the kernel to schedule
    // the participating threads on different cores.
    int warmup = 0;
    while (warmup < 100000) {
      folly::doNotOptimizeAway(warmup);
      ++warmup;
    }
    num_.fetch_add(1);
    for (;;) {
      if (num_.load() == count_) {
        break;
      }
    }
  }

 private:
  std::atomic<int> num_{0};
  const int count_;
};
} // namespace

// Adapter for lock types that expose an init() member: the wrapped lock is
// initialized from the constructor so the adapter can be default-constructed
// and then used through plain lock()/unlock().
template <typename Lock>
class InitLock {
 public:
  InitLock() { impl_.init(); }

  // Forwarded verbatim to the wrapped lock.
  void lock() { impl_.lock(); }
  void unlock() { impl_.unlock(); }

 private:
  Lock impl_;
};

// Distinct type wrapping a folly::DistributedMutex so that lock_and() can be
// overloaded on it and go through lock_combine() instead of lock()/unlock().
struct DistributedMutexFlatCombining {
  folly::DistributedMutex mutex_;
};

// Lock-shaped type whose lock() and unlock() do nothing; used for the
// benchmarks whose data type synchronizes itself with atomics.
struct NoLock {
  void lock() {}
  void unlock() {}
};

// folly::FlatCombining used as a mutex, where every critical section
// allocates its own record and frees it again afterwards.
class FlatCombiningMutexNoCaching
    : public folly::FlatCombining<FlatCombiningMutexNoCaching> {
 public:
  using Super = folly::FlatCombining<FlatCombiningMutexNoCaching>;

  // Submits `func` through requestFC() with a freshly allocated record and
  // returns whatever `func` returned.  The std::size_t argument is ignored;
  // it only keeps the signature parallel to the caching variant.
  template <typename CriticalSection>
  auto lock_combine(CriticalSection func, std::size_t) {
    using Result = folly::invoke_result_t<CriticalSection&>;
    auto rec = this->allocRec();
    Result result{};
    this->requestFC([&result, &func]() { result = func(); }, rec);
    this->freeRec(rec);
    return result;
  }
};

// folly::FlatCombining used as a mutex, with 256 records allocated up front
// and selected by index instead of being allocated per critical section.
class FlatCombiningMutexCaching
    : public folly::FlatCombining<FlatCombiningMutexCaching> {
 public:
  using Super = folly::FlatCombining<FlatCombiningMutexCaching>;

  // Pre-allocates the 256 records handed out by lock_combine().
  FlatCombiningMutexCaching() {
    int remaining = 256;
    while (remaining > 0) {
      records_.push_back(this->allocRec());
      --remaining;
    }
  }

  // Submits `func` through requestFC() using the record at position `index`
  // and returns whatever `func` returned.  `index` is bounds-checked by
  // std::vector::at(), so an index of 256 or more throws std::out_of_range.
  template <typename CriticalSection>
  auto lock_combine(CriticalSection func, std::size_t index) {
    using Result = folly::invoke_result_t<CriticalSection&>;
    Result result{};
    auto* rec = records_.at(index);
    this->requestFC([&result, &func]() { result = func(); }, rec);
    return result;
  }

  std::vector<Super::Rec*> records_;
};

// Generic fallback: holds `mutex` (via lock()/unlock()) for the duration of
// `func` and returns func's result.  The std::size_t argument is unused here.
template <typename Mutex, typename CriticalSection>
auto lock_and(Mutex& mutex, std::size_t, CriticalSection func) {
  std::unique_lock<Mutex> guard(mutex);
  return func();
}
// DistributedMutexFlatCombining overload: hands `func` to the wrapped
// mutex's lock_combine() and returns its result.  The index is unused.
template <typename F>
auto lock_and(DistributedMutexFlatCombining& mutex, std::size_t, F func) {
  auto& distributed = mutex.mutex_;
  return distributed.lock_combine(std::move(func));
}
// FlatCombiningMutexNoCaching overload: forwards `func` and the index `i` to
// the mutex's own lock_combine().
template <typename F>
auto lock_and(FlatCombiningMutexNoCaching& mutex, std::size_t i, F func) {
  auto& combiner = mutex;
  return combiner.lock_combine(func, i);
}
// FlatCombiningMutexCaching overload: forwards `func` to lock_combine(),
// with `i` selecting which pre-allocated record is used.
template <typename F>
auto lock_and(FlatCombiningMutexCaching& mutex, std::size_t i, F func) {
  auto& combiner = mutex;
  return combiner.lock_combine(func, i);
}

/**
 * Functions to initialize, write and read from data
 *
 * These are used to do different things in the contended benchmark based on
 * the type of the data
 */
// Increments the plain integer in place and returns the incremented value.
std::uint64_t write(std::uint64_t& value) {
  value += 1;
  return value;
}
// Consumes a value returned from a critical section by passing it to
// folly::doNotOptimizeAway().
void read(std::uint64_t value) {
  const std::uint64_t& observed = value;
  folly::doNotOptimizeAway(observed);
}
// Sets the plain-integer payload to its starting value of 1.
void initialize(std::uint64_t& value) {
  value = std::uint64_t{1};
}

// Payload made of five integers, each wrapped in folly::cacheline_aligned.
class alignas(folly::hardware_destructive_interference_size) Ints {
 public:
  std::array<folly::cacheline_aligned<std::uint64_t>, 5> ints_;
};

// Increments each of the five integers and returns the sum of the
// incremented values.
std::uint64_t write(Ints& vec) {
  std::uint64_t total = 0;
  for (auto& slot : vec.ints_) {
    auto& counter = *slot;
    counter += 1;
    total += counter;
  }
  return total;
}

// Intentionally a no-op: nothing is written to an Ints before the benchmark.
void initialize(Ints&) {}

// Payload made of five atomic integers, each wrapped in
// folly::cacheline_aligned; updated with fetch_add and no external lock.
class alignas(folly::hardware_destructive_interference_size) AtomicsAdd {
 public:
  std::array<folly::cacheline_aligned<std::atomic<std::uint64_t>>, 5> ints_;
};

// Performs fetch_add(1) on each of the five atomics and returns the sum of
// the values they held before the increment.
std::uint64_t write(AtomicsAdd& atomics) {
  // The accumulator must be 64 bits wide like the values being summed (and
  // like the one in write(Ints&)); a plain `0` would make it an int and
  // truncate every addition.
  auto sum = std::uint64_t{0};
  folly::makeUnpredictable(sum);
  for (auto& integer : atomics.ints_) {
    sum += integer->fetch_add(1);
  }
  return sum;
}

// Intentionally a no-op: nothing is written to an AtomicsAdd before the
// benchmark.
void initialize(AtomicsAdd&) {}

// Payload holding one atomic integer that is incremented with a
// compare-exchange retry loop.
class alignas(folly::hardware_destructive_interference_size) AtomicCas {
 public:
  std::atomic<std::uint64_t> integer_{0};
};

// Increments the atomic by one via compare_exchange_strong, retrying until
// the exchange succeeds, and returns the value that was replaced.
std::uint64_t write(AtomicCas& atomic) {
  auto expected = atomic.integer_.load(std::memory_order_relaxed);
  folly::makeUnpredictable(expected);
  for (;;) {
    // On failure compare_exchange_strong reloads `expected` with the current
    // value, so the next attempt uses fresh data.
    if (atomic.integer_.compare_exchange_strong(expected, expected + 1)) {
      break;
    }
  }
  return expected;
}

// Intentionally a no-op: integer_ already has an in-class initializer.
void initialize(AtomicCas&) {}

// Payload holding one atomic integer that is updated with fetch_xor.
class alignas(folly::hardware_destructive_interference_size) AtomicFetchXor {
 public:
  std::atomic<std::uint64_t> integer_{0};
};

// XORs the atomic with an all-ones mask and returns the value it held before
// the operation.
std::uint64_t write(AtomicFetchXor& atomic) {
  auto mask = std::numeric_limits<std::uint64_t>::max();
  folly::makeUnpredictable(mask);

  // Why XOR: it stands in for an arbitrary, non-coalescable hardware
  // operation.  Repeating something like a bitwise OR with the same operand
  // is idempotent after the first application, so the hardware would be free
  // to merge the operations and merely ship data over the bus without moving
  // the cacheline to the remote cores (much like the combining done in
  // software here).  XOR mutates the value on every application and its
  // result depends on the previous state, which makes such coalescing hard.
  return atomic.integer_.fetch_xor(mask, std::memory_order_acq_rel);
}

// Intentionally a no-op: integer_ already has an in-class initializer.
void initialize(AtomicFetchXor&) {}

/**
 * Contended benchmark body.
 *
 * Starts max(hardware_concurrency, numThreads) threads, split into
 * totalthreads / numThreads groups that each share one lock and one Data
 * value (thread t uses slot t % threadgroups).  Every thread performs numOps
 * rounds of: `work` burn cycles plus write() under the lock, read() of the
 * result, then FLAGS_unlocked_work burn cycles outside the lock.
 *
 * The BenchmarkSuspender is created first and only the final join of the
 * worker threads runs inside dismissing().
 */
template <typename Lock, typename Data = std::uint64_t>
static void runContended(
    size_t numOps, size_t numThreads, size_t work = FLAGS_work) {
  folly::BenchmarkSuspender braces;
  const size_t totalthreads =
      std::max<size_t>(std::thread::hardware_concurrency(), numThreads);
  const size_t threadgroups = totalthreads / numThreads;

  // Padding on both sides of the lock keeps it away from neighbouring data.
  struct lockstruct {
    char padding1[128];
    Lock mutex;
    char padding2[128];
    Data value;
  };

  std::vector<lockstruct> locks(threadgroups);
  for (auto& slot : locks) {
    initialize(slot.value);
  }
  folly::makeUnpredictable(locks);

  char padding3[128];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  // +1 so that this (coordinating) thread takes part in the rendezvous too.
  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t != totalthreads; ++t) {
    threads[t] = std::thread([&, t] {
      auto& slot = locks[t % threadgroups];
      runbarrier.wait();
      for (size_t op = 0; op < numOps; ++op) {
        auto criticalSection = [&value = slot.value, work]() noexcept {
          burn(work);
          return write(value);
        };
        auto val = lock_and(slot.mutex, t, criticalSection);
        read(val);
        burn(FLAGS_unlocked_work);
      }
    });
  }

  runbarrier.wait();
  braces.dismissing([&] {
    for (auto& worker : threads) {
      worker.join();
    }
  });
}

/**
 * Fairness test body.
 *
 * Starts max(hardware_concurrency, numThreads) threads, split into
 * totalthreads / numThreads groups that each share one lock.  For roughly
 * four seconds every thread repeatedly takes its group's lock, burns
 * FLAGS_work cycles and counts the acquisition, timing each round trip.
 * Afterwards two lines are printed: the total/mean/stddev of per-thread
 * acquisition counts, and the mean/stddev/max of the per-acquisition time in
 * microseconds.
 */
template <typename Lock>
static void runFairness(std::size_t numThreads) {
  size_t totalthreads = std::thread::hardware_concurrency();
  if (totalthreads < numThreads) {
    totalthreads = numThreads;
  }
  const size_t threadgroups = totalthreads / numThreads;
  struct lockstruct {
    char padding1[128];
    Lock lock;
  };

  // A vector (rather than calloc'ed raw memory) runs each Lock's constructor
  // and releases the storage when this function returns.
  std::vector<lockstruct> locks(threadgroups);

  char padding3[64];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  std::atomic<bool> stop{false};

  // rlock guards the four result vectors below.
  std::mutex rlock;
  std::vector<long> results;
  std::vector<std::chrono::microseconds> maxes;

  std::vector<std::chrono::microseconds> aqTime;
  std::vector<unsigned long> aqTimeSq;

  // +1 so that this (coordinating) thread takes part in the rendezvous too.
  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t < totalthreads; ++t) {
    threads[t] = std::thread([&, t] {
      lockstruct* mutex = &locks[t % threadgroups];
      long value = 0;
      std::chrono::microseconds max(0);
      std::chrono::microseconds time(0);
      unsigned long timeSq(0);
      runbarrier.wait();
      while (!stop) {
        std::chrono::steady_clock::time_point prelock =
            std::chrono::steady_clock::now();
        lock_and(mutex->lock, t, [&]() {
          burn(FLAGS_work);
          value++;
        });
        std::chrono::steady_clock::time_point postlock =
            std::chrono::steady_clock::now();
        auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
            postlock - prelock);
        time += diff;
        timeSq += diff.count() * diff.count();
        if (diff > max) {
          max = diff;
        }
      }
      {
        std::lock_guard<std::mutex> g(rlock);
        results.push_back(value);
        maxes.push_back(max);
        aqTime.push_back(time);
        aqTimeSq.push_back(timeSq);
      }
    });
  }

  runbarrier.wait();
  /* sleep override */
  std::this_thread::sleep_for(std::chrono::seconds(4));
  stop = true;

  for (auto& thr : threads) {
    thr.join();
  }

  // Calculate some stats.
  //
  // Per-thread acquisition counts: total, mean and sample standard deviation.
  const std::size_t numResults = results.size();
  const unsigned long sum =
      std::accumulate(results.begin(), results.end(), 0UL);
  const double m =
      numResults > 0 ? static_cast<double>(sum) / numResults : 0.0;

  double accum = 0.0;
  for (const double d : results) {
    accum += (d - m) * (d - m);
  }
  const double stdev =
      numResults > 1 ? std::sqrt(accum / (numResults - 1)) : 0.0;

  // Per-acquisition lock time.  The initial values passed to accumulate()
  // fix the accumulator type, so they must be as wide as the elements.
  std::chrono::microseconds mx(0);
  if (!maxes.empty()) {
    mx = *std::max_element(maxes.begin(), maxes.end());
  }
  const std::chrono::microseconds agAqTime = std::accumulate(
      aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
  const unsigned long agAqTimeSq =
      std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0UL);

  std::chrono::microseconds mean(0);
  if (sum > 0) {
    mean = agAqTime / static_cast<std::chrono::microseconds::rep>(sum);
  }

  // Sample variance (n * sum(x^2) - sum(x)^2) / (n * (n - 1)), evaluated in
  // floating point so the products cannot wrap around and the divisions do
  // not truncate.  Needs at least two samples.
  double variance = 0.0;
  if (sum > 1) {
    const double n = static_cast<double>(sum);
    const double total = static_cast<double>(agAqTime.count());
    const double totalSq = static_cast<double>(agAqTimeSq);
    // Clamp tiny negative values caused by rounding before taking the root.
    variance = std::max(0.0, (n * totalSq - total * total) / n / (n - 1));
  }
  const double stddev2 = std::sqrt(variance);

  fmt::print("Sum: {} Mean: {:.0f} stddev: {:.0f}\n", sum, m, stdev);
  fmt::print(
      "Lock time stats in us: mean {} stddev {:.0f} max {}\n",
      mean.count(),
      stddev2,
      mx.count());
}

// Single-threaded benchmark body: locks and unlocks one default-constructed
// Mutex `iters` times.  The makeUnpredictable() calls on either side of the
// acquisition keep the compiler from reasoning about the mutex's state.
template <typename Mutex>
void runUncontended(std::size_t iters) {
  Mutex mutex{};
  for (std::size_t done = 0; done < iters; ++done) {
    folly::makeUnpredictable(mutex);
    std::unique_lock<Mutex> guard(mutex);
    folly::makeUnpredictable(mutex);
  }
}

// Uncontended lock/unlock cost of each lock type, measured on a single
// thread via runUncontended().
BENCHMARK(StdMutexUncontendedBenchmark, iters) {
  runUncontended<std::mutex>(iters);
}

BENCHMARK(StdSharedMutexUncontendedBenchmark, iters) {
  runUncontended<std::shared_mutex>(iters);
}

// MicroSpinLock and PicoSpinLock are wrapped in InitLock so that init() is
// called on construction.
BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
  runUncontended<InitLock<folly::MicroSpinLock>>(iters);
}

BENCHMARK(PicoSpinLockUncontendedBenchmark, iters) {
  runUncontended<InitLock<folly::PicoSpinLock<std::uint16_t>>>(iters);
}

BENCHMARK(MicroLockUncontendedBenchmark, iters) {
  runUncontended<folly::MicroLock>(iters);
}

BENCHMARK(SharedMutexUncontendedBenchmark, iters) {
  runUncontended<folly::SharedMutex>(iters);
}

BENCHMARK(DistributedMutexUncontendedBenchmark, iters) {
  runUncontended<folly::DistributedMutex>(iters);
}

// Single-threaded cost of an atomic fetch_add(1), for comparison with the
// uncontended lock benchmarks.
BENCHMARK(AtomicFetchAddUncontendedBenchmark, iters) {
  std::atomic<uint64_t> counter{0};
  while (iters--) {
    const auto previous = counter.fetch_add(1);
    folly::doNotOptimizeAway(previous);
  }
}

// Minimal polymorphic interface with a single pure virtual method.
struct VirtualBase {
  virtual void foo() = 0;
  virtual ~VirtualBase() = default;
};

// Concrete implementation whose foo() does nothing.
struct VirtualImpl : VirtualBase {
  void foo() override {}
  ~VirtualImpl() override = default;
};

// The virtual-call benchmark is compiled only when __clang__ is not defined.
// NOTE(review): presumably because the `noclone` attribute used below is
// GCC-specific -- confirm.
#ifndef __clang__
// Returns a heap-allocated VirtualImpl through a base-class pointer.  The
// noinline/noclone attributes are presumably there so the compiler cannot see
// the dynamic type at the call site and devirtualize foo().  The caller owns
// the returned object.
__attribute__((noinline, noclone)) VirtualBase* makeVirtual() {
  return new VirtualImpl();
}

// Calls foo() `iters` times through the base-class pointer.
BENCHMARK(VirtualFunctionCall, iters) {
  VirtualBase* vb = makeVirtual();
  while (iters--) {
    vb->foo();
  }
  delete vb;
}
#endif

BENCHMARK_DRAW_LINE();

// Shorthands used by the registration tables below: BENCH_BASE expands to
// BENCHMARK_NAMED_PARAM and BENCH_REL to BENCHMARK_RELATIVE_NAMED_PARAM, with
// the arguments passed through unchanged via FB_VA_GLUE.
#define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__))
#define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__))

// Named entry points for the contended benchmarks.  Each one runs
// runContended() for one lock type with the default std::uint64_t payload and
// the default critical-section size (FLAGS_work).
static void std_mutex(size_t numOps, size_t numThreads) {
  runContended<std::mutex>(numOps, numThreads);
}
static void std_shared_mutex(size_t numOps, size_t numThreads) {
  runContended<std::shared_mutex>(numOps, numThreads);
}
static void folly_microspin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::MicroSpinLock>>(numOps, numThreads);
}
static void folly_picospin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::PicoSpinLock<uint16_t>>>(numOps, numThreads);
}
static void folly_microlock(size_t numOps, size_t numThreads) {
  runContended<folly::MicroLock>(numOps, numThreads);
}
static void folly_sharedmutex(size_t numOps, size_t numThreads) {
  runContended<folly::SharedMutex>(numOps, numThreads);
}
static void folly_distributedmutex(size_t numOps, size_t numThreads) {
  runContended<folly::DistributedMutex>(numOps, numThreads);
}
// Same mutex as above, but driven through lock_combine() (see lock_and()).
static void folly_distributedmutex_combining(size_t ops, size_t threads) {
  runContended<DistributedMutexFlatCombining>(ops, threads);
}
static void folly_flatcombining_no_caching(size_t numOps, size_t numThreads) {
  runContended<FlatCombiningMutexNoCaching>(numOps, numThreads);
}
static void folly_flatcombining_caching(size_t numOps, size_t numThreads) {
  runContended<FlatCombiningMutexCaching>(numOps, numThreads);
}

// "Simple" variants: no burn cycles inside the critical section (work = 0).
// The lock-based ones protect an Ints payload.
static void std_mutex_simple(size_t numOps, size_t numThreads) {
  runContended<std::mutex, Ints>(numOps, numThreads, 0);
}
static void std_shared_mutex_simple(size_t numOps, size_t numThreads) {
  runContended<std::shared_mutex, Ints>(numOps, numThreads, 0);
}
static void folly_microspin_simple(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::MicroSpinLock>, Ints>(numOps, numThreads, 0);
}
static void folly_picospin_simple(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::PicoSpinLock<uint16_t>>, Ints>(
      numOps, numThreads, 0);
}
static void folly_microlock_simple(size_t numOps, size_t numThreads) {
  runContended<folly::MicroLock, Ints>(numOps, numThreads, 0);
}
static void folly_sharedmutex_simple(size_t numOps, size_t numThreads) {
  runContended<folly::SharedMutex, Ints>(numOps, numThreads, 0);
}
static void folly_distributedmutex_simple(size_t numOps, size_t numThreads) {
  runContended<folly::DistributedMutex, Ints>(numOps, numThreads, 0);
}
static void folly_distributedmutex_combining_simple(size_t o, size_t t) {
  runContended<DistributedMutexFlatCombining, Ints>(o, t, 0);
}
// Lock-free baselines: NoLock makes the "critical section" run unprotected,
// and the payload types synchronize themselves with atomic operations.
static void atomics_fetch_add(size_t numOps, size_t numThreads) {
  runContended<NoLock, AtomicsAdd>(numOps, numThreads, 0);
}
static void atomic_fetch_xor(size_t numOps, size_t numThreads) {
  runContended<NoLock, AtomicFetchXor>(numOps, numThreads, 0);
}
static void atomic_cas(size_t numOps, size_t numThreads) {
  runContended<NoLock, AtomicCas>(numOps, numThreads, 0);
}
static void folly_flatcombining_no_caching_simple(size_t ops, size_t threads) {
  runContended<FlatCombiningMutexNoCaching>(ops, threads, 0);
}
static void folly_flatcombining_caching_simple(size_t ops, size_t threads) {
  runContended<FlatCombiningMutexCaching>(ops, threads, 0);
}

// Registration table for the contended benchmarks with the default
// critical-section size, one group per thread count from 1 to 128.  The last
// macro argument is forwarded to the wrapper (presumably as numThreads, after
// the iteration count supplied by the framework).
// NOTE(review): std_mutex and std_shared_mutex are both registered with
// BENCH_BASE, so the BENCH_REL rows that follow are presumably reported
// relative to std_shared_mutex rather than std_mutex -- confirm this is
// intended.
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 1thread, 1)
BENCH_BASE(std_shared_mutex, 1thread, 1)
BENCH_REL(folly_microspin, 1thread, 1)
BENCH_REL(folly_picospin, 1thread, 1)
BENCH_REL(folly_microlock, 1thread, 1)
BENCH_REL(folly_sharedmutex, 1thread, 1)
BENCH_REL(folly_distributedmutex, 1thread, 1)
BENCH_REL(folly_distributedmutex_combining, 1thread, 1)
BENCH_REL(folly_flatcombining_no_caching, 1thread, 1)
BENCH_REL(folly_flatcombining_caching, 1thread, 1)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 2thread, 2)
BENCH_BASE(std_shared_mutex, 2thread, 2)
BENCH_REL(folly_microspin, 2thread, 2)
BENCH_REL(folly_picospin, 2thread, 2)
BENCH_REL(folly_microlock, 2thread, 2)
BENCH_REL(folly_sharedmutex, 2thread, 2)
BENCH_REL(folly_distributedmutex, 2thread, 2)
BENCH_REL(folly_distributedmutex_combining, 2thread, 2)
BENCH_REL(folly_flatcombining_no_caching, 2thread, 2)
BENCH_REL(folly_flatcombining_caching, 2thread, 2)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 4thread, 4)
BENCH_BASE(std_shared_mutex, 4thread, 4)
BENCH_REL(folly_microspin, 4thread, 4)
BENCH_REL(folly_picospin, 4thread, 4)
BENCH_REL(folly_microlock, 4thread, 4)
BENCH_REL(folly_sharedmutex, 4thread, 4)
BENCH_REL(folly_distributedmutex, 4thread, 4)
BENCH_REL(folly_distributedmutex_combining, 4thread, 4)
BENCH_REL(folly_flatcombining_no_caching, 4thread, 4)
BENCH_REL(folly_flatcombining_caching, 4thread, 4)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 8thread, 8)
BENCH_BASE(std_shared_mutex, 8thread, 8)
BENCH_REL(folly_microspin, 8thread, 8)
BENCH_REL(folly_picospin, 8thread, 8)
BENCH_REL(folly_microlock, 8thread, 8)
BENCH_REL(folly_sharedmutex, 8thread, 8)
BENCH_REL(folly_distributedmutex, 8thread, 8)
BENCH_REL(folly_distributedmutex_combining, 8thread, 8)
BENCH_REL(folly_flatcombining_no_caching, 8thread, 8)
BENCH_REL(folly_flatcombining_caching, 8thread, 8)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 16thread, 16)
BENCH_BASE(std_shared_mutex, 16thread, 16)
BENCH_REL(folly_microspin, 16thread, 16)
BENCH_REL(folly_picospin, 16thread, 16)
BENCH_REL(folly_microlock, 16thread, 16)
BENCH_REL(folly_sharedmutex, 16thread, 16)
BENCH_REL(folly_distributedmutex, 16thread, 16)
BENCH_REL(folly_distributedmutex_combining, 16thread, 16)
BENCH_REL(folly_flatcombining_no_caching, 16thread, 16)
BENCH_REL(folly_flatcombining_caching, 16thread, 16)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 32thread, 32)
BENCH_BASE(std_shared_mutex, 32thread, 32)
BENCH_REL(folly_microspin, 32thread, 32)
BENCH_REL(folly_picospin, 32thread, 32)
BENCH_REL(folly_microlock, 32thread, 32)
BENCH_REL(folly_sharedmutex, 32thread, 32)
BENCH_REL(folly_distributedmutex, 32thread, 32)
BENCH_REL(folly_distributedmutex_combining, 32thread, 32)
BENCH_REL(folly_flatcombining_no_caching, 32thread, 32)
BENCH_REL(folly_flatcombining_caching, 32thread, 32)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 64thread, 64)
BENCH_BASE(std_shared_mutex, 64thread, 64)
BENCH_REL(folly_microspin, 64thread, 64)
BENCH_REL(folly_picospin, 64thread, 64)
BENCH_REL(folly_microlock, 64thread, 64)
BENCH_REL(folly_sharedmutex, 64thread, 64)
BENCH_REL(folly_distributedmutex, 64thread, 64)
BENCH_REL(folly_distributedmutex_combining, 64thread, 64)
BENCH_REL(folly_flatcombining_no_caching, 64thread, 64)
BENCH_REL(folly_flatcombining_caching, 64thread, 64)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 128thread, 128)
BENCH_BASE(std_shared_mutex, 128thread, 128)
BENCH_REL(folly_microspin, 128thread, 128)
BENCH_REL(folly_picospin, 128thread, 128)
BENCH_REL(folly_microlock, 128thread, 128)
BENCH_REL(folly_sharedmutex, 128thread, 128)
BENCH_REL(folly_distributedmutex, 128thread, 128)
BENCH_REL(folly_distributedmutex_combining, 128thread, 128)
BENCH_REL(folly_flatcombining_no_caching, 128thread, 128)
BENCH_REL(folly_flatcombining_caching, 128thread, 128)

// Registration table for the "simple" (work = 0) benchmarks, including the
// lock-free atomic baselines, one group per thread count from 1 to 128.
// NOTE(review): as in the table for the default critical-section size, both
// std_mutex_simple and std_shared_mutex_simple use BENCH_BASE, so the
// BENCH_REL rows are presumably relative to std_shared_mutex_simple --
// confirm this is intended.
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex_simple, 1thread, 1)
BENCH_BASE(std_shared_mutex_simple, 1thread, 1)
BENCH_REL(folly_microspin_simple, 1thread, 1)
BENCH_REL(folly_picospin_simple, 1thread, 1)
BENCH_REL(folly_microlock_simple, 1thread, 1)
BENCH_REL(folly_sharedmutex_simple, 1thread, 1)
BENCH_REL(folly_distributedmutex_simple, 1thread, 1)
BENCH_REL(folly_distributedmutex_combining_simple, 1thread, 1)
BENCH_REL(folly_flatcombining_no_caching_simple, 1thread, 1)
BENCH_REL(folly_flatcombining_caching_simple, 1thread, 1)
BENCH_REL(atomics_fetch_add, 1thread, 1)
BENCH_REL(atomic_fetch_xor, 1thread, 1)
BENCH_REL(atomic_cas, 1thread, 1)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex_simple, 2thread, 2)
BENCH_BASE(std_shared_mutex_simple, 2thread, 2)
BENCH_REL(folly_microspin_simple, 2thread, 2)
BENCH_REL(folly_picospin_simple, 2thread, 2)
BENCH_REL(folly_microlock_simple, 2thread, 2)
BENCH_REL(folly_sharedmutex_simple, 2thread, 2)
BENCH_REL(folly_distributedmutex_simple, 2thread, 2)
BENCH_REL(folly_distributedmutex_combining_simple, 2thread, 2)
BENCH_REL(folly_flatcombining_no_caching_simple, 2thread, 2)
BENCH_REL(folly_flatcombining_caching_simple, 2thread, 2)
BENCH_REL(atomics_fetch_add, 2thread, 2)
BENCH_REL(atomic_fetch_xor, 2thread, 2)
BENCH_REL(atomic_cas, 2thread, 2)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex_simple, 4thread, 4)
BENCH_BASE(std_shared_mutex_simple, 4thread, 4)
BENCH_REL(folly_microspin_simple, 4thread, 4)
BENCH_REL(folly_picospin_simple, 4thread, 4)
BENCH_REL(folly_microlock_simple, 4thread, 4)
BENCH_REL(folly_sharedmutex_simple, 4thread, 4)
BENCH_REL(folly_distributedmutex_simple, 4thread, 4)
BENCH_REL(folly_distributedmutex_combining_simple, 4thread, 4)
BENCH_REL(folly_flatcombining_no_caching_simple, 4thread, 4)
BENCH_REL(folly_flatcombining_caching_simple, 4thread, 4)
BENCH_REL(atomics_fetch_add, 4thread, 4)
BENCH_REL(atomic_fetch_xor, 4thread, 4)
BENCH_REL(atomic_cas, 4thread, 4)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex_simple, 8thread, 8)
BENCH_BASE(std_shared_mutex_simple, 8thread, 8)
BENCH_REL(folly_microspin_simple, 8thread, 8)
BENCH_REL(folly_picospin_simple, 8thread, 8)
BENCH_REL(folly_microlock_simple, 8thread, 8)
BENCH_REL(folly_sharedmutex_simple, 8thread, 8)
BENCH_REL(folly_distributedmutex_simple, 8thread, 8)
BENCH_REL(folly_distributedmutex_combining_simple, 8thread, 8)
BENCH_REL(folly_flatcombining_no_caching_simple, 8thread, 8)
BENCH_REL(folly_flatcombining_caching_simple, 8thread, 8)
BENCH_REL(atomics_fetch_add, 8thread, 8)
BENCH_REL(atomic_fetch_xor, 8thread, 8)
BENCH_REL(atomic_cas, 8thread, 8)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex_simple, 16thread, 16)
BENCH_BASE(std_shared_mutex_simple, 16thread, 16)
BENCH_REL(folly_microspin_simple, 16thread, 16)
BENCH_REL(folly_picospin_simple, 16thread, 16)
BENCH_REL(folly_microlock_simple, 16thread, 16)
BENCH_REL(folly_sharedmutex_simple, 16thread, 16)
BENCH_REL(folly_distributedmutex_simple, 16thread, 16)
BENCH_REL(folly_distributedmutex_combining_simple, 16thread, 16)
BENCH_REL(folly_flatcombining_no_caching_simple, 16thread, 16)
BENCH_REL(folly_flatcombining_caching_simple, 16thread, 16)
BENCH_REL(atomics_fetch_add, 16thread, 16)
BENCH_REL(atomic_fetch_xor, 16thread, 16)
BENCH_REL(atomic_cas, 16thread, 16)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex_simple, 32thread, 32)
BENCH_BASE(std_shared_mutex_simple, 32thread, 32)
BENCH_REL(folly_microspin_simple, 32thread, 32)
BENCH_REL(folly_picospin_simple, 32thread, 32)
BENCH_REL(folly_microlock_simple, 32thread, 32)
BENCH_REL(folly_sharedmutex_simple, 32thread, 32)
BENCH_REL(folly_distributedmutex_simple, 32thread, 32)
BENCH_REL(folly_distributedmutex_combining_simple, 32thread, 32)
BENCH_REL(folly_flatcombining_no_caching_simple, 32thread, 32)
BENCH_REL(folly_flatcombining_caching_simple, 32thread, 32)
BENCH_REL(atomics_fetch_add, 32thread, 32)
BENCH_REL(atomic_fetch_xor, 32thread, 32)
BENCH_REL(atomic_cas, 32thread, 32)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex_simple, 64thread, 64)
BENCH_BASE(std_shared_mutex_simple, 64thread, 64)
BENCH_REL(folly_microspin_simple, 64thread, 64)
BENCH_REL(folly_picospin_simple, 64thread, 64)
BENCH_REL(folly_microlock_simple, 64thread, 64)
BENCH_REL(folly_sharedmutex_simple, 64thread, 64)
BENCH_REL(folly_distributedmutex_simple, 64thread, 64)
BENCH_REL(folly_distributedmutex_combining_simple, 64thread, 64)
BENCH_REL(folly_flatcombining_no_caching_simple, 64thread, 64)
BENCH_REL(folly_flatcombining_caching_simple, 64thread, 64)
BENCH_REL(atomics_fetch_add, 64thread, 64)
BENCH_REL(atomic_fetch_xor, 64thread, 64)
BENCH_REL(atomic_cas, 64thread, 64)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex_simple, 128thread, 128)
BENCH_BASE(std_shared_mutex_simple, 128thread, 128)
BENCH_REL(folly_microspin_simple, 128thread, 128)
BENCH_REL(folly_picospin_simple, 128thread, 128)
BENCH_REL(folly_microlock_simple, 128thread, 128)
BENCH_REL(folly_sharedmutex_simple, 128thread, 128)
BENCH_REL(folly_distributedmutex_simple, 128thread, 128)
BENCH_REL(folly_distributedmutex_combining_simple, 128thread, 128)
BENCH_REL(folly_flatcombining_no_caching_simple, 128thread, 128)
BENCH_REL(folly_flatcombining_caching_simple, 128thread, 128)
BENCH_REL(atomics_fetch_add, 128thread, 128)
BENCH_REL(atomic_fetch_xor, 128thread, 128)
BENCH_REL(atomic_cas, 128thread, 128)

// Prints a header line naming the lock type and thread count, then runs the
// fairness test for `Mutex`.
template <typename Mutex>
void fairnessTest(std::string type, std::size_t numThreads) {
  std::cout << "------- " << type << " " << numThreads << " threads"
            << std::endl;
  runFairness<Mutex>(numThreads);
}

// Entry point: parses the gflags command line, optionally runs the fairness
// tests for every lock type at each thread count, then runs the registered
// benchmarks.
int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  if (FLAGS_run_fairness) {
    const std::array<int, 6> threadCounts{{2, 4, 8, 16, 32, 64}};
    for (const int numThreads : threadCounts) {
      fairnessTest<std::mutex>("std::mutex", numThreads);
      fairnessTest<std::shared_mutex>("std::shared_mutex", numThreads);
      fairnessTest<InitLock<folly::MicroSpinLock>>(
          "folly::MicroSpinLock", numThreads);
      fairnessTest<InitLock<folly::PicoSpinLock<std::uint16_t>>>(
          "folly::PicoSpinLock<std::uint16_t>", numThreads);
      fairnessTest<folly::MicroLock>("folly::MicroLock", numThreads);
      fairnessTest<folly::SharedMutex>("folly::SharedMutex", numThreads);
      fairnessTest<folly::DistributedMutex>(
          "folly::DistributedMutex", numThreads);
      fairnessTest<DistributedMutexFlatCombining>(
          "folly::DistributedMutex (Combining)", numThreads);

      // Separator between thread-count groups.
      std::cout << std::string(76, '=') << std::endl;
    }
  }

  folly::runBenchmarks();

  return 0;
}

/*
./small_locks_benchmark --bm_min_iters=100000
Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz

------- std::mutex 2 threads
Sum: 361854376 Mean: 6461685 stddev: 770837
Lock time stats in us: mean 0 stddev 1 max 63002
------- folly::MicroSpinLock 2 threads
Sum: 454928254 Mean: 8123718 stddev: 1568978
Lock time stats in us: mean 0 stddev 9 max 118006
------- folly::PicoSpinLock<std::uint16_t> 2 threads
Sum: 376990850 Mean: 6731979 stddev: 1295859
Lock time stats in us: mean 0 stddev 1 max 83007
------- folly::MicroLock 2 threads
Sum: 316081944 Mean: 5644320 stddev: 1249068
Lock time stats in us: mean 0 stddev 13 max 53930
------- folly::SharedMutex 2 threads
Sum: 389298695 Mean: 6951762 stddev: 3031794
Lock time stats in us: mean 0 stddev 2 max 55004
------- folly::DistributedMutex 2 threads
Sum: 512343772 Mean: 9148995 stddev: 1168346
Lock time stats in us: mean 0 stddev 8 max 50830
------- folly::DistributedMutex (Combining) 2 threads
Sum: 475079423 Mean: 8483561 stddev: 899288
Lock time stats in us: mean 0 stddev 1 max 26006
============================================================================
------- std::mutex 4 threads
Sum: 164126417 Mean: 2930828 stddev: 208327
Lock time stats in us: mean 0 stddev 2 max 11759
------- folly::MicroSpinLock 4 threads
Sum: 168795789 Mean: 3014210 stddev: 825455
Lock time stats in us: mean 0 stddev 3 max 152163
------- folly::PicoSpinLock<std::uint16_t> 4 threads
Sum: 125788231 Mean: 2246218 stddev: 755074
Lock time stats in us: mean 1 stddev 3 max 151004
------- folly::MicroLock 4 threads
Sum: 109091138 Mean: 1948056 stddev: 465388
Lock time stats in us: mean 1 stddev 39 max 60029
------- folly::SharedMutex 4 threads
Sum: 107870343 Mean: 1926256 stddev: 1039541
Lock time stats in us: mean 1 stddev 2 max 57002
------- folly::DistributedMutex 4 threads
Sum: 207229191 Mean: 3700521 stddev: 182811
Lock time stats in us: mean 0 stddev 21 max 16231
------- folly::DistributedMutex (Combining) 4 threads
Sum: 204144735 Mean: 3645441 stddev: 619224
Lock time stats in us: mean 0 stddev 0 max 27008
============================================================================
------- std::mutex 8 threads
Sum: 82709846 Mean: 1476961 stddev: 173483
Lock time stats in us: mean 2 stddev 52 max 9404
------- folly::MicroSpinLock 8 threads
Sum: 94805197 Mean: 1692949 stddev: 633249
Lock time stats in us: mean 1 stddev 3 max 104517
------- folly::PicoSpinLock<std::uint16_t> 8 threads
Sum: 41587796 Mean: 742639 stddev: 191868
Lock time stats in us: mean 4 stddev 103 max 317025
------- folly::MicroLock 8 threads
Sum: 42414128 Mean: 757395 stddev: 234934
Lock time stats in us: mean 4 stddev 101 max 39660
------- folly::SharedMutex 8 threads
Sum: 58861445 Mean: 1051097 stddev: 491231
Lock time stats in us: mean 3 stddev 73 max 34007
------- folly::DistributedMutex 8 threads
Sum: 93377108 Mean: 1667448 stddev: 113502
Lock time stats in us: mean 1 stddev 46 max 11075
------- folly::DistributedMutex (Combining) 8 threads
Sum: 131093487 Mean: 2340955 stddev: 187841
Lock time stats in us: mean 1 stddev 3 max 25004
============================================================================
------- std::mutex 16 threads
Sum: 36606221 Mean: 653682 stddev: 65154
Lock time stats in us: mean 5 stddev 117 max 13603
------- folly::MicroSpinLock 16 threads
Sum: 27935153 Mean: 498842 stddev: 197304
Lock time stats in us: mean 7 stddev 3 max 257433
------- folly::PicoSpinLock<std::uint16_t> 16 threads
Sum: 12265416 Mean: 219025 stddev: 146399
Lock time stats in us: mean 17 stddev 350 max 471793
------- folly::MicroLock 16 threads
Sum: 18180611 Mean: 324653 stddev: 32123
Lock time stats in us: mean 11 stddev 236 max 40166
------- folly::SharedMutex 16 threads
Sum: 21734734 Mean: 388120 stddev: 190252
Lock time stats in us: mean 9 stddev 197 max 107045
------- folly::DistributedMutex 16 threads
Sum: 42823745 Mean: 764709 stddev: 64251
Lock time stats in us: mean 4 stddev 100 max 19986
------- folly::DistributedMutex (Combining) 16 threads
Sum: 63515255 Mean: 1134200 stddev: 37905
Lock time stats in us: mean 2 stddev 3 max 32005
============================================================================
------- std::mutex 32 threads
Sum: 10307832 Mean: 184068 stddev: 2431
Lock time stats in us: mean 21 stddev 416 max 18397
------- folly::MicroSpinLock 32 threads
Sum: 7318139 Mean: 130681 stddev: 24742
Lock time stats in us: mean 29 stddev 586 max 230672
------- folly::PicoSpinLock<std::uint16_t> 32 threads
Sum: 6424015 Mean: 114714 stddev: 138460
Lock time stats in us: mean 34 stddev 668 max 879632
------- folly::MicroLock 32 threads
Sum: 4893744 Mean: 87388 stddev: 6935
Lock time stats in us: mean 45 stddev 876 max 14902
------- folly::SharedMutex 32 threads
Sum: 6393363 Mean: 114167 stddev: 80211
Lock time stats in us: mean 34 stddev 671 max 75777
------- folly::DistributedMutex 32 threads
Sum: 14394775 Mean: 257049 stddev: 36723
Lock time stats in us: mean 15 stddev 298 max 54654
------- folly::DistributedMutex (Combining) 32 threads
Sum: 24232845 Mean: 432729 stddev: 11398
Lock time stats in us: mean 8 stddev 177 max 35008
============================================================================
------- std::mutex 64 threads
Sum: 10656640 Mean: 166510 stddev: 3340
Lock time stats in us: mean 23 stddev 402 max 10797
------- folly::MicroSpinLock 64 threads
Sum: 23284721 Mean: 363823 stddev: 62670
Lock time stats in us: mean 10 stddev 184 max 168470
------- folly::PicoSpinLock<std::uint16_t> 64 threads
Sum: 2322545 Mean: 36289 stddev: 6272
Lock time stats in us: mean 109 stddev 1846 max 1157157
------- folly::MicroLock 64 threads
Sum: 4835136 Mean: 75549 stddev: 3484
Lock time stats in us: mean 52 stddev 887 max 23895
------- folly::SharedMutex 64 threads
Sum: 7047147 Mean: 110111 stddev: 53207
Lock time stats in us: mean 35 stddev 608 max 85181
------- folly::DistributedMutex 64 threads
Sum: 14491662 Mean: 226432 stddev: 27098
Lock time stats in us: mean 17 stddev 296 max 55078
------- folly::DistributedMutex (Combining) 64 threads
Sum: 23885026 Mean: 373203 stddev: 14431
Lock time stats in us: mean 10 stddev 179 max 62008
============================================================================
============================================================================
folly/synchronization/test/SmallLocksBenchmark.cpprelative  time/iter  iters/s
============================================================================
StdMutexUncontendedBenchmark                                16.42ns   60.90M
MicroSpinLockUncontendedBenchmark                           10.95ns   91.33M
PicoSpinLockUncontendedBenchmark                            20.38ns   49.07M
MicroLockUncontendedBenchmark                               28.92ns   34.58M
SharedMutexUncontendedBenchmark                             19.47ns   51.36M
DistributedMutexUncontendedBenchmark                        28.89ns   34.62M
AtomicFetchAddUncontendedBenchmark                           5.47ns  182.91M
----------------------------------------------------------------------------
----------------------------------------------------------------------------
std_mutex(1thread)                                         900.28ns    1.11M
folly_microspin(1thread)                         109.53%   821.97ns    1.22M
folly_picospin(1thread)                          101.86%   883.88ns    1.13M
folly_microlock(1thread)                         102.54%   878.02ns    1.14M
folly_sharedmutex(1thread)                       132.03%   681.86ns    1.47M
folly_distributedmutex(1thread)                  129.50%   695.23ns    1.44M
folly_distributedmutex_combining(1thread)        130.73%   688.68ns    1.45M
folly_flatcombining_no_caching(1thread)          106.73%   843.49ns    1.19M
folly_flatcombining_caching(1thread)             125.22%   718.96ns    1.39M
----------------------------------------------------------------------------
std_mutex(2thread)                                           1.27us  784.90K
folly_microspin(2thread)                         147.93%   861.24ns    1.16M
folly_picospin(2thread)                          146.10%   872.06ns    1.15M
folly_microlock(2thread)                         131.35%   970.00ns    1.03M
folly_sharedmutex(2thread)                       135.07%   943.23ns    1.06M
folly_distributedmutex(2thread)                  135.88%   937.63ns    1.07M
folly_distributedmutex_combining(2thread)        130.37%   977.27ns    1.02M
folly_flatcombining_no_caching(2thread)           85.64%     1.49us  672.22K
folly_flatcombining_caching(2thread)              91.98%     1.39us  721.93K
----------------------------------------------------------------------------
std_mutex(4thread)                                           2.40us  417.44K
folly_microspin(4thread)                         101.55%     2.36us  423.92K
folly_picospin(4thread)                           97.89%     2.45us  408.64K
folly_microlock(4thread)                          79.64%     3.01us  332.45K
folly_sharedmutex(4thread)                        75.10%     3.19us  313.49K
folly_distributedmutex(4thread)                  126.16%     1.90us  526.63K
folly_distributedmutex_combining(4thread)        166.56%     1.44us  695.28K
folly_flatcombining_no_caching(4thread)           91.79%     2.61us  383.17K
folly_flatcombining_caching(4thread)             103.95%     2.30us  433.95K
----------------------------------------------------------------------------
std_mutex(8thread)                                           4.85us  206.37K
folly_microspin(8thread)                         105.28%     4.60us  217.28K
folly_picospin(8thread)                           89.06%     5.44us  183.80K
folly_microlock(8thread)                          73.95%     6.55us  152.62K
folly_sharedmutex(8thread)                        67.17%     7.21us  138.62K
folly_distributedmutex(8thread)                  162.16%     2.99us  334.66K
folly_distributedmutex_combining(8thread)        251.93%     1.92us  519.92K
folly_flatcombining_no_caching(8thread)          141.99%     3.41us  293.02K
folly_flatcombining_caching(8thread)             166.26%     2.91us  343.12K
----------------------------------------------------------------------------
std_mutex(16thread)                                         11.36us   88.01K
folly_microspin(16thread)                        102.73%    11.06us   90.42K
folly_picospin(16thread)                          44.00%    25.83us   38.72K
folly_microlock(16thread)                         52.42%    21.67us   46.14K
folly_sharedmutex(16thread)                       53.46%    21.26us   47.05K
folly_distributedmutex(16thread)                 166.17%     6.84us  146.24K
folly_distributedmutex_combining(16thread)       352.82%     3.22us  310.52K
folly_flatcombining_no_caching(16thread)         218.07%     5.21us  191.92K
folly_flatcombining_caching(16thread)            217.69%     5.22us  191.58K
----------------------------------------------------------------------------
std_mutex(32thread)                                         32.12us   31.13K
folly_microspin(32thread)                        104.52%    30.74us   32.54K
folly_picospin(32thread)                          32.81%    97.91us   10.21K
folly_microlock(32thread)                         57.40%    55.96us   17.87K
folly_sharedmutex(32thread)                       63.68%    50.45us   19.82K
folly_distributedmutex(32thread)                 180.17%    17.83us   56.09K
folly_distributedmutex_combining(32thread)       394.34%     8.15us  122.76K
folly_flatcombining_no_caching(32thread)         216.41%    14.84us   67.37K
folly_flatcombining_caching(32thread)            261.99%    12.26us   81.56K
----------------------------------------------------------------------------
std_mutex(64thread)                                         36.76us   27.20K
folly_microspin(64thread)                        112.14%    32.78us   30.51K
folly_picospin(64thread)                          32.34%   113.65us    8.80K
folly_microlock(64thread)                         57.21%    64.26us   15.56K
folly_sharedmutex(64thread)                       60.93%    60.33us   16.57K
folly_distributedmutex(64thread)                 179.79%    20.45us   48.91K
folly_distributedmutex_combining(64thread)       392.64%     9.36us  106.81K
folly_flatcombining_no_caching(64thread)         211.85%    17.35us   57.63K
folly_flatcombining_caching(64thread)            241.45%    15.22us   65.68K
----------------------------------------------------------------------------
std_mutex(128thread)                                        73.05us   13.69K
folly_microspin(128thread)                        97.45%    74.96us   13.34K
folly_picospin(128thread)                         31.46%   232.19us    4.31K
folly_microlock(128thread)                        56.50%   129.29us    7.73K
folly_sharedmutex(128thread)                      59.54%   122.69us    8.15K
folly_distributedmutex(128thread)                166.59%    43.85us   22.80K
folly_distributedmutex_combining(128thread)      379.86%    19.23us   52.00K
folly_flatcombining_no_caching(128thread)        179.10%    40.79us   24.52K
folly_flatcombining_caching(128thread)           189.64%    38.52us   25.96K
----------------------------------------------------------------------------
std_mutex_simple(1thread)                                  666.33ns    1.50M
folly_microspin_simple(1thread)                  109.80%   606.87ns    1.65M
folly_picospin_simple(1thread)                   108.89%   611.94ns    1.63M
folly_microlock_simple(1thread)                  108.42%   614.59ns    1.63M
folly_sharedmutex_simple(1thread)                 93.00%   716.47ns    1.40M
folly_distributedmutex_simple(1thread)            90.08%   739.68ns    1.35M
folly_distributedmutex_combining_simple(1thread   90.20%   738.73ns    1.35M
folly_flatcombining_no_caching_simple(1thread)    98.04%   679.68ns    1.47M
folly_flatcombining_caching_simple(1thread)      105.59%   631.04ns    1.58M
atomics_fetch_add(1thread)                       108.30%   615.29ns    1.63M
atomic_fetch_xor(1thread)                        110.52%   602.90ns    1.66M
atomic_cas(1thread)                              109.86%   606.52ns    1.65M
----------------------------------------------------------------------------
std_mutex_simple(2thread)                                    1.19us  841.25K
folly_microspin_simple(2thread)                  130.73%   909.27ns    1.10M
folly_picospin_simple(2thread)                   112.39%     1.06us  945.48K
folly_microlock_simple(2thread)                  113.89%     1.04us  958.14K
folly_sharedmutex_simple(2thread)                119.48%   994.86ns    1.01M
folly_distributedmutex_simple(2thread)           112.44%     1.06us  945.91K
folly_distributedmutex_combining_simple(2thread  123.12%   965.48ns    1.04M
folly_flatcombining_no_caching_simple(2thread)    90.56%     1.31us  761.82K
folly_flatcombining_caching_simple(2thread)      100.66%     1.18us  846.83K
atomics_fetch_add(2thread)                       119.15%   997.67ns    1.00M
atomic_fetch_xor(2thread)                        179.85%   660.93ns    1.51M
atomic_cas(2thread)                              179.40%   662.58ns    1.51M
----------------------------------------------------------------------------
std_mutex_simple(4thread)                                    2.37us  422.81K
folly_microspin_simple(4thread)                  110.42%     2.14us  466.89K
folly_picospin_simple(4thread)                   111.77%     2.12us  472.58K
folly_microlock_simple(4thread)                   82.17%     2.88us  347.44K
folly_sharedmutex_simple(4thread)                 93.40%     2.53us  394.89K
folly_distributedmutex_simple(4thread)           121.00%     1.95us  511.58K
folly_distributedmutex_combining_simple(4thread  187.65%     1.26us  793.42K
folly_flatcombining_no_caching_simple(4thread)   104.81%     2.26us  443.13K
folly_flatcombining_caching_simple(4thread)      112.90%     2.09us  477.34K
atomics_fetch_add(4thread)                       178.61%     1.32us  755.20K
atomic_fetch_xor(4thread)                        323.62%   730.84ns    1.37M
atomic_cas(4thread)                              300.43%   787.23ns    1.27M
----------------------------------------------------------------------------
std_mutex_simple(8thread)                                    5.02us  199.09K
folly_microspin_simple(8thread)                  116.44%     4.31us  231.82K
folly_picospin_simple(8thread)                    80.84%     6.21us  160.94K
folly_microlock_simple(8thread)                   77.18%     6.51us  153.66K
folly_sharedmutex_simple(8thread)                 76.09%     6.60us  151.48K
folly_distributedmutex_simple(8thread)           145.27%     3.46us  289.21K
folly_distributedmutex_combining_simple(8thread  310.65%     1.62us  618.48K
folly_flatcombining_no_caching_simple(8thread)   139.83%     3.59us  278.39K
folly_flatcombining_caching_simple(8thread)      163.72%     3.07us  325.95K
atomics_fetch_add(8thread)                       337.67%     1.49us  672.28K
atomic_fetch_xor(8thread)                        380.66%     1.32us  757.87K
atomic_cas(8thread)                              238.04%     2.11us  473.93K
----------------------------------------------------------------------------
std_mutex_simple(16thread)                                  12.26us   81.59K
folly_microspin_simple(16thread)                 116.32%    10.54us   94.91K
folly_picospin_simple(16thread)                   53.67%    22.83us   43.79K
folly_microlock_simple(16thread)                  66.39%    18.46us   54.17K
folly_sharedmutex_simple(16thread)                65.00%    18.85us   53.04K
folly_distributedmutex_simple(16thread)          171.32%     7.15us  139.79K
folly_distributedmutex_combining_simple(16threa  445.11%     2.75us  363.17K
folly_flatcombining_no_caching_simple(16thread)  206.11%     5.95us  168.17K
folly_flatcombining_caching_simple(16thread)     245.09%     5.00us  199.97K
atomics_fetch_add(16thread)                      494.82%     2.48us  403.73K
atomic_fetch_xor(16thread)                       489.90%     2.50us  399.72K
atomic_cas(16thread)                             232.76%     5.27us  189.91K
----------------------------------------------------------------------------
std_mutex_simple(32thread)                                  30.28us   33.03K
folly_microspin_simple(32thread)                 102.20%    29.62us   33.76K
folly_picospin_simple(32thread)                   31.56%    95.92us   10.43K
folly_microlock_simple(32thread)                  53.99%    56.07us   17.83K
folly_sharedmutex_simple(32thread)                67.49%    44.86us   22.29K
folly_distributedmutex_simple(32thread)          161.63%    18.73us   53.38K
folly_distributedmutex_combining_simple(32threa  605.26%     5.00us  199.92K
folly_flatcombining_no_caching_simple(32thread)  234.62%    12.90us   77.49K
folly_flatcombining_caching_simple(32thread)     332.21%     9.11us  109.73K
atomics_fetch_add(32thread)                      909.18%     3.33us  300.30K
atomic_fetch_xor(32thread)                       779.56%     3.88us  257.49K
atomic_cas(32thread)                             622.19%     4.87us  205.51K
----------------------------------------------------------------------------
std_mutex_simple(64thread)                                  34.33us   29.13K
folly_microspin_simple(64thread)                  99.86%    34.37us   29.09K
folly_picospin_simple(64thread)                   31.37%   109.42us    9.14K
folly_microlock_simple(64thread)                  53.46%    64.21us   15.57K
folly_sharedmutex_simple(64thread)                62.94%    54.54us   18.33K
folly_distributedmutex_simple(64thread)          161.26%    21.29us   46.98K
folly_distributedmutex_combining_simple(64threa  603.87%     5.68us  175.91K
folly_flatcombining_no_caching_simple(64thread)  247.00%    13.90us   71.95K
folly_flatcombining_caching_simple(64thread)     310.66%    11.05us   90.50K
atomics_fetch_add(64thread)                      839.49%     4.09us  244.55K
atomic_fetch_xor(64thread)                       756.48%     4.54us  220.37K
atomic_cas(64thread)                             606.85%     5.66us  176.78K
----------------------------------------------------------------------------
std_mutex_simple(128thread)                                 67.35us   14.85K
folly_microspin_simple(128thread)                 92.58%    72.75us   13.75K
folly_picospin_simple(128thread)                  29.87%   225.47us    4.44K
folly_microlock_simple(128thread)                 52.52%   128.25us    7.80K
folly_sharedmutex_simple(128thread)               59.79%   112.64us    8.88K
folly_distributedmutex_simple(128thread)         151.27%    44.52us   22.46K
folly_distributedmutex_combining_simple(128thre  580.11%    11.61us   86.13K
folly_flatcombining_no_caching_simple(128thread  219.20%    30.73us   32.55K
folly_flatcombining_caching_simple(128thread)    225.39%    29.88us   33.46K
atomics_fetch_add(128thread)                     813.36%     8.28us  120.76K
atomic_fetch_xor(128thread)                      740.02%     9.10us  109.88K
atomic_cas(128thread)                            586.66%    11.48us   87.11K
============================================================================

./small_locks_benchmark --bm_min_iters=100000
Intel(R) Xeon(R) D-2191 CPU @ 1.60GHz

============================================================================
folly/synchronization/test/SmallLocksBenchmark.cpprelative  time/iter  iters/s
============================================================================
StdMutexUncontendedBenchmark                                37.65ns   26.56M
MicroSpinLockUncontendedBenchmark                           21.97ns   45.53M
PicoSpinLockUncontendedBenchmark                            40.80ns   24.51M
MicroLockUncontendedBenchmark                               57.76ns   17.31M
SharedMutexUncontendedBenchmark                             39.55ns   25.29M
DistributedMutexUncontendedBenchmark                        51.47ns   19.43M
AtomicFetchAddUncontendedBenchmark                          10.67ns   93.73M
----------------------------------------------------------------------------
----------------------------------------------------------------------------
std_mutex(1thread)                                           1.37us  730.43K
folly_microspin(1thread)                         102.06%     1.34us  745.45K
folly_picospin(1thread)                          100.68%     1.36us  735.43K
folly_microlock(1thread)                         104.27%     1.31us  761.64K
folly_sharedmutex(1thread)                       101.95%     1.34us  744.65K
folly_distributedmutex(1thread)                   98.63%     1.39us  720.41K
folly_distributedmutex_combining(1thread)        103.78%     1.32us  758.05K
folly_flatcombining_no_caching(1thread)           95.44%     1.43us  697.15K
folly_flatcombining_caching(1thread)              99.11%     1.38us  723.94K
----------------------------------------------------------------------------
std_mutex(2thread)                                           1.65us  605.36K
folly_microspin(2thread)                         119.82%     1.38us  725.35K
folly_picospin(2thread)                          112.46%     1.47us  680.81K
folly_microlock(2thread)                         106.47%     1.55us  644.54K
folly_sharedmutex(2thread)                       107.12%     1.54us  648.45K
folly_distributedmutex(2thread)                  110.80%     1.49us  670.76K
folly_distributedmutex_combining(2thread)         97.09%     1.70us  587.77K
folly_flatcombining_no_caching(2thread)           83.37%     1.98us  504.68K
folly_flatcombining_caching(2thread)             108.62%     1.52us  657.54K
----------------------------------------------------------------------------
std_mutex(4thread)                                           2.92us  341.96K
folly_microspin(4thread)                         165.47%     1.77us  565.85K
folly_picospin(4thread)                          181.92%     1.61us  622.09K
folly_microlock(4thread)                         149.83%     1.95us  512.35K
folly_sharedmutex(4thread)                       158.69%     1.84us  542.66K
folly_distributedmutex(4thread)                  107.42%     2.72us  367.35K
folly_distributedmutex_combining(4thread)        144.34%     2.03us  493.59K
folly_flatcombining_no_caching(4thread)           88.43%     3.31us  302.40K
folly_flatcombining_caching(4thread)              94.20%     3.10us  322.11K
----------------------------------------------------------------------------
std_mutex(8thread)                                           7.04us  142.02K
folly_microspin(8thread)                         134.72%     5.23us  191.32K
folly_picospin(8thread)                          112.37%     6.27us  159.58K
folly_microlock(8thread)                         109.65%     6.42us  155.71K
folly_sharedmutex(8thread)                       105.92%     6.65us  150.42K
folly_distributedmutex(8thread)                  127.22%     5.53us  180.67K
folly_distributedmutex_combining(8thread)        275.50%     2.56us  391.26K
folly_flatcombining_no_caching(8thread)          144.99%     4.86us  205.92K
folly_flatcombining_caching(8thread)             156.31%     4.50us  221.99K
----------------------------------------------------------------------------
std_mutex(16thread)                                         13.08us   76.44K
folly_microspin(16thread)                         91.47%    14.30us   69.92K
folly_picospin(16thread)                          67.95%    19.25us   51.94K
folly_microlock(16thread)                         73.57%    17.78us   56.24K
folly_sharedmutex(16thread)                       70.59%    18.53us   53.96K
folly_distributedmutex(16thread)                 139.74%     9.36us  106.82K
folly_distributedmutex_combining(16thread)       338.38%     3.87us  258.67K
folly_flatcombining_no_caching(16thread)         194.08%     6.74us  148.36K
folly_flatcombining_caching(16thread)            195.03%     6.71us  149.09K
----------------------------------------------------------------------------
std_mutex(32thread)                                         25.35us   39.45K
folly_microspin(32thread)                         73.81%    34.35us   29.11K
folly_picospin(32thread)                          50.66%    50.04us   19.98K
folly_microlock(32thread)                         58.40%    43.41us   23.03K
folly_sharedmutex(32thread)                       55.14%    45.98us   21.75K
folly_distributedmutex(32thread)                 141.36%    17.93us   55.76K
folly_distributedmutex_combining(32thread)       358.52%     7.07us  141.42K
folly_flatcombining_no_caching(32thread)         257.78%     9.83us  101.68K
folly_flatcombining_caching(32thread)            285.82%     8.87us  112.74K
----------------------------------------------------------------------------
std_mutex(64thread)                                         45.03us   22.21K
folly_microspin(64thread)                         75.05%    60.00us   16.67K
folly_picospin(64thread)                          44.98%   100.12us    9.99K
folly_microlock(64thread)                         56.99%    79.01us   12.66K
folly_sharedmutex(64thread)                       52.67%    85.49us   11.70K
folly_distributedmutex(64thread)                 139.71%    32.23us   31.02K
folly_distributedmutex_combining(64thread)       343.76%    13.10us   76.34K
folly_flatcombining_no_caching(64thread)         211.67%    21.27us   47.01K
folly_flatcombining_caching(64thread)            222.51%    20.24us   49.41K
----------------------------------------------------------------------------
std_mutex(128thread)                                        88.78us   11.26K
folly_microspin(128thread)                        71.00%   125.03us    8.00K
folly_picospin(128thread)                         30.97%   286.63us    3.49K
folly_microlock(128thread)                        54.37%   163.28us    6.12K
folly_sharedmutex(128thread)                      51.69%   171.76us    5.82K
folly_distributedmutex(128thread)                137.37%    64.63us   15.47K
folly_distributedmutex_combining(128thread)      281.23%    31.57us   31.68K
folly_flatcombining_no_caching(128thread)        136.61%    64.99us   15.39K
folly_flatcombining_caching(128thread)           152.32%    58.29us   17.16K
----------------------------------------------------------------------------
std_mutex_simple(1thread)                                    1.63us  611.75K
folly_microspin_simple(1thread)                  103.24%     1.58us  631.57K
folly_picospin_simple(1thread)                   109.17%     1.50us  667.87K
folly_microlock_simple(1thread)                  111.22%     1.47us  680.41K
folly_sharedmutex_simple(1thread)                136.79%     1.19us  836.83K
folly_distributedmutex_simple(1thread)           107.21%     1.52us  655.88K
folly_distributedmutex_combining_simple(1thread  134.79%     1.21us  824.61K
folly_flatcombining_no_caching_simple(1thread)   127.99%     1.28us  782.99K
folly_flatcombining_caching_simple(1thread)      133.87%     1.22us  818.93K
atomics_fetch_add(1thread)                       138.24%     1.18us  845.70K
atomic_fetch_xor(1thread)                        106.94%     1.53us  654.23K
atomic_cas(1thread)                              124.81%     1.31us  763.52K
----------------------------------------------------------------------------
std_mutex_simple(2thread)                                    1.60us  626.60K
folly_microspin_simple(2thread)                  111.88%     1.43us  701.02K
folly_picospin_simple(2thread)                   106.11%     1.50us  664.91K
folly_microlock_simple(2thread)                   88.90%     1.80us  557.04K
folly_sharedmutex_simple(2thread)                 90.93%     1.76us  569.79K
folly_distributedmutex_simple(2thread)            93.93%     1.70us  588.57K
folly_distributedmutex_combining_simple(2thread  106.86%     1.49us  669.61K
folly_flatcombining_no_caching_simple(2thread)    85.92%     1.86us  538.37K
folly_flatcombining_caching_simple(2thread)       98.82%     1.61us  619.24K
atomics_fetch_add(2thread)                       104.61%     1.53us  655.46K
atomic_fetch_xor(2thread)                        126.46%     1.26us  792.40K
atomic_cas(2thread)                              125.92%     1.27us  788.99K
----------------------------------------------------------------------------
std_mutex_simple(4thread)                                    2.71us  368.45K
folly_microspin_simple(4thread)                  146.48%     1.85us  539.69K
folly_picospin_simple(4thread)                   163.54%     1.66us  602.57K
folly_microlock_simple(4thread)                  113.17%     2.40us  416.99K
folly_sharedmutex_simple(4thread)                142.36%     1.91us  524.52K
folly_distributedmutex_simple(4thread)           108.22%     2.51us  398.74K
folly_distributedmutex_combining_simple(4thread  141.49%     1.92us  521.30K
folly_flatcombining_no_caching_simple(4thread)    97.27%     2.79us  358.38K
folly_flatcombining_caching_simple(4thread)      106.12%     2.56us  390.99K
atomics_fetch_add(4thread)                       151.10%     1.80us  556.73K
atomic_fetch_xor(4thread)                        213.14%     1.27us  785.32K
atomic_cas(4thread)                              218.93%     1.24us  806.65K
----------------------------------------------------------------------------
std_mutex_simple(8thread)                                    7.02us  142.50K
folly_microspin_simple(8thread)                  137.77%     5.09us  196.33K
folly_picospin_simple(8thread)                   119.78%     5.86us  170.69K
folly_microlock_simple(8thread)                  108.08%     6.49us  154.02K
folly_sharedmutex_simple(8thread)                114.77%     6.11us  163.55K
folly_distributedmutex_simple(8thread)           120.24%     5.84us  171.35K
folly_distributedmutex_combining_simple(8thread  316.54%     2.22us  451.07K
folly_flatcombining_no_caching_simple(8thread)   136.43%     5.14us  194.42K
folly_flatcombining_caching_simple(8thread)      145.04%     4.84us  206.68K
atomics_fetch_add(8thread)                       358.98%     1.95us  511.55K
atomic_fetch_xor(8thread)                        505.27%     1.39us  720.02K
atomic_cas(8thread)                              389.32%     1.80us  554.79K
----------------------------------------------------------------------------
std_mutex_simple(16thread)                                  12.78us   78.24K
folly_microspin_simple(16thread)                  98.10%    13.03us   76.75K
folly_picospin_simple(16thread)                   72.52%    17.62us   56.74K
folly_microlock_simple(16thread)                  70.12%    18.23us   54.86K
folly_sharedmutex_simple(16thread)                76.81%    16.64us   60.09K
folly_distributedmutex_simple(16thread)          113.84%    11.23us   89.06K
folly_distributedmutex_combining_simple(16threa  498.99%     2.56us  390.39K
folly_flatcombining_no_caching_simple(16thread)  193.05%     6.62us  151.04K
folly_flatcombining_caching_simple(16thread)     220.47%     5.80us  172.49K
atomics_fetch_add(16thread)                      611.70%     2.09us  478.58K
atomic_fetch_xor(16thread)                       515.51%     2.48us  403.32K
atomic_cas(16thread)                             239.86%     5.33us  187.66K
----------------------------------------------------------------------------
std_mutex_simple(32thread)                                  23.80us   42.02K
folly_microspin_simple(32thread)                  76.32%    31.18us   32.07K
folly_picospin_simple(32thread)                   48.82%    48.75us   20.51K
folly_microlock_simple(32thread)                  52.99%    44.92us   22.26K
folly_sharedmutex_simple(32thread)                54.03%    44.05us   22.70K
folly_distributedmutex_simple(32thread)          108.28%    21.98us   45.49K
folly_distributedmutex_combining_simple(32threa  697.71%     3.41us  293.15K
folly_flatcombining_no_caching_simple(32thread)  291.70%     8.16us  122.56K
folly_flatcombining_caching_simple(32thread)     412.51%     5.77us  173.32K
atomics_fetch_add(32thread)                     1074.64%     2.21us  451.52K
atomic_fetch_xor(32thread)                       577.90%     4.12us  242.81K
atomic_cas(32thread)                             193.87%    12.28us   81.46K
----------------------------------------------------------------------------
std_mutex_simple(64thread)                                  41.40us   24.16K
folly_microspin_simple(64thread)                  75.30%    54.98us   18.19K
folly_picospin_simple(64thread)                   42.87%    96.57us   10.35K
folly_microlock_simple(64thread)                  50.88%    81.37us   12.29K
folly_sharedmutex_simple(64thread)                50.08%    82.67us   12.10K
folly_distributedmutex_simple(64thread)          105.81%    39.12us   25.56K
folly_distributedmutex_combining_simple(64threa  604.86%     6.84us  146.11K
folly_flatcombining_no_caching_simple(64thread)  269.82%    15.34us   65.18K
folly_flatcombining_caching_simple(64thread)     334.78%    12.37us   80.87K
atomics_fetch_add(64thread)                     1061.21%     3.90us  256.34K
atomic_fetch_xor(64thread)                       551.00%     7.51us  133.10K
atomic_cas(64thread)                             183.75%    22.53us   44.39K
----------------------------------------------------------------------------
std_mutex_simple(128thread)                                 80.97us   12.35K
folly_microspin_simple(128thread)                 70.93%   114.16us    8.76K
folly_picospin_simple(128thread)                  32.81%   246.78us    4.05K
folly_microlock_simple(128thread)                 48.00%   168.69us    5.93K
folly_sharedmutex_simple(128thread)               49.03%   165.15us    6.06K
folly_distributedmutex_simple(128thread)         103.96%    77.88us   12.84K
folly_distributedmutex_combining_simple(128thre  460.68%    17.58us   56.90K
folly_flatcombining_no_caching_simple(128thread  211.10%    38.35us   26.07K
folly_flatcombining_caching_simple(128thread)    220.02%    36.80us   27.17K
atomics_fetch_add(128thread)                    1031.88%     7.85us  127.45K
atomic_fetch_xor(128thread)                      543.67%    14.89us   67.15K
atomic_cas(128thread)                            179.37%    45.14us   22.15K
============================================================================
*/
// folly/folly/synchronization/test/SmallLocksBenchmark.cpp