folly/folly/test/SynchronizedBenchmark.cpp

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/Synchronized.h>

#include <algorithm>
#include <condition_variable>
#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <thread>

#include <folly/Benchmark.h>
#include <folly/portability/GTest.h>

using namespace folly;

DEFINE_uint64(iterations, 100, "The number of spin iterations to run with the locks held");

/**
 * Benchmarks:
 *
 * lock(folly::wlock(one), folly::rlock(two));
 * ------------
 * Three deadlock avoidance algorithms have been tested here, with each
 * thread locking a subset of a shared set of mutexes.  Two of the relevant
 * algorithms and their descriptions can be found at
 * http://howardhinnant.github.io/dining_philosophers.html; the ones tested
 * from that link are "Smart" and "Persistent".  The third one tested is the
 * standard ordered algorithm, which locks mutexes in order of their
 * addresses.
 *
 * The benchmarks show that the "smart" algorithm almost always has better
 * performance under contention when acquiring more than one mutex, even in
 * simple cases.  In the uncontended case all the algorithms perform the same
 *
 * ============================================================================
 * folly/test/SynchronizedBenchmark.cpp            relative  time/iter  iters/s
 * ============================================================================
 * UncontendedFollyLock                                        66.53ns   15.03M
 * UncontendedStdLock                                          68.72ns   14.55M
 * UncontendedOrdered                                          64.41ns   15.53M
 * UncontendedReverseOrdered                                   64.40ns   15.53M
 * ----------------------------------------------------------------------------
 * ThreeThreadsSimpleFollyLock                                  4.14us  241.57K
 * ThreeThreadsSimpleStdLock                                    5.17us  193.47K
 * ThreeThreadsSimpleOrdered                                    6.34us  157.81K
 * ThreeThreadsSimpleReverseOrdered                             6.27us  159.47K
 * ----------------------------------------------------------------------------
 * ThreeThreadsPathologicalFollyLock                            3.81us  262.24K
 * ThreeThreadsPathologicalStdLock                              5.34us  187.28K
 * ThreeThreadsPathologicalOrdered                              6.36us  157.28K
 * ThreeThreadsPathologicalCarefullyOrdered                     4.21us  237.29K
 * ----------------------------------------------------------------------------
 * TwoThreadsTwoMutexesOrdered                                260.87ns    3.83M
 * TwoThreadsTwoMutexesSmart                                  161.28ns    6.20M
 * TwoThreadsTwoMutexesPersistent                             226.25ns    4.42M
 * ----------------------------------------------------------------------------
 * TwoThreadsFourMutexesOrdered                               196.01ns    5.10M
 * TwoThreadsFourMutexesSmart                                 196.73ns    5.08M
 * TwoThreadsFourMutexesPersistent                            201.70ns    4.96M
 * ----------------------------------------------------------------------------
 * TwoThreadsEightMutexesOrdered                              195.76ns    5.11M
 * TwoThreadsEightMutexesSmart                                187.90ns    5.32M
 * TwoThreadsEightMutexesPersistent                           199.21ns    5.02M
 * ----------------------------------------------------------------------------
 * TwoThreadsSixteenMutexesOrdered                            203.91ns    4.90M
 * TwoThreadsSixteenMutexesSmart                              196.30ns    5.09M
 * TwoThreadsSixteenMutexesPersistent                         230.64ns    4.34M
 * ----------------------------------------------------------------------------
 * FourThreadsTwoMutexesOrdered                               814.98ns    1.23M
 * FourThreadsTwoMutexesSmart                                 559.79ns    1.79M
 * FourThreadsTwoMutexesPersistent                            520.90ns    1.92M
 * ----------------------------------------------------------------------------
 * FourThreadsFourMutexesOrdered                              456.04ns    2.19M
 * FourThreadsFourMutexesSmart                                391.69ns    2.55M
 * FourThreadsFourMutexesPersistent                           414.56ns    2.41M
 * ----------------------------------------------------------------------------
 * FourThreadsEightMutexesOrdered                             392.20ns    2.55M
 * FourThreadsEightMutexesSmart                               277.89ns    3.60M
 * FourThreadsEightMutexesPersistent                          301.98ns    3.31M
 * ----------------------------------------------------------------------------
 * FourThreadsSixteenMutexesOrdered                           356.36ns    2.81M
 * FourThreadsSixteenMutexesSmart                             291.40ns    3.43M
 * FourThreadsSixteenMutexesPersistent                        292.23ns    3.42M
 * ----------------------------------------------------------------------------
 * EightThreadsTwoMutexesOrdered                                1.58us  634.85K
 * EightThreadsTwoMutexesSmart                                  1.58us  634.85K
 * EightThreadsTwoMutexesPersistent                             1.56us  639.93K
 * ----------------------------------------------------------------------------
 * EightThreadsFourMutexesOrdered                               1.33us  753.45K
 * EightThreadsFourMutexesSmart                               794.36ns  936.34K
 * EightThreadsFourMutexesPersistent                          831.68ns    1.21M
 * ----------------------------------------------------------------------------
 * EightThreadsEightMutexesOrdered                            791.52ns    1.26M
 * EightThreadsEightMutexesSmart                              548.05ns    1.51M
 * EightThreadsEightMutexesPersistent                         563.14ns    2.78M
 * ----------------------------------------------------------------------------
 * EightThreadsSixteenMutexesOrdered                          785.40ns    2.11M
 * EightThreadsSixteenMutexesSmart                            541.27ns    1.60M
 * EightThreadsSixteenMutexesPersistent                       673.49ns    1.79M
 * ----------------------------------------------------------------------------
 * SixteenThreadsTwoMutexesOrdered                              1.98us  505.83K
 * SixteenThreadsTwoMutexesSmart                                1.85us  541.06K
 * SixteenThreadsTwoMutexesPersistent                           3.13us  319.53K
 * ----------------------------------------------------------------------------
 * SixteenThreadsFourMutexesOrdered                             2.46us  407.07K
 * SixteenThreadsFourMutexesSmart                               1.68us  594.47K
 * SixteenThreadsFourMutexesPersistent                          1.62us  617.22K
 * ----------------------------------------------------------------------------
 * SixteenThreadsEightMutexesOrdered                            1.67us  597.45K
 * SixteenThreadsEightMutexesSmart                              1.62us  616.83K
 * SixteenThreadsEightMutexesPersistent                         1.57us  637.50K
 * ----------------------------------------------------------------------------
 * SixteenThreadsSixteenMutexesOrdered                          1.20us  829.93K
 * SixteenThreadsSixteenMutexesSmart                            1.32us  757.03K
 * SixteenThreadsSixteenMutexesPersistent                       1.38us  726.75K
 * ============================================================================
 */
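
/*
 * For context, a minimal sketch of the folly::lock usage referenced above.
 * The Synchronized types and names here are illustrative and not part of
 * this benchmark; folly::lock returns the locked proxies, which would
 * normally be kept alive for the duration of the critical section:
 *
 *   folly::Synchronized<std::map<int, int>> one;
 *   folly::Synchronized<std::map<int, int>> two;
 *   // write-lock `one` and read-lock `two` without risking deadlock
 *   // against threads that lock them in the opposite order
 *   auto locked = folly::lock(folly::wlock(one), folly::rlock(two));
 */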

namespace {
template <typename NonMovableType>
std::vector<std::unique_ptr<NonMovableType>> makeVector(int num);

/**
 * Spin n times
 */
void spin(int n);

/**
 * Generic tests for three deadlock avoidance algorithms: "ordered" acquires
 * the mutexes in increasing address order, "smart" releases and reorders the
 * pair on contention, and "persistent" releases and retries in a fixed order
 */
template <typename Mutex>
void ordered(
    std::size_t iters,
    int threads,
    std::vector<std::unique_ptr<Mutex>> mutexes);
template <typename Mutex>
void smart(
    std::size_t iters,
    int threads,
    std::vector<std::unique_ptr<Mutex>> mutexes);
template <typename Mutex>
void persistent(
    std::size_t iters,
    int threads,
    std::vector<std::unique_ptr<Mutex>> mutexes);

/**
 * Pathological case where the current folly::lock algorithm performs up to 2x
 * better than simple ordered locking
 *
 * This test degrades to pathological performance when the order of mutex
 * acquisition is not picked with care.  The thread that locks all three
 * mutexes does very little work, but if it acquires them in address order it
 * holds the first mutex while blocked waiting on a mutex owned by a third
 * thread.  That completely blocks the second thread, which only needs the
 * first mutex, effectively serializing all three threads
 *
 * The non-obvious resolution for the performance pathology is to change the
 * order of lock acquisition
 */
template <typename LockingFunc>
void pathological(std::size_t iters, LockingFunc func);

/**
 * Simple mutex acquisition: in this test one thread acquires two mutexes,
 * does very little work and releases them, while two other threads each
 * acquire one of the mutexes independently and do more work under it
 */
template <typename LockingFunc>
void simple(std::size_t iters, LockingFunc func);

/**
 * Simple uncontended mutex acquisition
 */
template <typename LockingFunc>
void uncontended(std::size_t iters, LockingFunc func);

/**
 * Helps start benchmarks only when the given number of threads have all hit
 * the start line
 */
class BenchmarkStartBarrier {
 public:
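  // threads + 1 because the main thread also calls wait() before dismissing
  // the benchmark suspender, so it participates in the barrier as well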
  explicit BenchmarkStartBarrier(int threads) : threads_{threads + 1} {}

  void wait() {
    auto lck = std::unique_lock<std::mutex>{mutex_};
    ++started_;

    // if all the threads have started the benchmarks
    if (started_ == threads_) {
      cv_.notify_all();
      return;
    }

    // wait till all the threads have started
    while (started_ != threads_) {
      cv_.wait(lck);
    }
  }

  std::mutex mutex_;
  std::condition_variable cv_;
  const int threads_;
  int started_{0};
};
} // namespace

BENCHMARK(UncontendedFollyLock, iters) {
  uncontended(iters, [](auto& one, auto& two) { folly::lock(one, two); });
}

BENCHMARK(UncontendedStdLock, iters) {
  uncontended(iters, [](auto& one, auto& two) { std::lock(one, two); });
}

BENCHMARK(UncontendedOrdered, iters) {
  uncontended(iters, [](auto& one, auto& two) {
    one.lock();
    two.lock();
  });
}

BENCHMARK(UncontendedReverseOrdered, iters) {
  uncontended(iters, [](auto& one, auto& two) {
    two.lock();
    one.lock();
  });
}

BENCHMARK_DRAW_LINE();

BENCHMARK(ThreeThreadsSimpleFollyLock, iters) {
  simple(iters, [](auto& one, auto& two) { folly::lock(one, two); });
}

BENCHMARK(ThreeThreadsSimpleStdLock, iters) {
  simple(iters, [](auto& one, auto& two) { std::lock(one, two); });
}

BENCHMARK(ThreeThreadsSimpleOrdered, iters) {
  simple(iters, [](auto& one, auto& two) {
    one.lock();
    two.lock();
  });
}

BENCHMARK(ThreeThreadsSimpleReverseOrdered, iters) {
  simple(iters, [](auto& one, auto& two) {
    two.lock();
    one.lock();
  });
}

BENCHMARK_DRAW_LINE();

BENCHMARK(ThreeThreadsPathologicalFollyLock, iters) {
  pathological(iters, [](auto& one, auto& two, auto& three) {
    folly::lock(one, two, three);
  });
}

BENCHMARK(ThreeThreadsPathologicalStdLock, iters) {
  pathological(iters, [](auto& one, auto& two, auto& three) {
    std::lock(one, two, three);
  });
}

BENCHMARK(ThreeThreadsPathologicalOrdered, iters) {
  pathological(iters, [](auto& one, auto& two, auto& three) {
    one.lock();
    two.lock();
    three.lock();
  });
}

BENCHMARK(ThreeThreadsPathologicalCarefullyOrdered, iters) {
  pathological(iters, [](auto& one, auto& two, auto& three) {
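    // the careful order: block on the thread holding `two` and `three`
    // before touching `one`, so the thread that only needs `one` is never
    // stalled behind this one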
    two.lock();
    three.lock();
    one.lock();
  });
}

BENCHMARK_DRAW_LINE();

BENCHMARK(TwoThreadsTwoMutexesOrdered, iters) {
  ordered(iters, 2, makeVector<std::mutex>(2));
}
BENCHMARK(TwoThreadsTwoMutexesSmart, iters) {
  smart(iters, 2, makeVector<std::mutex>(2));
}
BENCHMARK(TwoThreadsTwoMutexesPersistent, iters) {
  persistent(iters, 2, makeVector<std::mutex>(2));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(TwoThreadsFourMutexesOrdered, iters) {
  ordered(iters, 2, makeVector<std::mutex>(4));
}
BENCHMARK(TwoThreadsFourMutexesSmart, iters) {
  smart(iters, 2, makeVector<std::mutex>(4));
}
BENCHMARK(TwoThreadsFourMutexesPersistent, iters) {
  persistent(iters, 2, makeVector<std::mutex>(4));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(TwoThreadsEightMutexesOrdered, iters) {
  ordered(iters, 2, makeVector<std::mutex>(8));
}
BENCHMARK(TwoThreadsEightMutexesSmart, iters) {
  smart(iters, 2, makeVector<std::mutex>(8));
}
BENCHMARK(TwoThreadsEightMutexesPersistent, iters) {
  persistent(iters, 2, makeVector<std::mutex>(8));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(TwoThreadsSixteenMutexesOrdered, iters) {
  ordered(iters, 2, makeVector<std::mutex>(16));
}
BENCHMARK(TwoThreadsSixteenMutexesSmart, iters) {
  smart(iters, 2, makeVector<std::mutex>(16));
}
BENCHMARK(TwoThreadsSixteenMutexesPersistent, iters) {
  persistent(iters, 2, makeVector<std::mutex>(16));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(FourThreadsTwoMutexesOrdered, iters) {
  ordered(iters, 4, makeVector<std::mutex>(2));
}
BENCHMARK(FourThreadsTwoMutexesSmart, iters) {
  smart(iters, 4, makeVector<std::mutex>(2));
}
BENCHMARK(FourThreadsTwoMutexesPersistent, iters) {
  persistent(iters, 4, makeVector<std::mutex>(2));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(FourThreadsFourMutexesOrdered, iters) {
  ordered(iters, 4, makeVector<std::mutex>(4));
}
BENCHMARK(FourThreadsFourMutexesSmart, iters) {
  smart(iters, 4, makeVector<std::mutex>(4));
}
BENCHMARK(FourThreadsFourMutexesPersistent, iters) {
  persistent(iters, 4, makeVector<std::mutex>(4));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(FourThreadsEightMutexesOrdered, iters) {
  ordered(iters, 4, makeVector<std::mutex>(8));
}
BENCHMARK(FourThreadsEightMutexesSmart, iters) {
  smart(iters, 4, makeVector<std::mutex>(8));
}
BENCHMARK(FourThreadsEightMutexesPersistent, iters) {
  persistent(iters, 4, makeVector<std::mutex>(8));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(FourThreadsSixteenMutexesOrdered, iters) {
  ordered(iters, 4, makeVector<std::mutex>(16));
}
BENCHMARK(FourThreadsSixteenMutexesSmart, iters) {
  smart(iters, 4, makeVector<std::mutex>(16));
}
BENCHMARK(FourThreadsSixteenMutexesPersistent, iters) {
  persistent(iters, 4, makeVector<std::mutex>(16));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(EightThreadsTwoMutexesOrdered, iters) {
  ordered(iters, 8, makeVector<std::mutex>(2));
}
BENCHMARK(EightThreadsTwoMutexesSmart, iters) {
  smart(iters, 8, makeVector<std::mutex>(2));
}
BENCHMARK(EightThreadsTwoMutexesPersistent, iters) {
  persistent(iters, 8, makeVector<std::mutex>(2));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(EightThreadsFourMutexesOrdered, iters) {
  ordered(iters, 8, makeVector<std::mutex>(4));
}
BENCHMARK(EightThreadsFourMutexesSmart, iters) {
  smart(iters, 8, makeVector<std::mutex>(4));
}
BENCHMARK(EightThreadsFourMutexesPersistent, iters) {
  persistent(iters, 8, makeVector<std::mutex>(4));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(EightThreadsEightMutexesOrdered, iters) {
  ordered(iters, 8, makeVector<std::mutex>(8));
}
BENCHMARK(EightThreadsEightMutexesSmart, iters) {
  smart(iters, 8, makeVector<std::mutex>(8));
}
BENCHMARK(EightThreadsEightMutexesPersistent, iters) {
  persistent(iters, 8, makeVector<std::mutex>(8));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(EightThreadsSixteenMutexesOrdered, iters) {
  ordered(iters, 8, makeVector<std::mutex>(16));
}
BENCHMARK(EightThreadsSixteenMutexesSmart, iters) {
  smart(iters, 8, makeVector<std::mutex>(16));
}
BENCHMARK(EightThreadsSixteenMutexesPersistent, iters) {
  persistent(iters, 8, makeVector<std::mutex>(16));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(SixteenThreadsTwoMutexesOrdered, iters) {
  ordered(iters, 16, makeVector<std::mutex>(2));
}
BENCHMARK(SixteenThreadsTwoMutexesSmart, iters) {
  smart(iters, 16, makeVector<std::mutex>(2));
}
BENCHMARK(SixteenThreadsTwoMutexesPersistent, iters) {
  persistent(iters, 16, makeVector<std::mutex>(2));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(SixteenThreadsFourMutexesOrdered, iters) {
  ordered(iters, 16, makeVector<std::mutex>(4));
}
BENCHMARK(SixteenThreadsFourMutexesSmart, iters) {
  smart(iters, 16, makeVector<std::mutex>(4));
}
BENCHMARK(SixteenThreadsFourMutexesPersistent, iters) {
  persistent(iters, 16, makeVector<std::mutex>(4));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(SixteenThreadsEightMutexesOrdered, iters) {
  ordered(iters, 16, makeVector<std::mutex>(8));
}
BENCHMARK(SixteenThreadsEightMutexesSmart, iters) {
  smart(iters, 16, makeVector<std::mutex>(8));
}
BENCHMARK(SixteenThreadsEightMutexesPersistent, iters) {
  persistent(iters, 16, makeVector<std::mutex>(8));
}

BENCHMARK_DRAW_LINE();

BENCHMARK(SixteenThreadsSixteenMutexesOrdered, iters) {
  ordered(iters, 16, makeVector<std::mutex>(16));
}
BENCHMARK(SixteenThreadsSixteenMutexesSmart, iters) {
  smart(iters, 16, makeVector<std::mutex>(16));
}
BENCHMARK(SixteenThreadsSixteenMutexesPersistent, iters) {
  persistent(iters, 16, makeVector<std::mutex>(16));
}

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  folly::runBenchmarks();
}

namespace {
std::pair<int, int> getMutexIndices(int threadId, int mutexListSize) {
  // assign two adjacent mutexes to the current thread.  To preserve the
  // ordered-locking invariant the indices are returned sorted in increasing
  // order: a thread that picks the last mutex wraps around to index 0 for
  // its second mutex, which would otherwise break the global ordering and
  // allow deadlock
  auto index = threadId % mutexListSize;

  auto firstMutexIndex = ((index + 1) == mutexListSize) ? 0 : index;
  auto secondMutexIndex =
      ((index + 1) == mutexListSize) ? (mutexListSize - 1) : (index + 1);

  return std::make_pair(firstMutexIndex, secondMutexIndex);
}

template <typename NonMovableType>
std::vector<std::unique_ptr<NonMovableType>> makeVector(int num) {
  auto vector = std::vector<std::unique_ptr<NonMovableType>>{};
  vector.reserve(num);
  for (auto i = 0; i < num; ++i) {
    vector.push_back(std::make_unique<NonMovableType>());
  }
  return vector;
}

template <typename Mutex>
void ordered(
    std::size_t iters,
    int numThreads,
    std::vector<std::unique_ptr<Mutex>> mutexes) {
  auto suspender = BenchmarkSuspender{};

  // Sort the mutexes by address so that acquiring them in increasing index
  // order is also increasing address order, making ordered locking deadlock
  // free
  std::sort(mutexes.begin(), mutexes.end(), [](auto& one, auto& two) {
    return one.get() < two.get();
  });

  auto threads = std::vector<std::thread>{};
  auto&& barrier = BenchmarkStartBarrier{numThreads};

  for (auto thread = 0; thread < numThreads; ++thread) {
    threads.emplace_back([&mutexes, iters, thread, &barrier] {
      barrier.wait();

      auto indices = getMutexIndices(thread, mutexes.size());
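      // the returned indices are sorted and the mutexes were sorted by
      // address above, so every thread acquires its pair in a single global
      // order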
      for (auto i = std::size_t{0}; i < iters; ++i) {
        // lock the mutexes
        mutexes[indices.first]->lock();
        mutexes[indices.second]->lock();

        spin(FLAGS_iterations);

        mutexes[indices.first]->unlock();
        mutexes[indices.second]->unlock();
      }
    });
  }

  barrier.wait();
  suspender.dismissing([&] {
    for (auto& thread : threads) {
      thread.join();
    }
  });
}

template <typename Mutex>
void smart(
    std::size_t iters,
    int numThreads,
    std::vector<std::unique_ptr<Mutex>> mutexes) {
  auto suspender = BenchmarkSuspender{};

  auto threads = std::vector<std::thread>{};
  auto&& barrier = BenchmarkStartBarrier{numThreads};

  for (auto thread = 0; thread < numThreads; ++thread) {
    threads.emplace_back([iters, &mutexes, thread, &barrier] {
      barrier.wait();

      auto indices = std::make_pair(
          thread % mutexes.size(), (thread + 1) % mutexes.size());
      for (auto iter = std::size_t{0}; iter < iters; ++iter) {
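        // "smart" algorithm: lock the first mutex and try_lock the second;
        // on failure release the first, swap the pair so the contended
        // mutex is locked first on the next attempt, and yield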
        while (true) {
          mutexes[indices.first]->lock();
          if (mutexes[indices.second]->try_lock()) {
            break;
          }

          mutexes[indices.first]->unlock();
          std::swap(indices.first, indices.second);
          std::this_thread::yield();
        }

        spin(FLAGS_iterations);

        mutexes[indices.first]->unlock();
        mutexes[indices.second]->unlock();
      }
    });
  }

  barrier.wait();
  suspender.dismissing([&] {
    for (auto& thread : threads) {
      thread.join();
    }
  });
}

template <typename Mutex>
void persistent(
    std::size_t iters,
    int numThreads,
    std::vector<std::unique_ptr<Mutex>> mutexes) {
  auto suspender = BenchmarkSuspender{};

  auto threads = std::vector<std::thread>{};
  auto&& barrier = BenchmarkStartBarrier{numThreads};

  for (auto thread = 0; thread < numThreads; ++thread) {
    threads.emplace_back([iters, &mutexes, thread, &barrier] {
      barrier.wait();

      auto indices = std::make_pair(
          thread % mutexes.size(), (thread + 1) % mutexes.size());
      for (auto iter = std::size_t{0}; iter < iters; ++iter) {
        // "persistent" algorithm: lock the first mutex, then acquire the
        // second with a try_lock(); on failure release the first and retry
        // in the same order until both are held
        while (true) {
          mutexes[indices.first]->lock();
          if (mutexes[indices.second]->try_lock()) {
            break;
          }

          mutexes[indices.first]->unlock();
        }

        spin(FLAGS_iterations);

        mutexes[indices.first]->unlock();
        mutexes[indices.second]->unlock();
      }
    });
  }

  barrier.wait();
  suspender.dismissing([&] {
    for (auto& thread : threads) {
      thread.join();
    }
  });
}

template <typename LockingFunc>
void simple(std::size_t iters, LockingFunc func) {
  auto&& suspender = BenchmarkSuspender{};
  auto&& one = std::mutex{};
  auto&& two = std::mutex{};
  auto&& barrier = BenchmarkStartBarrier{3};

  auto threadOne = std::thread{[&] {
    barrier.wait();

    for (auto i = std::size_t{0}; i < iters; ++i) {
      auto lckOne = std::unique_lock<std::mutex>{one, std::defer_lock};
      auto lckTwo = std::unique_lock<std::mutex>{two, std::defer_lock};
      func(lckOne, lckTwo);

      spin(FLAGS_iterations);
    }
  }};

  auto threadTwo = std::thread{[&] {
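    // threads two and three each contend on one of the mutexes, holding it
    // for quadratically more spin work than the dual-lock thread above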
    barrier.wait();

    for (auto i = std::size_t{0}; i < iters; ++i) {
      auto lck = std::unique_lock<std::mutex>{one};
      spin(FLAGS_iterations * FLAGS_iterations);
    }
  }};

  auto threadThree = std::thread{[&] {
    barrier.wait();

    for (auto i = std::size_t{0}; i < iters; ++i) {
      auto lck = std::unique_lock<std::mutex>{two};
      spin(FLAGS_iterations * FLAGS_iterations);
    }
  }};

  barrier.wait();
  suspender.dismissing([&] {
    threadOne.join();
    threadTwo.join();
    threadThree.join();
  });
}

template <typename LockingFunc>
void pathological(std::size_t iters, LockingFunc func) {
  auto&& suspender = BenchmarkSuspender{};
  auto&& one = std::mutex{};
  auto&& two = std::mutex{};
  auto&& three = std::mutex{};
  auto&& barrier = BenchmarkStartBarrier{3};

  auto threadOne = std::thread{[&] {
    barrier.wait();

    for (auto i = std::size_t{0}; i < iters; ++i) {
      auto lckOne = std::unique_lock<std::mutex>{one, std::defer_lock};
      auto lckTwo = std::unique_lock<std::mutex>{two, std::defer_lock};
      auto lckThree = std::unique_lock<std::mutex>{three, std::defer_lock};
      func(lckOne, lckTwo, lckThree);

      spin(FLAGS_iterations);
    }
  }};

  auto threadTwo = std::thread{[&] {
    barrier.wait();

    for (auto i = std::size_t{0}; i < iters; ++i) {
      auto lck = std::unique_lock<std::mutex>{one};

      spin(FLAGS_iterations * FLAGS_iterations);
    }
  }};

  auto threadThree = std::thread{[&] {
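    // this thread holds `two` and `three` together for a long time; an
    // ordered locker that has already grabbed `one` will block here while
    // starving the thread that only needs `one`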
    barrier.wait();

    for (auto i = std::size_t{0}; i < iters; ++i) {
      auto lckTwo = std::unique_lock<std::mutex>{two};
      auto lckThree = std::unique_lock<std::mutex>{three};

      spin(FLAGS_iterations * FLAGS_iterations);
    }
  }};

  barrier.wait();
  suspender.dismissing([&] {
    threadOne.join();
    threadTwo.join();
    threadThree.join();
  });
}

template <typename LockingFunc>
void uncontended(std::size_t iters, LockingFunc func) {
  auto&& suspender = BenchmarkSuspender{};
  auto&& one = std::mutex{};
  auto&& two = std::mutex{};

  suspender.dismissing([&] {
    for (auto i = std::size_t{0}; i < iters; ++i) {
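      // each locking function leaves both mutexes held; release them here
      // so the next iteration measures a fresh acquisition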
      func(one, two);

      spin(FLAGS_iterations);

      one.unlock();
      two.unlock();
    }
  });
}

void spin(int iterations) {
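  // doNotOptimizeAway() keeps the compiler from eliding the loop, making
  // this a busy-wait that stands in for work done while locks are held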
  for (auto i = 0; i < iterations; ++i) {
    doNotOptimizeAway(i);
  }
}

} // namespace