folly/folly/concurrency/test/DynamicBoundedQueueTest.cpp

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/concurrency/DynamicBoundedQueue.h>

#include <folly/MPMCQueue.h>
#include <folly/ProducerConsumerQueue.h>
#include <folly/portability/GFlags.h>
#include <folly/portability/GTest.h>

#include <glog/logging.h>

#include <atomic>
#include <iomanip>
#include <thread>

DEFINE_bool(bench, false, "run benchmark");
DEFINE_int32(reps, 10, "number of reps");
DEFINE_int32(ops, 1000000, "number of operations per rep");
DEFINE_int64(capacity, 1000000, "capacity");

template <typename T, bool MayBlock, typename WeightFn>
using DSPSC = folly::DSPSCQueue<T, MayBlock, 8, 7, WeightFn>;

template <typename T, bool MayBlock, typename WeightFn>
using DMPSC = folly::DMPSCQueue<T, MayBlock, 8, 7, WeightFn>;

template <typename T, bool MayBlock, typename WeightFn>
using DSPMC = folly::DSPMCQueue<T, MayBlock, 8, 7, WeightFn>;

template <typename T, bool MayBlock, typename WeightFn>
using DMPMC = folly::DMPMCQueue<T, MayBlock, 8, 7, WeightFn>;

template <template <typename, bool, typename> class Q, bool MayBlock>
void basic_test() {
  auto dur = std::chrono::microseconds(100);
  auto deadline = std::chrono::steady_clock::now() + dur;

  struct CustomWeightFn {
    uint64_t operator()(int val) { return val * 100; }
  };

  Q<int, MayBlock, CustomWeightFn> q(10000);

  ASSERT_TRUE(q.empty());
  ASSERT_EQ(q.size(), 0);
  int v;
  ASSERT_FALSE(q.try_dequeue(v));
  ASSERT_FALSE(q.try_dequeue().hasValue());

  q.enqueue(1);
  ASSERT_TRUE(q.try_enqueue(2));
  ASSERT_TRUE(q.try_enqueue_until(3, deadline));
  ASSERT_TRUE(q.try_enqueue_for(4, dur));
  q.enqueue(5);
  q.enqueue(6);
  q.enqueue(7);

  ASSERT_EQ(q.size(), 7);
  ASSERT_EQ(q.weight(), 2800);
  ASSERT_FALSE(q.empty());

  q.dequeue(v);
  ASSERT_EQ(v, 1);
  ASSERT_TRUE(q.try_dequeue(v));
  ASSERT_EQ(v, 2);
  ASSERT_TRUE(q.try_dequeue_until(v, deadline));
  ASSERT_EQ(v, 3);
  ASSERT_TRUE(q.try_dequeue_for(v, dur));
  ASSERT_EQ(v, 4);
  ASSERT_EQ(*q.try_dequeue(), 5);
  ASSERT_EQ(*q.try_dequeue_until(deadline), 6);
  ASSERT_EQ(*q.try_dequeue_for(dur), 7);

  ASSERT_TRUE(q.empty());
  ASSERT_EQ(q.size(), 0);
  ASSERT_EQ(q.weight(), 0);
}

TEST(DynamicBoundedQueue, basic) {
  basic_test<DSPSC, false>();
  basic_test<DMPSC, false>();
  basic_test<DSPMC, false>();
  basic_test<DMPMC, false>();
  basic_test<DSPSC, true>();
  basic_test<DMPSC, true>();
  basic_test<DSPMC, true>();
  basic_test<DMPMC, true>();
}

template <template <typename, bool, typename> class Q, bool MayBlock>
void move_test() {
  struct Foo {
    int v_;
    explicit Foo(int v) noexcept : v_(v) {}
    Foo(const Foo&) = delete;
    Foo& operator=(const Foo&) = delete;
    Foo(Foo&& other) noexcept : v_(other.v_) {}
    Foo& operator=(Foo&& other) noexcept {
      v_ = other.v_;
      return *this;
    }
  };

  struct CustomWeightFn {
    uint64_t operator()(Foo&&) { return 10; }
  };

  auto dur = std::chrono::microseconds(100);
  auto deadline = std::chrono::steady_clock::now() + dur;

  Q<Foo, MayBlock, CustomWeightFn> q(100);
  Foo v(1);
  q.enqueue(std::move(v));
  ASSERT_TRUE(q.try_enqueue(std::move(v)));
  ASSERT_TRUE(q.try_enqueue_until(std::move(v), deadline));
  ASSERT_TRUE(q.try_enqueue_for(std::move(v), dur));

  ASSERT_EQ(q.size(), 4);
  ASSERT_EQ(q.weight(), 40);
}

TEST(DynamicBoundedQueue, move) {
  move_test<DSPSC, false>();
  move_test<DMPSC, false>();
  move_test<DSPMC, false>();
  move_test<DMPMC, false>();
  move_test<DSPSC, true>();
  move_test<DMPSC, true>();
  move_test<DSPMC, true>();
  move_test<DMPMC, true>();
}

template <template <typename, bool, typename> class Q, bool MayBlock>
void capacity_test() {
  struct CustomWeightFn {
    uint64_t operator()(int val) { return val; }
  };

  Q<int, MayBlock, CustomWeightFn> q(1000);
  ASSERT_EQ(q.weight(), 0);
  int v;
  q.enqueue(100);
  ASSERT_EQ(q.weight(), 100);
  q.enqueue(300);
  ASSERT_EQ(q.weight(), 400);
  ASSERT_FALSE(q.try_enqueue(1200));
  q.reset_capacity(2000); // reset capacityy to 2000
  ASSERT_TRUE(q.try_enqueue(1200));
  ASSERT_EQ(q.weight(), 1600);
  ASSERT_EQ(q.size(), 3);
  q.reset_capacity(1000); // reset capacity back to 1000
  ASSERT_FALSE(q.try_enqueue(100));
  q.dequeue(v);
  ASSERT_EQ(v, 100);
  ASSERT_EQ(q.weight(), 1500);
  q.dequeue(v);
  ASSERT_EQ(v, 300);
  ASSERT_EQ(q.weight(), 1200);
}

TEST(DynamicBoundedQueue, capacity) {
  capacity_test<DSPSC, false>();
  capacity_test<DMPSC, false>();
  capacity_test<DSPMC, false>();
  capacity_test<DMPMC, false>();
  capacity_test<DSPSC, true>();
  capacity_test<DMPSC, true>();
  capacity_test<DSPMC, true>();
  capacity_test<DMPMC, true>();
}

template <typename ProdFunc, typename ConsFunc, typename EndFunc>
inline uint64_t run_once(
    int nprod,
    int ncons,
    const ProdFunc& prodFn,
    const ConsFunc& consFn,
    const EndFunc& endFn) {
  std::atomic<bool> start{false};
  std::atomic<int> ready{0};

  /* producers */
  std::vector<std::thread> prodThr(nprod);
  for (int tid = 0; tid < nprod; ++tid) {
    prodThr[tid] = std::thread([&, tid] {
      ++ready;
      while (!start.load()) {
        /* spin */;
      }
      prodFn(tid);
    });
  }

  /* consumers */
  std::vector<std::thread> consThr(ncons);
  for (int tid = 0; tid < ncons; ++tid) {
    consThr[tid] = std::thread([&, tid] {
      ++ready;
      while (!start.load()) {
        /* spin */;
      }
      consFn(tid);
    });
  }

  /* wait for all producers and consumers to be ready */
  while (ready.load() < (nprod + ncons)) {
    /* spin */;
  }

  /* begin time measurement */
  auto tbegin = std::chrono::steady_clock::now();
  start.store(true);

  /* wait for completion */
  for (int i = 0; i < nprod; ++i) {
    prodThr[i].join();
  }
  for (int i = 0; i < ncons; ++i) {
    consThr[i].join();
  }

  /* end time measurement */
  auto tend = std::chrono::steady_clock::now();
  endFn();
  return std::chrono::duration_cast<std::chrono::nanoseconds>(tend - tbegin)
      .count();
}

template <bool SingleProducer, bool SingleConsumer, bool MayBlock>
void enq_deq_test(const int nprod, const int ncons) {
  if (SingleProducer) {
    ASSERT_EQ(nprod, 1);
  }
  if (SingleConsumer) {
    ASSERT_EQ(ncons, 1);
  }

  int ops = 1000;
  folly::DynamicBoundedQueue<int, SingleProducer, SingleConsumer, MayBlock, 2>
      q(10);
  std::atomic<uint64_t> sum(0);

  auto prod = [&](int tid) {
    for (int i = tid; i < ops; i += nprod) {
      if ((i % 3) == 0) {
        while (!q.try_enqueue(i)) {
          /* keep trying */;
        }
      } else if ((i % 3) == 1) {
        auto dur = std::chrono::microseconds(100);
        while (!q.try_enqueue_for(i, dur)) {
          /* keep trying */;
        }
      } else {
        q.enqueue(i);
      }
    }
  };

  auto cons = [&](int tid) {
    uint64_t mysum = 0;
    for (int i = tid; i < ops; i += ncons) {
      int v;
      if ((i % 3) == 0) {
        while (!q.try_dequeue(v)) {
          /* keep trying */;
        }
      } else if ((i % 3) == 1) {
        auto dur = std::chrono::microseconds(100);
        while (!q.try_dequeue_for(v, dur)) {
          /* keep trying */;
        }
      } else {
        q.dequeue(v);
      }
      if (nprod == 1 && ncons == 1) {
        ASSERT_EQ(v, i);
      }
      mysum += v;
    }
    sum.fetch_add(mysum);
  };

  auto endfn = [&] {
    uint64_t expected = ops;
    expected *= ops - 1;
    expected /= 2;
    ASSERT_EQ(sum.load(), expected);
  };
  run_once(nprod, ncons, prod, cons, endfn);
}

TEST(DynamicBoundedQueue, enqDeq) {
  /* SPSC */
  enq_deq_test<true, true, false>(1, 1);
  enq_deq_test<true, true, true>(1, 1);
  /* MPSC */
  enq_deq_test<false, true, false>(1, 1);
  enq_deq_test<false, true, true>(1, 1);
  enq_deq_test<false, true, false>(2, 1);
  enq_deq_test<false, true, true>(2, 1);
  enq_deq_test<false, true, false>(10, 1);
  enq_deq_test<false, true, true>(10, 1);
  /* SPMC */
  enq_deq_test<true, false, false>(1, 1);
  enq_deq_test<true, false, true>(1, 1);
  enq_deq_test<true, false, false>(1, 2);
  enq_deq_test<true, false, true>(1, 2);
  enq_deq_test<true, false, false>(1, 10);
  enq_deq_test<true, false, true>(1, 10);
  /* MPMC */
  enq_deq_test<false, false, false>(1, 1);
  enq_deq_test<false, false, true>(1, 1);
  enq_deq_test<false, false, false>(2, 1);
  enq_deq_test<false, false, true>(2, 1);
  enq_deq_test<false, false, false>(10, 1);
  enq_deq_test<false, false, true>(10, 1);
  enq_deq_test<false, false, false>(1, 2);
  enq_deq_test<false, false, true>(1, 2);
  enq_deq_test<false, false, false>(1, 10);
  enq_deq_test<false, false, true>(1, 10);
  enq_deq_test<false, false, false>(2, 2);
  enq_deq_test<false, false, true>(2, 2);
  enq_deq_test<false, false, false>(10, 10);
  enq_deq_test<false, false, true>(10, 10);
}

template <typename RepFunc>
uint64_t runBench(const std::string& name, int ops, const RepFunc& repFn) {
  int reps = FLAGS_reps;
  uint64_t min = UINTMAX_MAX;
  uint64_t max = 0;
  uint64_t sum = 0;

  repFn(); // sometimes first run is outlier
  for (int r = 0; r < reps; ++r) {
    uint64_t dur = repFn();
    sum += dur;
    min = std::min(min, dur);
    max = std::max(max, dur);
    // if each rep takes too long run at least 2 reps
    const uint64_t minute = 60000000000ULL;
    if (sum > minute && r >= 1) {
      reps = r + 1;
      break;
    }
  }

  const std::string unit = " ns";
  uint64_t avg = sum / reps;
  uint64_t res = min;
  std::cout << name;
  std::cout << "   " << std::setw(4) << max / ops << unit;
  std::cout << "   " << std::setw(4) << avg / ops << unit;
  std::cout << "   " << std::setw(4) << res / ops << unit;
  std::cout << std::endl;
  return res;
}

template <template <typename, bool, typename> class Q, typename T, int Op>
uint64_t bench(const int nprod, const int ncons, const std::string& name) {
  int ops = FLAGS_ops;
  auto repFn = [&] {
    Q<T, Op == 3 || Op == 4 || Op == 5, folly::DefaultWeightFn<T>> q(
        FLAGS_capacity);
    std::atomic<uint64_t> sum(0);
    auto prod = [&](int tid) {
      for (int i = tid; i < ops; i += nprod) {
        if (Op == 0 || Op == 3) {
          while (!q.try_enqueue(i)) {
            /* keep trying */;
          }
        } else if (Op == 1 || Op == 4) {
          while (!q.try_enqueue_for(i, std::chrono::microseconds(1000))) {
            /* keep trying */;
          }
        } else {
          q.enqueue(i);
        }
      }
    };
    auto cons = [&](int tid) {
      uint64_t mysum = 0;
      T v = -1;
      for (int i = tid; i < ops; i += ncons) {
        if (Op == 0 || Op == 3) {
          while (!q.try_dequeue(v)) {
            /* keep trying */;
          }
        } else if (Op == 1 || Op == 4) {
          while (!q.try_dequeue_for(v, std::chrono::microseconds(1000))) {
            /* keep trying */;
          }
        } else {
          q.dequeue(v);
        }
        if (nprod == 1 && ncons == 1) {
          DCHECK_EQ(int(v), i);
        }
        mysum += v;
      }
      sum.fetch_add(mysum);
    };
    auto endfn = [&] {
      uint64_t expected = ops;
      expected *= ops - 1;
      expected /= 2;
      ASSERT_EQ(sum.load(), expected);
    };
    return run_once(nprod, ncons, prod, cons, endfn);
  };
  return runBench(name, ops, repFn);
}

/* For performance comparison */
template <typename T>
class MPMC {
  folly::MPMCQueue<T> q_;

 public:
  explicit MPMC(uint64_t capacity) : q_(capacity) {}

  void enqueue(const T& v) { q_.blockingWrite(v); }

  void enqueue(T&& v) { q_.blockingWrite(std::move(v)); }

  bool try_enqueue(const T& v) { return q_.write(v); }

  bool try_enqueue(const T&& v) { return q_.write(std::move(v)); }

  template <typename Rep, typename Period>
  bool try_enqueue_for(
      const T& v, const std::chrono::duration<Rep, Period>& duration) {
    return q_.tryWriteUntil(std::chrono::steady_clock::now() + duration, v);
  }

  void dequeue(T& item) { q_.blockingRead(item); }

  bool try_dequeue(T& item) { return q_.read(item); }

  template <typename Rep, typename Period>
  bool try_dequeue_for(
      T& item, const std::chrono::duration<Rep, Period>& duration) {
    return q_.tryReadUntil(std::chrono::steady_clock::now() + duration, item);
  }
};

template <typename T, bool, typename>
using FMPMC = MPMC<T>;

template <typename T>
class PCQ {
  folly::ProducerConsumerQueue<T> q_;

 public:
  explicit PCQ(uint64_t capacity) : q_(capacity) {}

  void enqueue(const T&) { ASSERT_TRUE(false); }

  bool try_enqueue(const T& v) { return q_.write(v); }

  bool try_enqueue(T&& v) { return q_.write(std::move(v)); }

  template <typename Rep, typename Period>
  bool try_enqueue_for(const T&, const std::chrono::duration<Rep, Period>&) {
    return false;
  }

  void dequeue(T&) { ASSERT_TRUE(false); }

  bool try_dequeue(T& item) { return q_.read(item); }

  template <typename Rep, typename Period>
  bool try_dequeue_for(T&, const std::chrono::duration<Rep, Period>&) {
    return false;
  }
};

template <typename T, bool, typename>
using FPCQ = PCQ<T>;

template <size_t M>
struct IntArray {
  int a[M];
  IntArray() {}
  /* implicit */ IntArray(int v) {
    for (size_t i = 0; i < M; ++i) {
      a[i] = v;
    }
  }
  operator int() { return a[0]; }
};

void dottedLine() {
  std::cout << ".............................................................."
            << std::endl;
}

template <typename T>
void type_benches(const int np, const int nc, const std::string& name) {
  std::cout << name
            << "===========================================" << std::endl;
  if (np == 1 && nc == 1) {
    bench<DSPSC, T, 0>(1, 1, "DSPSC try   spin only           ");
    bench<DSPSC, T, 1>(1, 1, "DSPSC timed spin only           ");
    bench<DSPSC, T, 2>(1, 1, "DSPSC wait  spin only           ");
    bench<DSPSC, T, 3>(1, 1, "DSPSC try   may block           ");
    bench<DSPSC, T, 4>(1, 1, "DSPSC timed may block           ");
    bench<DSPSC, T, 5>(1, 1, "DSPSC wait  may block           ");
    dottedLine();
  }
  if (nc == 1) {
    bench<DMPSC, T, 0>(np, 1, "DMPSC try   spin only           ");
    bench<DMPSC, T, 1>(np, 1, "DMPSC timed spin only           ");
    bench<DMPSC, T, 2>(np, 1, "DMPSC wait  spin only           ");
    bench<DMPSC, T, 3>(np, 1, "DMPSC try   may block           ");
    bench<DMPSC, T, 4>(np, 1, "DMPSC timed may block           ");
    bench<DMPSC, T, 5>(np, 1, "DMPSC wait  may block           ");
    dottedLine();
  }
  if (np == 1) {
    bench<DSPMC, T, 0>(1, nc, "DSPMC try   spin only           ");
    bench<DSPMC, T, 1>(1, nc, "DSPMC timed spin only           ");
    bench<DSPMC, T, 2>(1, nc, "DSPMC wait  spin only           ");
    bench<DSPMC, T, 3>(1, nc, "DSPMC try   may block           ");
    bench<DSPMC, T, 4>(1, nc, "DSPMC timed may block           ");
    bench<DSPMC, T, 5>(1, nc, "DSPMC wait  may block           ");
    dottedLine();
  }
  bench<DMPMC, T, 0>(np, nc, "DMPMC try   spin only           ");
  bench<DMPMC, T, 1>(np, nc, "DMPMC timed spin only           ");
  bench<DMPMC, T, 2>(np, nc, "DMPMC wait  spin only           ");
  bench<DMPMC, T, 3>(np, nc, "DMPMC try   may block           ");
  bench<DMPMC, T, 4>(np, nc, "DMPMC timed may block           ");
  bench<DMPMC, T, 5>(np, nc, "DMPMC wait  may block           ");
  dottedLine();
  if (np == 1 && nc == 1) {
    bench<FPCQ, T, 0>(1, 1, "folly::PCQ  read                ");
    dottedLine();
  }
  bench<FMPMC, T, 3>(np, nc, "folly::MPMC  read               ");
  bench<FMPMC, T, 4>(np, nc, "folly::MPMC  tryReadUntil       ");
  bench<FMPMC, T, 5>(np, nc, "folly::MPMC  blockingRead       ");
  std::cout << "=============================================================="
            << std::endl;
}

void benches(const int np, const int nc) {
  std::cout << "====================== " << std::setw(2) << np << " prod"
            << "  " << std::setw(2) << nc << " cons"
            << " ======================" << std::endl;
  type_benches<uint32_t>(np, nc, "=== uint32_t ======");
  // Benchmarks for other element sizes can be added as follows:
  //   type_benches<IntArray<4>>(np, nc, "=== IntArray<4> ===");
}

TEST(DynamicBoundedQueue, bench) {
  if (!FLAGS_bench) {
    return;
  }
  std::cout << "=============================================================="
            << std::endl;
  std::cout << std::setw(2) << FLAGS_reps << " reps of " << std::setw(8)
            << FLAGS_ops << " handoffs\n";
  dottedLine();
  std::cout << "Using capacity " << FLAGS_capacity << " for all queues\n";
  std::cout << "=============================================================="
            << std::endl;
  std::cout << "Test name                         Max time  Avg time  Min time"
            << std::endl;

  for (int np : {1, 8, 32}) {
    for (int nc : {1, 8, 32}) {
      benches(np, nc);
    }
  }
}

/*
$ numactl -N 1 dynamic_bounded_queue_test --bench --capacity=1000000
==============================================================
10 reps of  1000000 handoffs
..............................................................
Using capacity 1000000 for all queues
==============================================================
Test name                         Max time  Avg time  Min time
======================  1 prod   1 cons ======================
=== uint32_t =================================================
DSPSC try   spin only                 7 ns      7 ns      7 ns
DSPSC timed spin only                 9 ns      9 ns      9 ns
DSPSC wait  spin only                 7 ns      7 ns      7 ns
DSPSC try   may block                39 ns     36 ns     33 ns
DSPSC timed may block                39 ns     38 ns     37 ns
DSPSC wait  may block                37 ns     34 ns     33 ns
..............................................................
DMPSC try   spin only                54 ns     53 ns     52 ns
DMPSC timed spin only                53 ns     52 ns     51 ns
DMPSC wait  spin only                53 ns     52 ns     51 ns
DMPSC try   may block                67 ns     65 ns     64 ns
DMPSC timed may block                64 ns     62 ns     60 ns
DMPSC wait  may block                64 ns     62 ns     60 ns
..............................................................
DSPMC try   spin only                25 ns     24 ns     23 ns
DSPMC timed spin only                24 ns     23 ns     23 ns
DSPMC wait  spin only                22 ns     21 ns     21 ns
DSPMC try   may block                30 ns     26 ns     21 ns
DSPMC timed may block                25 ns     24 ns     24 ns
DSPMC wait  may block                22 ns     22 ns     21 ns
..............................................................
DMPMC try   spin only                48 ns     45 ns     39 ns
DMPMC timed spin only                31 ns     30 ns     24 ns
DMPMC wait  spin only                49 ns     47 ns     43 ns
DMPMC try   may block                63 ns     62 ns     61 ns
DMPMC timed may block                64 ns     60 ns     46 ns
DMPMC wait  may block                61 ns     60 ns     58 ns
..............................................................
folly::PCQ  read                      8 ns      7 ns      7 ns
..............................................................
folly::MPMC  read                    53 ns     51 ns     49 ns
folly::MPMC  tryReadUntil           112 ns    106 ns    103 ns
folly::MPMC  blockingRead            50 ns     47 ns     46 ns
==============================================================
======================  1 prod   8 cons ======================
=== uint32_t =================================================
DSPMC try   spin only               166 ns    159 ns    153 ns
DSPMC timed spin only               169 ns    163 ns    156 ns
DSPMC wait  spin only                60 ns     57 ns     54 ns
DSPMC try   may block               170 ns    163 ns    153 ns
DSPMC timed may block               165 ns    157 ns    150 ns
DSPMC wait  may block                94 ns     78 ns     52 ns
..............................................................
DMPMC try   spin only               170 ns    161 ns    149 ns
DMPMC timed spin only               167 ns    158 ns    149 ns
DMPMC wait  spin only                93 ns     80 ns     51 ns
DMPMC try   may block               164 ns    161 ns    154 ns
DMPMC timed may block               163 ns    156 ns    145 ns
DMPMC wait  may block               117 ns    102 ns     87 ns
..............................................................
folly::MPMC  read                   176 ns    168 ns    159 ns
folly::MPMC  tryReadUntil          1846 ns    900 ns    521 ns
folly::MPMC  blockingRead           219 ns    193 ns    178 ns
==============================================================
======================  1 prod  32 cons ======================
=== uint32_t =================================================
DSPMC try   spin only               224 ns    213 ns    204 ns
DSPMC timed spin only               215 ns    209 ns    199 ns
DSPMC wait  spin only               334 ns    114 ns     52 ns
DSPMC try   may block               240 ns    215 ns    202 ns
DSPMC timed may block               245 ns    221 ns    200 ns
DSPMC wait  may block               215 ns    151 ns     98 ns
..............................................................
DMPMC try   spin only               348 ns    252 ns    204 ns
DMPMC timed spin only               379 ns    244 ns    198 ns
DMPMC wait  spin only               173 ns    116 ns     89 ns
DMPMC try   may block               362 ns    231 ns    173 ns
DMPMC timed may block               282 ns    236 ns    206 ns
DMPMC wait  may block               252 ns    172 ns    134 ns
..............................................................
folly::MPMC  read                   540 ns    290 ns    186 ns
folly::MPMC  tryReadUntil          24946 ns   24280 ns   23113 ns
folly::MPMC  blockingRead          1345 ns   1297 ns   1265 ns
==============================================================
======================  8 prod   1 cons ======================
=== uint32_t =================================================
DMPSC try   spin only                68 ns     64 ns     60 ns
DMPSC timed spin only                69 ns     66 ns     61 ns
DMPSC wait  spin only                67 ns     65 ns     62 ns
DMPSC try   may block                77 ns     73 ns     67 ns
DMPSC timed may block                75 ns     74 ns     68 ns
DMPSC wait  may block                76 ns     73 ns     69 ns
..............................................................
DMPMC try   spin only                76 ns     66 ns     60 ns
DMPMC timed spin only                77 ns     68 ns     63 ns
DMPMC wait  spin only                68 ns     65 ns     63 ns
DMPMC try   may block                76 ns     72 ns     64 ns
DMPMC timed may block                82 ns     74 ns     67 ns
DMPMC wait  may block                77 ns     72 ns     68 ns
..............................................................
folly::MPMC  read                   170 ns    166 ns    161 ns
folly::MPMC  tryReadUntil           184 ns    179 ns    173 ns
folly::MPMC  blockingRead            79 ns     73 ns     53 ns
==============================================================
======================  8 prod   8 cons ======================
=== uint32_t =================================================
DMPMC try   spin only               181 ns    169 ns    133 ns
DMPMC timed spin only               179 ns    168 ns    148 ns
DMPMC wait  spin only                77 ns     76 ns     71 ns
DMPMC try   may block               180 ns    179 ns    176 ns
DMPMC timed may block               174 ns    166 ns    153 ns
DMPMC wait  may block                79 ns     78 ns     75 ns
..............................................................
folly::MPMC  read                   219 ns    206 ns    183 ns
folly::MPMC  tryReadUntil           262 ns    244 ns    213 ns
folly::MPMC  blockingRead            61 ns     58 ns     54 ns
==============================================================
======================  8 prod  32 cons ======================
=== uint32_t =================================================
DMPMC try   spin only               265 ns    217 ns    203 ns
DMPMC timed spin only               236 ns    215 ns    202 ns
DMPMC wait  spin only                93 ns     83 ns     77 ns
DMPMC try   may block               325 ns    234 ns    200 ns
DMPMC timed may block               206 ns    202 ns    193 ns
DMPMC wait  may block               139 ns     93 ns     76 ns
..............................................................
folly::MPMC  read                   259 ns    214 ns    201 ns
folly::MPMC  tryReadUntil           281 ns    274 ns    267 ns
folly::MPMC  blockingRead            62 ns     59 ns     57 ns
==============================================================
====================== 32 prod   1 cons ======================
=== uint32_t =================================================
DMPSC try   spin only                95 ns     57 ns     45 ns
DMPSC timed spin only                94 ns     52 ns     46 ns
DMPSC wait  spin only               104 ns     54 ns     43 ns
DMPSC try   may block                59 ns     54 ns     51 ns
DMPSC timed may block                86 ns     58 ns     52 ns
DMPSC wait  may block                76 ns     57 ns     53 ns
..............................................................
DMPMC try   spin only                68 ns     64 ns     60 ns
DMPMC timed spin only               137 ns     73 ns     61 ns
DMPMC wait  spin only                86 ns     65 ns     58 ns
DMPMC try   may block                89 ns     71 ns     65 ns
DMPMC timed may block                82 ns     69 ns     65 ns
DMPMC wait  may block                84 ns     71 ns     66 ns
..............................................................
folly::MPMC  read                   222 ns    203 ns    192 ns
folly::MPMC  tryReadUntil           239 ns    232 ns    191 ns
folly::MPMC  blockingRead            78 ns     68 ns     64 ns
==============================================================
====================== 32 prod   8 cons ======================
=== uint32_t =================================================
DMPMC try   spin only               183 ns    138 ns    107 ns
DMPMC timed spin only               237 ns    158 ns     98 ns
DMPMC wait  spin only                87 ns     70 ns     58 ns
DMPMC try   may block               169 ns    132 ns     92 ns
DMPMC timed may block               172 ns    133 ns     79 ns
DMPMC wait  may block               166 ns     89 ns     66 ns
..............................................................
folly::MPMC  read                   221 ns    194 ns    183 ns
folly::MPMC  tryReadUntil           258 ns    244 ns    230 ns
folly::MPMC  blockingRead            60 ns     54 ns     47 ns
==============================================================
====================== 32 prod  32 cons ======================
=== uint32_t =================================================
DMPMC try   spin only               419 ns    252 ns    181 ns
DMPMC timed spin only               252 ns    212 ns    179 ns
DMPMC wait  spin only               153 ns     79 ns     62 ns
DMPMC try   may block               302 ns    236 ns    182 ns
DMPMC timed may block               266 ns    213 ns    170 ns
DMPMC wait  may block               262 ns    120 ns     64 ns
..............................................................
folly::MPMC  read                   269 ns    245 ns    199 ns
folly::MPMC  tryReadUntil           257 ns    245 ns    235 ns
folly::MPMC  blockingRead            53 ns     48 ns     45 ns
==============================================================

$ numactl -N 1 dynamic_bounded_queue_test --bench --capacity=1000
==============================================================
10 reps of  1000000 handoffs
..............................................................
Using capacity 1000 for all queues
==============================================================
Test name                         Max time  Avg time  Min time
======================  1 prod   1 cons ======================
=== uint32_t =================================================
DSPSC try   spin only                 7 ns      7 ns      7 ns
DSPSC timed spin only                 9 ns      9 ns      9 ns
DSPSC wait  spin only                 7 ns      7 ns      7 ns
DSPSC try   may block                34 ns     33 ns     31 ns
DSPSC timed may block                34 ns     34 ns     33 ns
DSPSC wait  may block                30 ns     30 ns     29 ns
..............................................................
DMPSC try   spin only                60 ns     57 ns     55 ns
DMPSC timed spin only                55 ns     52 ns     51 ns
DMPSC wait  spin only                57 ns     54 ns     52 ns
DMPSC try   may block                66 ns     62 ns     39 ns
DMPSC timed may block                67 ns     64 ns     62 ns
DMPSC wait  may block                67 ns     65 ns     64 ns
..............................................................
DSPMC try   spin only                27 ns     25 ns     24 ns
DSPMC timed spin only                25 ns     25 ns     24 ns
DSPMC wait  spin only                23 ns     23 ns     22 ns
DSPMC try   may block                31 ns     26 ns     24 ns
DSPMC timed may block                33 ns     30 ns     30 ns
DSPMC wait  may block                37 ns     29 ns     28 ns
..............................................................
DMPMC try   spin only                55 ns     53 ns     51 ns
DMPMC timed spin only                36 ns     31 ns     26 ns
DMPMC wait  spin only                54 ns     53 ns     51 ns
DMPMC try   may block                68 ns     64 ns     51 ns
DMPMC timed may block                66 ns     63 ns     60 ns
DMPMC wait  may block                68 ns     63 ns     60 ns
..............................................................
folly::PCQ  read                     15 ns     13 ns     11 ns
..............................................................
folly::MPMC  read                    60 ns     56 ns     51 ns
folly::MPMC  tryReadUntil           134 ns    112 ns    102 ns
folly::MPMC  blockingRead            57 ns     51 ns     48 ns
==============================================================
======================  1 prod   8 cons ======================
=== uint32_t =================================================
DSPMC try   spin only               169 ns    162 ns    151 ns
DSPMC timed spin only               178 ns    166 ns    149 ns
DSPMC wait  spin only                59 ns     55 ns     54 ns
DSPMC try   may block               173 ns    163 ns    153 ns
DSPMC timed may block               171 ns    166 ns    156 ns
DSPMC wait  may block                71 ns     57 ns     51 ns
..............................................................
DMPMC try   spin only               172 ns    164 ns    158 ns
DMPMC timed spin only               173 ns    164 ns    156 ns
DMPMC wait  spin only                77 ns     62 ns     53 ns
DMPMC try   may block               181 ns    163 ns    152 ns
DMPMC timed may block               174 ns    165 ns    151 ns
DMPMC wait  may block                91 ns     72 ns     52 ns
..............................................................
folly::MPMC  read                   178 ns    167 ns    161 ns
folly::MPMC  tryReadUntil           991 ns    676 ns    423 ns
folly::MPMC  blockingRead           154 ns    129 ns     96 ns
==============================================================
======================  1 prod  32 cons ======================
=== uint32_t =================================================
DSPMC try   spin only               462 ns    288 ns    201 ns
DSPMC timed spin only               514 ns    283 ns    201 ns
DSPMC wait  spin only               100 ns     60 ns     45 ns
DSPMC try   may block               531 ns    318 ns    203 ns
DSPMC timed may block              1379 ns    891 ns    460 ns
DSPMC wait  may block               148 ns    111 ns     82 ns
..............................................................
DMPMC try   spin only               404 ns    312 ns    205 ns
DMPMC timed spin only               337 ns    253 ns    219 ns
DMPMC wait  spin only               130 ns     97 ns     72 ns
DMPMC try   may block               532 ns    265 ns    201 ns
DMPMC timed may block               846 ns    606 ns    412 ns
DMPMC wait  may block               158 ns    112 ns     87 ns
..............................................................
folly::MPMC  read                   880 ns    419 ns    284 ns
folly::MPMC  tryReadUntil          23432 ns   23184 ns   23007 ns
folly::MPMC  blockingRead          1353 ns   1308 ns   1279 ns
==============================================================
======================  8 prod   1 cons ======================
=== uint32_t =================================================
DMPSC try   spin only                67 ns     63 ns     51 ns
DMPSC timed spin only                69 ns     65 ns     63 ns
DMPSC wait  spin only                67 ns     65 ns     61 ns
DMPSC try   may block                73 ns     69 ns     63 ns
DMPSC timed may block                72 ns     69 ns     64 ns
DMPSC wait  may block                71 ns     70 ns     68 ns
..............................................................
DMPMC try   spin only                70 ns     64 ns     59 ns
DMPMC timed spin only                76 ns     66 ns     53 ns
DMPMC wait  spin only                68 ns     66 ns     64 ns
DMPMC try   may block                71 ns     68 ns     66 ns
DMPMC timed may block                72 ns     70 ns     67 ns
DMPMC wait  may block                73 ns     70 ns     67 ns
..............................................................
folly::MPMC  read                   193 ns    167 ns    153 ns
folly::MPMC  tryReadUntil           497 ns    415 ns    348 ns
folly::MPMC  blockingRead           163 ns    134 ns    115 ns
==============================================================
======================  8 prod   8 cons ======================
=== uint32_t =================================================
DMPMC try   spin only               216 ns    203 ns    196 ns
DMPMC timed spin only               199 ns    186 ns    178 ns
DMPMC wait  spin only                63 ns     60 ns     58 ns
DMPMC try   may block               212 ns    198 ns    183 ns
DMPMC timed may block               180 ns    170 ns    162 ns
DMPMC wait  may block                72 ns     68 ns     65 ns
..............................................................
folly::MPMC  read                   225 ns    201 ns    188 ns
folly::MPMC  tryReadUntil           255 ns    248 ns    232 ns
folly::MPMC  blockingRead            52 ns     48 ns     42 ns
==============================================================
======================  8 prod  32 cons ======================
=== uint32_t =================================================
DMPMC try   spin only               360 ns    302 ns    195 ns
DMPMC timed spin only               350 ns    272 ns    218 ns
DMPMC wait  spin only                92 ns     72 ns     61 ns
DMPMC try   may block               352 ns    263 ns    223 ns
DMPMC timed may block               218 ns    213 ns    209 ns
DMPMC wait  may block                98 ns     77 ns     70 ns
..............................................................
folly::MPMC  read                   611 ns    461 ns    339 ns
folly::MPMC  tryReadUntil           270 ns    260 ns    253 ns
folly::MPMC  blockingRead            89 ns     84 ns     80 ns
==============================================================
====================== 32 prod   1 cons ======================
=== uint32_t =================================================
DMPSC try   spin only               389 ns    248 ns    149 ns
DMPSC timed spin only               356 ns    235 ns    120 ns
DMPSC wait  spin only               343 ns    242 ns    125 ns
DMPSC try   may block               412 ns    294 ns    168 ns
DMPSC timed may block               332 ns    271 ns    189 ns
DMPSC wait  may block               280 ns    252 ns    199 ns
..............................................................
DMPMC try   spin only               393 ns    269 ns    105 ns
DMPMC timed spin only               328 ns    240 ns    112 ns
DMPMC wait  spin only               502 ns    266 ns    107 ns
DMPMC try   may block               514 ns    346 ns    192 ns
DMPMC timed may block               339 ns    318 ns    278 ns
DMPMC wait  may block               319 ns    307 ns    292 ns
..............................................................
folly::MPMC  read                   948 ns    517 ns    232 ns
folly::MPMC  tryReadUntil          9649 ns   7567 ns   4140 ns
folly::MPMC  blockingRead          1365 ns   1316 ns   1131 ns
==============================================================
====================== 32 prod   8 cons ======================
=== uint32_t =================================================
DMPMC try   spin only               436 ns    257 ns    115 ns
DMPMC timed spin only               402 ns    272 ns    121 ns
DMPMC wait  spin only               136 ns     78 ns     55 ns
DMPMC try   may block               454 ns    227 ns     78 ns
DMPMC timed may block               155 ns    137 ns    116 ns
DMPMC wait  may block                62 ns     59 ns     57 ns
..............................................................
folly::MPMC  read                   677 ns    497 ns    336 ns
folly::MPMC  tryReadUntil           268 ns    262 ns    258 ns
folly::MPMC  blockingRead            87 ns     85 ns     82 ns
==============================================================
====================== 32 prod  32 cons ======================
=== uint32_t =================================================
DMPMC try   spin only               786 ns    381 ns    142 ns
DMPMC timed spin only               795 ns    346 ns    126 ns
DMPMC wait  spin only               334 ns    107 ns     55 ns
DMPMC try   may block               535 ns    317 ns    144 ns
DMPMC timed may block               197 ns    192 ns    183 ns
DMPMC wait  may block               189 ns     75 ns     60 ns
..............................................................
folly::MPMC  read                  1110 ns    919 ns    732 ns
folly::MPMC  tryReadUntil           214 ns    210 ns    206 ns
folly::MPMC  blockingRead            53 ns     52 ns     51 ns
==============================================================

$ lscpu
Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              32
On-line CPU(s) list: 0-31
Thread(s) per core:  2
Core(s) per socket:  8
Socket(s):           2
NUMA node(s):        2
Vendor ID:           GenuineIntel
CPU family:          6
Model:               45
Model name:          Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz
Stepping:            6
CPU MHz:             2200.000
CPU max MHz:         2200.0000
CPU min MHz:         1200.0000
BogoMIPS:            4399.92
Virtualization:      VT-x
L1d cache:           32K
L1i cache:           32K
L2 cache:            256K
L3 cache:            20480K
NUMA node0 CPU(s):   0-7,16-23
NUMA node1 CPU(s):   8-15,24-31

Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr
                     pge mca cmov pat pse36 clflush dts acpi mmx fxsr
                     sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp
                     lm constant_tsc arch_perfmon pebs bts rep_good
                     nopl xtopology nonstop_tsc aperfmperf eagerfpu
                     pni pclmulqdq dtes64 monitor ds_cpl vmx smx est
                     tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2
                     x2apic popcnt tsc_deadline_timer aes xsave avx
                     lahf_lm epb tpr_shadow vnmi flexpriority ept vpid
                     xsaveopt dtherm arat pln pts
 */