/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/concurrency/CacheLocality.h>

#include <array>
#include <atomic>
#include <cassert>
#include <memory>
#include <new>
#include <thread>
#include <unordered_map>
#include <vector>

#include <glog/logging.h>

#include <folly/Benchmark.h>
#include <folly/lang/Keep.h>

using namespace folly;

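// These functions are kept in the compiled output via FOLLY_KEEP so the
// code generated for current() and cachedCurrent() can be inspected
// directly in the binary.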
extern "C" FOLLY_KEEP size_t
check_access_spreader_atomic_current(size_t numStripes) {
  return AccessSpreader<>::current(numStripes);
}

extern "C" FOLLY_KEEP size_t
check_access_spreader_atomic_cached_current(size_t numStripes) {
  return AccessSpreader<>::cachedCurrent(numStripes);
}

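// Defines a tag type and specializes CacheLocality::system() and
// pickGetcpuFunc() for it, so that AccessSpreader<tag> runs against the
// real system topology but with the given getcpu implementation.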
#define DECLARE_SPREADER_TAG(tag, locality, func)      \
  namespace {                                          \
  template <typename dummy>                            \
  struct tag {};                                       \
  }                                                    \
  namespace folly {                                    \
  template <>                                          \
  const CacheLocality& CacheLocality::system<tag>() {  \
    static auto* inst = new CacheLocality(locality);   \
    return *inst;                                      \
  }                                                    \
  template <>                                          \
  Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
    return func;                                       \
  }                                                    \
  template struct AccessSpreader<tag>;                 \
  }

DECLARE_SPREADER_TAG(
    ThreadLocalTag,
    CacheLocality::system<>(),
    folly::FallbackGetcpu<SequentialThreadId>::getcpu)
DECLARE_SPREADER_TAG(
    PthreadSelfTag,
    CacheLocality::system<>(),
    folly::FallbackGetcpu<HashingThreadId>::getcpu)

// A tag whose current() forwards to cachedCurrent(), so the generic
// contentionAtWidth() benchmark below can exercise the cached path.
namespace {
template <typename dummy>
struct CachedCurrentTag {};
} // namespace
namespace folly {
template <>
const CacheLocality& CacheLocality::system<CachedCurrentTag>() {
  return CacheLocality::system<>();
}
template <>
size_t AccessSpreader<CachedCurrentTag>::current(
    size_t numStripes, const GlobalState& state) {
  auto& alter = reinterpret_cast<const AccessSpreader::GlobalState&>(state);
  return AccessSpreader::cachedCurrent(numStripes, alter);
}
} // namespace folly

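// Cost of stripe selection alone: current() consults the getcpu function on
// every call, while cachedCurrent() amortizes it with a cached per-thread
// value.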
BENCHMARK(AccessSpreaderUse, iters) {
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::current(16);
    folly::doNotOptimizeAway(x);
  }
}

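// Same lookup, with the global state fetched once outside the measured loop.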
BENCHMARK(StateAccessSpreaderUse, iters) {
  auto& state = AccessSpreader<>::state();
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::current(16, state);
    folly::doNotOptimizeAway(x);
  }
}

BENCHMARK(CachedAccessSpreaderUse, iters) {
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(16);
    folly::doNotOptimizeAway(x);
  }
}

BENCHMARK(StateCachedAccessSpreaderUse, iters) {
  auto& state = AccessSpreader<>::state();
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(16, state);
    folly::doNotOptimizeAway(x);
  }
}

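// Baselines: an uncontended atomic increment on its own, and combined with
// cached stripe selection into a 64-entry counter array.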
BENCHMARK(BaselineAtomicIncrement, iters) {
  std::atomic<int> value(0);
  for (unsigned long i = 0; i < iters; ++i) {
    ++value;
    folly::doNotOptimizeAway(value);
  }
}

BENCHMARK(CachedAccessSpreaderAtomicIncrement, iters) {
  std::array<std::atomic<int>, 64> values{};
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(64);
    ++values[x];
    folly::doNotOptimizeAway(values[x]);
  }
}

BENCHMARK(StateCachedAccessSpreaderAtomicIncrement, iters) {
  auto& state = AccessSpreader<>::state();
  std::array<std::atomic<int>, 64> values{};
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(64, state);
    ++values[x];
    folly::doNotOptimizeAway(values[x]);
  }
}

// Benchmark scores here reflect the time for 32 threads to perform an
// atomic increment on a dual-socket E5-2660 @ 2.2GHz.  Surprisingly,
// if we don't separate the counters onto unique 128-byte stripes the
// 1_stripe and 2_stripe results are identical, even though the L3 is
// claimed to have 64-byte cache lines.
//
// Getcpu refers to the vdso getcpu implementation.  ThreadLocal refers
// to execution using SequentialThreadId, the fallback if the vdso
// getcpu isn't available.  PthreadSelf hashes the value returned from
// getCurrentThreadID() as a fallback-fallback for systems that don't have
// thread-local support.
//
// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
// so with stripe selection costing about 12 nanos, the atomic increment
// in the L1 comes to ~17 nanos.  At width 8_stripe_0_work the line is
// expected to ping-pong on almost every operation, since the loops have
// the same duration.  Widths 4 and 2 have the same behavior, but each
// tour of the cache line is 4 and 8 cores long, respectively.  These all
// suggest a lower bound of 60 nanos for intra-chip handoff and increment
// between the L1s.
//
// With 420 nanos of busywork per contended increment, the system can
// hide all of the latency of a tour of length 4, but not quite one of
// length 8.  I was a bit surprised at how much worse the non-striped
// version got.  It seems that the inter-chip traffic also interferes
// with the L1-only localWork.load().  When the local work is doubled
// to about 1 microsecond we see that the inter-chip contention is still
// very important, but subdivisions on the same chip don't matter.
//
// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
//     --benchmark --bm_min_iters=1000000
// ============================================================================
// folly/concurrency/test/CacheLocalityBenchmark.cpprelative  time/iter  iters/s
// ============================================================================
// AccessSpreaderUse                                           11.51ns   86.87M
// CachedAccessSpreaderUse                                      1.98ns  490.03M
// BaselineAtomicIncrement                                     10.37ns   96.43M
// CachedAccessSpreaderAtomicIncrement                         11.43ns   87.50M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_0_work)                   993.13ns    1.01M
// contentionAtWidthGetcpu(2_stripe_0_work)                   551.45ns    1.81M
// contentionAtWidthGetcpu(4_stripe_0_work)                   302.36ns    3.31M
// contentionAtWidthGetcpu(8_stripe_0_work)                   156.57ns    6.39M
// contentionAtWidthGetcpu(16_stripe_0_work)                   81.34ns   12.29M
// contentionAtWidthGetcpu(32_stripe_0_work)                   37.90ns   26.39M
// contentionAtWidthGetcpu(64_stripe_0_work)                   36.02ns   27.76M
// contentionAtWidthCached(2_stripe_0_work)                   310.64ns    3.22M
// contentionAtWidthCached(4_stripe_0_work)                   180.41ns    5.54M
// contentionAtWidthCached(8_stripe_0_work)                    87.84ns   11.38M
// contentionAtWidthCached(16_stripe_0_work)                   45.04ns   22.20M
// contentionAtWidthCached(32_stripe_0_work)                   19.92ns   50.20M
// contentionAtWidthCached(64_stripe_0_work)                   19.21ns   52.06M
// contentionAtWidthThreadLocal(2_stripe_0_work)              321.14ns    3.11M
// contentionAtWidthThreadLocal(4_stripe_0_work)              244.41ns    4.09M
// contentionAtWidthThreadLocal(8_stripe_0_work)              103.47ns    9.66M
// contentionAtWidthThreadLocal(16_stripe_0_work)              79.82ns   12.53M
// contentionAtWidthThreadLocal(32_stripe_0_work)              20.41ns   49.01M
// contentionAtWidthThreadLocal(64_stripe_0_work)              22.13ns   45.18M
// contentionAtWidthPthreadSelf(2_stripe_0_work)              373.46ns    2.68M
// contentionAtWidthPthreadSelf(4_stripe_0_work)              208.18ns    4.80M
// contentionAtWidthPthreadSelf(8_stripe_0_work)              105.99ns    9.43M
// contentionAtWidthPthreadSelf(16_stripe_0_work)             105.67ns    9.46M
// contentionAtWidthPthreadSelf(32_stripe_0_work)              76.01ns   13.16M
// contentionAtWidthPthreadSelf(64_stripe_0_work)              76.04ns   13.15M
// atomicIncrBaseline(local_incr_0_work)                       13.43ns   74.47M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_500_work)                   1.76us  567.20K
// contentionAtWidthGetcpu(2_stripe_500_work)                   1.16us  863.67K
// contentionAtWidthGetcpu(4_stripe_500_work)                 604.74ns    1.65M
// contentionAtWidthGetcpu(8_stripe_500_work)                 524.16ns    1.91M
// contentionAtWidthGetcpu(16_stripe_500_work)                478.92ns    2.09M
// contentionAtWidthGetcpu(32_stripe_500_work)                480.64ns    2.08M
// atomicIncrBaseline(local_incr_500_work)                    395.17ns    2.53M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_1000_work)                  2.16us  462.06K
// contentionAtWidthGetcpu(2_stripe_1000_work)                  1.31us  764.80K
// contentionAtWidthGetcpu(4_stripe_1000_work)                895.33ns    1.12M
// contentionAtWidthGetcpu(8_stripe_1000_work)                833.98ns    1.20M
// contentionAtWidthGetcpu(16_stripe_1000_work)               765.10ns    1.31M
// contentionAtWidthGetcpu(32_stripe_1000_work)               646.85ns    1.55M
// atomicIncrBaseline(local_incr_1000_work)                   656.15ns    1.52M
// ============================================================================
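// Launches numThreads threads that each repeatedly increment one of
// `stripes` atomic counters chosen via AccessSpreader<Tag>, then spin on
// `work` local loads; more stripes means less contention per counter.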
template <template <typename> class Tag>
static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
  const size_t counterAlignment = 128;
  const size_t numThreads = 32;

  folly::BenchmarkSuspender braces;

  std::atomic<size_t> ready(0);
  std::atomic<bool> go(false);

  // while in theory the cache line size is 64 bytes, experiments show
  // that we get contention on 128-byte boundaries for Ivy Bridge.  The
  // extra indirection adds 1 or 2 nanos.
  assert(counterAlignment >= sizeof(std::atomic<size_t>));
  std::vector<char> raw(counterAlignment * stripes);

  // if we happen to be using the tlsRoundRobin, then sequentially
  // assigning the thread identifiers is the unlikely best-case scenario.
  // We don't want to unfairly benefit or penalize.  Computing the exact
  // maximum likelihood of the probability distributions is annoying, so
  // I approximate as 2/5 of the ids that have no threads, 2/5 that have
  // 1, 2/15 that have 2, and 1/15 that have 3.  We accomplish this by
  // wrapping back to slot 0 when we hit 1/15 and 1/5.

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&, iters, stripes, work]() {
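      // placement-new the shared stripe counters into the raw buffer,
      // spaced counterAlignment bytes apart so each stripe has its own line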
      auto counters = std::vector<std::atomic<size_t>*>(stripes);
      for (size_t i = 0; i < stripes; ++i) {
        counters[i] =
            new (raw.data() + counterAlignment * i) std::atomic<size_t>();
      }

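      // check in, then spin until all workers are released at once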
      ready++;
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        ++*(counters[AccessSpreader<Tag>::current(stripes)]);
        for (size_t j = work; j > 0; --j) {
          auto x = localWork.load();
          folly::doNotOptimizeAway(x);
        }
      }
    }));

    if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
      // create a few dummy threads to wrap back around to 0 mod numCpus
      for (size_t i = threads.size(); i != numThreads; ++i) {
        std::thread t([&]() {
          auto x = AccessSpreader<Tag>::current(stripes);
          folly::doNotOptimizeAway(x);
        });
        t.join();
      }
    }
  }

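  // wait until every worker has checked in, then start timing and release
  // them all at once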
  while (ready < numThreads) {
    std::this_thread::yield();
  }
  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}

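// Same loop shape as contentionAtWidth, but each thread increments its own
// local counter, so there is no cross-thread contention.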
static void atomicIncrBaseline(
    size_t iters, size_t work, size_t numThreads = 32) {
  folly::BenchmarkSuspender braces;

  std::atomic<bool> go(false);

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&]() {
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<size_t> localCounter(0);
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        localCounter++;
        for (size_t j = work; j > 0; --j) {
          auto x = localWork.load();
          folly::doNotOptimizeAway(x);
        }
      }
    }));
  }

  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}

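// Plain-function wrappers so BENCHMARK_NAMED_PARAM can name each
// AccessSpreader flavor.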
static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<std::atomic>(iters, stripes, work);
}

static void contentionAtWidthThreadLocal(
    size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
}

static void contentionAtWidthPthreadSelf(
    size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
}

static void contentionAtWidthCached(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<CachedCurrentTag>(iters, stripes, work);
}

BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  folly::runBenchmarks();
  return 0;
}