/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// AtomicSharedPtr-detail.h only works with libstdc++, so skip these tests
// for other vendors.
// PackedSyncPtr requires x64, ppc64, or aarch64, so skip these tests for
// other arches.
#include <folly/Portability.h>
#include <folly/portability/Config.h>
#if defined(__GLIBCXX__) && (FOLLY_X64 || FOLLY_PPC64 || FOLLY_AARCH64)

#include <atomic>
#include <memory>
#include <thread>
#include <vector>

#include <folly/Benchmark.h>
#include <folly/concurrency/AtomicSharedPtr.h>
#include <folly/concurrency/CoreCachedSharedPtr.h>
#include <folly/concurrency/ThreadCachedSynchronized.h>
#include <folly/concurrency/memory/ReadMostlySharedPtr.h>
#include <folly/portability/GTest.h>
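
// These tests and benchmarks exercise CoreCachedSharedPtr and related
// holders, which keep per-core copies of a shared_ptr so that concurrent
// acquisitions touch a core-local reference count instead of contending on a
// single control block.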

namespace {

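// Runs op(threadIndex) concurrently from numThreads threads and joins them
// all before returning.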
template <class Operation>
void parallelRun(
    Operation op, size_t numThreads = std::thread::hardware_concurrency()) {
  std::vector<std::thread> threads;
  for (size_t t = 0; t < numThreads; ++t) {
    threads.emplace_back([&, t] { op(t); });
  }
  for (auto& t : threads) {
    t.join();
  }
}

} // namespace

TEST(CoreCachedSharedPtr, Basic) {
  auto p = std::make_shared<int>(1);
  std::weak_ptr<int> wp(p);

  folly::CoreCachedSharedPtr<int> cached(p);
  folly::CoreCachedWeakPtr<int> wcached(cached);

  std::shared_ptr<int> p2 = cached.get();
  std::weak_ptr<int> wp2 = wcached.get();
  ASSERT_TRUE(p2 != nullptr);
  ASSERT_EQ(*p2, 1);
  ASSERT_FALSE(wp2.expired());

  // Check that all threads (and thus all per-core caches) observe the same
  // managed object.
  parallelRun([&](size_t) {
    EXPECT_EQ(cached.get().get(), p.get());
    EXPECT_EQ(*cached.get(), 1);
    EXPECT_EQ(*wcached.lock(), 1);
  });

  p.reset();
  cached.reset();
  // p2 should survive.
  ASSERT_FALSE(wp.expired());
  // Here we don't know anything about wp2: could be expired even if
  // there is a living reference to the main object.

  p2.reset();
  ASSERT_TRUE(wp.expired());
  ASSERT_TRUE(wp2.expired());
}

TEST(CoreCachedSharedPtr, AtomicCoreCachedSharedPtr) {
  constexpr size_t kIters = 2000;
  {
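    // All threads concurrently write (reset) and read the pointer.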
    folly::AtomicCoreCachedSharedPtr<size_t> p;
    parallelRun([&](size_t) {
      for (size_t i = 0; i < kIters; ++i) {
        p.reset(std::make_shared<size_t>(i));
        EXPECT_TRUE(p.get());
        // Read the value: every stored value is < kIters, and the read lets
        // ASAN/TSAN flag any lifetime or data-race bugs.
        EXPECT_LT(*p.get(), kIters);
      }
    });
  }
  {
    // One writer thread, all others readers; verify that observed values are
    // monotonically consistent across threads.
    std::atomic<size_t> largestValueObserved{0};
    folly::AtomicCoreCachedSharedPtr<size_t> p{std::make_shared<size_t>(0)};
    parallelRun([&](size_t t) {
      if (t == 0) {
        for (size_t i = 0; i < kIters; ++i) {
          p.reset(std::make_shared<size_t>(i + 1));
        }
      } else {
        while (true) {
          auto exp = largestValueObserved.load();
          auto value = *p.get();
          EXPECT_GE(value, exp);
          // Track the maximum value observed so far: once any thread has
          // observed an update, every later read in any thread must observe
          // a value at least as large.
          while (value > exp &&
                 !largestValueObserved.compare_exchange_weak(exp, value)) {
          }
          if (exp == kIters) {
            break;
          }
        }
      }
    });
  }
}

namespace {

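// Exercises shared_ptr's aliasing constructor: the stored pointer and the
// managed object may differ (a null stored pointer with a live managed
// object, or a non-null pointer with no managed object), and holders must
// preserve both halves.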
template <class Holder>
void testAliasingCornerCases() {
  {
    Holder h;
    auto p1 = std::make_shared<int>(1);
    std::weak_ptr<int> w1 = p1;
    // Aliasing constructor: p2 stores nullptr but still shares ownership of
    // the object managed by p1.
    std::shared_ptr<int> p2(p1, nullptr);
    // And now it's the only reference left.
    p1.reset();
    EXPECT_FALSE(w1.expired());

    // Pass ownership to the Holder.
    h.reset(p2);
    p2.reset();

    // Object should still be alive.
    EXPECT_FALSE(w1.expired());

    // Resetting the holder destroys the managed object...
    h.reset();
    // ...except with AtomicCoreCachedSharedPtr, which only schedules the
    // object for reclamation; force a hazptr scan before checking.
    folly::hazptr_cleanup();
    EXPECT_TRUE(w1.expired());
  }

  {
    Holder h;
    int x = 1;
    // p points to x, but has no managed object.
    std::shared_ptr<int> p(std::shared_ptr<int>{}, &x);
    h.reset(p);
    EXPECT_EQ(h.get().get(), &x);
  }
}

} // namespace

TEST(CoreCachedSharedPtr, AliasingCornerCases) {
  testAliasingCornerCases<folly::CoreCachedSharedPtr<int>>();
}

TEST(CoreCachedSharedPtr, AliasingCornerCasesAtomic) {
  testAliasingCornerCases<folly::AtomicCoreCachedSharedPtr<int>>();
}

namespace {

template <class Operation>
size_t benchmarkParallelRun(Operation op, size_t numThreads) {
  constexpr size_t kIters = 2 * 1000 * 1000;

  std::vector<std::thread> threads;

  // Prevent the compiler from hoisting code out of the loop.
  auto opNoinline = [&]() FOLLY_NOINLINE { op(); };

  std::atomic<size_t> ready = 0;
  std::atomic<bool> done = false;
  std::atomic<size_t> iters = 0;

  for (size_t t = 0; t < numThreads; ++t) {
    threads.emplace_back([&] {
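      // Barrier: spin until every worker has arrived, so all threads enter
      // the measured loop together.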
      ++ready;
      while (ready.load() < numThreads) {
      }
      size_t i = 0;
      // Interrupt all workers as soon as the first one is done, so the overall
      // wall time is close to the time spent in each thread.
      for (; !done.load(std::memory_order_acquire) && i < kIters; ++i) {
        opNoinline();
      }
      done = true;
      iters += i;
    });
  }

  for (auto& t : threads) {
    t.join();
  }

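  // BENCHMARK_MULTI treats the return value as the number of iterations
  // performed, so report the per-thread average.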
  return iters / numThreads;
}

size_t benchmarkSharedPtrAcquire(size_t numThreads) {
  auto p = std::make_shared<int>(1);
  return benchmarkParallelRun([&] { return p; }, numThreads);
}

size_t benchmarkWeakPtrLock(size_t numThreads) {
  auto p = std::make_shared<int>(1);
  std::weak_ptr<int> wp = p;
  return benchmarkParallelRun([&] { return wp.lock(); }, numThreads);
}

size_t benchmarkAtomicSharedPtrAcquire(size_t numThreads) {
  auto s = std::make_shared<int>(1);
  folly::atomic_shared_ptr<int> p;
  p.store(s);
  return benchmarkParallelRun([&] { return p.load(); }, numThreads);
}

size_t benchmarkCoreCachedSharedPtrAcquire(size_t numThreads) {
  folly::CoreCachedSharedPtr<int> p(std::make_shared<int>(1));
  return benchmarkParallelRun([&] { return p.get(); }, numThreads);
}

size_t benchmarkCoreCachedWeakPtrLock(size_t numThreads) {
  folly::CoreCachedSharedPtr<int> p(std::make_shared<int>(1));
  folly::CoreCachedWeakPtr<int> wp(p);
  return benchmarkParallelRun([&] { return wp.lock(); }, numThreads);
}

size_t benchmarkAtomicCoreCachedSharedPtrAcquire(size_t numThreads) {
  folly::AtomicCoreCachedSharedPtr<int> p(std::make_shared<int>(1));
  return benchmarkParallelRun([&] { return p.get(); }, numThreads);
}

size_t benchmarkReadMostlySharedPtrAcquire(size_t numThreads) {
  folly::ReadMostlyMainPtr<int> p{std::make_shared<int>(1)};
  return benchmarkParallelRun([&] { return p.getShared(); }, numThreads);
}

size_t benchmarkReadMostlyWeakPtrLock(size_t numThreads) {
  folly::ReadMostlyMainPtr<int> p{std::make_shared<int>(1)};
  folly::ReadMostlyWeakPtr<int> w{p};
  return benchmarkParallelRun([&] { return w.lock(); }, numThreads);
}

size_t benchmarkThreadCachedSynchronizedAcquire(size_t numThreads) {
  folly::thread_cached_synchronized<std::shared_ptr<int>> p{
      std::make_shared<int>(1)};
  return benchmarkParallelRun([&] { return *p; }, numThreads);
}

} // namespace

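// Instantiates the full suite of acquire/lock benchmarks for a given thread
// count, with BENCHMARK_DRAW_LINE() separating the groups in the output.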
#define BENCHMARK_THREADS(THREADS)                                       \
  BENCHMARK_MULTI(SharedPtrAcquire_##THREADS##Threads) {                 \
    return benchmarkSharedPtrAcquire(THREADS);                           \
  }                                                                      \
  BENCHMARK_MULTI(WeakPtrLock_##THREADS##Threads) {                      \
    return benchmarkWeakPtrLock(THREADS);                                \
  }                                                                      \
  BENCHMARK_MULTI(AtomicSharedPtrAcquire_##THREADS##Threads) {           \
    return benchmarkAtomicSharedPtrAcquire(THREADS);                     \
  }                                                                      \
  BENCHMARK_MULTI(ThreadCachedSynchronizedAcquire_##THREADS##Threads) {  \
    return benchmarkThreadCachedSynchronizedAcquire(THREADS);            \
  }                                                                      \
  BENCHMARK_MULTI(CoreCachedSharedPtrAcquire_##THREADS##Threads) {       \
    return benchmarkCoreCachedSharedPtrAcquire(THREADS);                 \
  }                                                                      \
  BENCHMARK_MULTI(CoreCachedWeakPtrLock_##THREADS##Threads) {            \
    return benchmarkCoreCachedWeakPtrLock(THREADS);                      \
  }                                                                      \
  BENCHMARK_MULTI(AtomicCoreCachedSharedPtrAcquire_##THREADS##Threads) { \
    return benchmarkAtomicCoreCachedSharedPtrAcquire(THREADS);           \
  }                                                                      \
  BENCHMARK_MULTI(ReadMostlySharedPtrAcquire_##THREADS##Threads) {       \
    return benchmarkReadMostlySharedPtrAcquire(THREADS);                 \
  }                                                                      \
  BENCHMARK_MULTI(ReadMostlyWeakPtrLock_##THREADS##Threads) {            \
    return benchmarkReadMostlyWeakPtrLock(THREADS);                      \
  }                                                                      \
  BENCHMARK_DRAW_LINE();

BENCHMARK_THREADS(1)
BENCHMARK_THREADS(4)
BENCHMARK_THREADS(8)
BENCHMARK_THREADS(16)
BENCHMARK_THREADS(32)
BENCHMARK_THREADS(64)

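// Single-threaded reset() benchmarks: as the numbers below show, the
// core-cached variants trade much more expensive writes for their scalable
// reads.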
BENCHMARK_MULTI(SharedPtrSingleThreadReset) {
  auto p = std::make_shared<int>(1);
  return benchmarkParallelRun([&] { p = std::make_shared<int>(1); }, 1);
}
BENCHMARK_MULTI(AtomicSharedPtrSingleThreadReset) {
  auto s = std::make_shared<int>(1);
  folly::atomic_shared_ptr<int> p;
  p.store(s);
  return benchmarkParallelRun([&] { p.store(std::make_shared<int>(1)); }, 1);
}
BENCHMARK_MULTI(CoreCachedSharedPtrSingleThreadReset) {
  folly::CoreCachedSharedPtr<int> p(std::make_shared<int>(1));
  return benchmarkParallelRun([&] { p.reset(std::make_shared<int>(1)); }, 1);
}
BENCHMARK_MULTI(AtomicCoreCachedSharedPtrSingleThreadReset) {
  folly::AtomicCoreCachedSharedPtr<int> p(std::make_shared<int>(1));
  return benchmarkParallelRun([&] { p.reset(std::make_shared<int>(1)); }, 1);
}

int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  gflags::ParseCommandLineFlags(&argc, &argv, true);

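  // Run the tests first; benchmarks run only if all tests pass and
  // --benchmark was requested.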
  auto ret = RUN_ALL_TESTS();
  if (ret == 0 && FLAGS_benchmark) {
    folly::runBenchmarks();
  }

  return ret;
}

#endif // defined(__GLIBCXX__) && (FOLLY_X64 || FOLLY_PPC64 || FOLLY_AARCH64)

#if 0
// On Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz, 2 sockets, 40 cores, 80 threads.
$ buck-out/gen/folly/concurrency/test/core_cached_shared_ptr_test --benchmark --bm_min_usec 500000

============================================================================
[...]ency/test/CoreCachedSharedPtrTest.cpp     relative  time/iter   iters/s
============================================================================
SharedPtrAcquire_1Threads                                  19.86ns    50.36M
WeakPtrLock_1Threads                                       21.23ns    47.10M
AtomicSharedPtrAcquire_1Threads                            30.92ns    32.34M
ThreadCachedSynchronizedAcquire_1Threads                   21.61ns    46.27M
CoreCachedSharedPtrAcquire_1Threads                        21.87ns    45.72M
CoreCachedWeakPtrLock_1Threads                             22.09ns    45.27M
AtomicCoreCachedSharedPtrAcquire_1Threads                  24.47ns    40.87M
ReadMostlySharedPtrAcquire_1Threads                        15.49ns    64.57M
ReadMostlyWeakPtrLock_1Threads                             15.32ns    65.27M
----------------------------------------------------------------------------
SharedPtrAcquire_4Threads                                 255.09ns     3.92M
WeakPtrLock_4Threads                                      287.98ns     3.47M
AtomicSharedPtrAcquire_4Threads                           709.64ns     1.41M
ThreadCachedSynchronizedAcquire_4Threads                  228.03ns     4.39M
CoreCachedSharedPtrAcquire_4Threads                        22.68ns    44.09M
CoreCachedWeakPtrLock_4Threads                             23.62ns    42.34M
AtomicCoreCachedSharedPtrAcquire_4Threads                  24.35ns    41.07M
ReadMostlySharedPtrAcquire_4Threads                        16.86ns    59.31M
ReadMostlyWeakPtrLock_4Threads                             16.25ns    61.55M
----------------------------------------------------------------------------
SharedPtrAcquire_8Threads                                 488.90ns     2.05M
WeakPtrLock_8Threads                                      577.98ns     1.73M
AtomicSharedPtrAcquire_8Threads                             1.78us   561.16K
ThreadCachedSynchronizedAcquire_8Threads                  483.47ns     2.07M
CoreCachedSharedPtrAcquire_8Threads                        23.45ns    42.64M
CoreCachedWeakPtrLock_8Threads                             23.81ns    41.99M
AtomicCoreCachedSharedPtrAcquire_8Threads                  24.49ns    40.83M
ReadMostlySharedPtrAcquire_8Threads                        17.10ns    58.49M
ReadMostlyWeakPtrLock_8Threads                             16.73ns    59.76M
----------------------------------------------------------------------------
SharedPtrAcquire_16Threads                                  1.08us   929.56K
WeakPtrLock_16Threads                                       1.56us   642.75K
AtomicSharedPtrAcquire_16Threads                            2.94us   340.19K
ThreadCachedSynchronizedAcquire_16Threads                   1.02us   981.43K
CoreCachedSharedPtrAcquire_16Threads                       25.58ns    39.10M
CoreCachedWeakPtrLock_16Threads                            24.79ns    40.34M
AtomicCoreCachedSharedPtrAcquire_16Threads                 25.97ns    38.50M
ReadMostlySharedPtrAcquire_16Threads                       18.11ns    55.20M
ReadMostlyWeakPtrLock_16Threads                            17.91ns    55.85M
----------------------------------------------------------------------------
SharedPtrAcquire_32Threads                                  2.25us   444.85K
WeakPtrLock_32Threads                                       3.29us   304.08K
AtomicSharedPtrAcquire_32Threads                            6.86us   145.76K
ThreadCachedSynchronizedAcquire_32Threads                   2.30us   434.67K
CoreCachedSharedPtrAcquire_32Threads                       25.35ns    39.45M
CoreCachedWeakPtrLock_32Threads                            25.37ns    39.41M
AtomicCoreCachedSharedPtrAcquire_32Threads                 26.69ns    37.46M
ReadMostlySharedPtrAcquire_32Threads                       19.32ns    51.75M
ReadMostlyWeakPtrLock_32Threads                            18.37ns    54.43M
----------------------------------------------------------------------------
SharedPtrAcquire_64Threads                                  4.92us   203.42K
WeakPtrLock_64Threads                                      10.90us    91.77K
AtomicSharedPtrAcquire_64Threads                           14.06us    71.12K
ThreadCachedSynchronizedAcquire_64Threads                   4.64us   215.59K
CoreCachedSharedPtrAcquire_64Threads                       33.87ns    29.53M
CoreCachedWeakPtrLock_64Threads                            48.94ns    20.43M
AtomicCoreCachedSharedPtrAcquire_64Threads                 35.85ns    27.89M
ReadMostlySharedPtrAcquire_64Threads                       32.08ns    31.17M
ReadMostlyWeakPtrLock_64Threads                            29.83ns    33.52M
----------------------------------------------------------------------------
SharedPtrSingleThreadReset                                 12.52ns    79.89M
AtomicSharedPtrSingleThreadReset                           48.73ns    20.52M
CoreCachedSharedPtrSingleThreadReset                        2.94us   340.29K
AtomicCoreCachedSharedPtrSingleThreadReset                  5.15us   194.14K
============================================================================
#endif