/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/AtomicUnorderedMap.h>

#include <memory>
#include <thread>
#include <unordered_map>

#include <glog/logging.h>

#include <folly/Benchmark.h>
#include <folly/Random.h>
#include <folly/portability/GFlags.h>
#include <folly/portability/GTest.h>
#include <folly/test/DeterministicSchedule.h>

using namespace folly;

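// A drop-in stand-in for std::atomic<T> that performs the same operations
// non-atomically. The fast_map benchmarks below substitute it for
// std::atomic to measure how much the atomic operations themselves cost.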
template <class T>
struct non_atomic {
  T value;

  non_atomic() = default;
  non_atomic(const non_atomic&) = delete;
  constexpr /* implicit */ non_atomic(T desired) : value(desired) {}

  T operator+=(T arg) {
    value += arg;
    return load();
  }

  T load(std::memory_order /* order */ = std::memory_order_seq_cst) const {
    return value;
  }

  /* implicit */
  operator T() const { return load(); }

  void store(
      T desired, std::memory_order /* order */ = std::memory_order_seq_cst) {
    value = desired;
  }

  T exchange(
      T desired, std::memory_order /* order */ = std::memory_order_seq_cst) {
    T old = load();
    store(desired);
    return old;
  }

  bool compare_exchange_weak(
      T& expected,
      T desired,
      std::memory_order /* success */ = std::memory_order_seq_cst,
      std::memory_order /* failure */ = std::memory_order_seq_cst) {
    if (value == expected) {
      value = desired;
      return true;
    }

    expected = value;
    return false;
  }

  bool compare_exchange_strong(
      T& expected,
      T desired,
      std::memory_order /* success */ = std::memory_order_seq_cst,
      std::memory_order /* failure */ = std::memory_order_seq_cst) {
    if (value == expected) {
      value = desired;
      return true;
    }

    expected = value;
    return false;
  }

  bool is_lock_free() const { return true; }
};

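// Convenience alias over AtomicUnorderedInsertMap that pins the hasher and
// key-equal to the std defaults and enables skipping key/value destruction
// whenever both types are trivially destructible.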
template <
    typename Key,
    typename Value,
    typename IndexType,
    template <typename> class Atom = std::atomic,
    typename Allocator = std::allocator<char>>
using UIM = AtomicUnorderedInsertMap<
    Key,
    Value,
    std::hash<Key>,
    std::equal_to<Key>,
    (std::is_trivially_destructible<Key>::value &&
     std::is_trivially_destructible<Value>::value),
    Atom,
    IndexType,
    Allocator>;

namespace {
template <typename T>
struct AtomicUnorderedInsertMapTest : public ::testing::Test {};
} // namespace

// A uint16_t index limits the map to a tiny capacity, so it is rarely
// useful in practice, but we might as well test it
using IndexTypesToTest = ::testing::Types<uint16_t, uint32_t, uint64_t>;
TYPED_TEST_SUITE(AtomicUnorderedInsertMapTest, IndexTypesToTest);

TYPED_TEST(AtomicUnorderedInsertMapTest, basic) {
  UIM<std::string,
      std::string,
      TypeParam,
      std::atomic,
      folly::detail::MMapAlloc>
      m(100);

  m.emplace("abc", "ABC");
  EXPECT_TRUE(m.find("abc") != m.cend());
  EXPECT_EQ(m.find("abc")->first, "abc");
  EXPECT_EQ(m.find("abc")->second, "ABC");
  EXPECT_TRUE(m.find("def") == m.cend());
  auto iter = m.cbegin();
  EXPECT_TRUE(iter != m.cend());
  EXPECT_TRUE(iter == m.find("abc"));
  auto a = iter;
  EXPECT_TRUE(a == iter);
  auto b = iter;
  ++iter;
  EXPECT_TRUE(iter == m.cend());
  EXPECT_TRUE(a == b);
  EXPECT_TRUE(a != iter);
  a++;
  EXPECT_TRUE(a == iter);
  EXPECT_TRUE(a != b);

  auto rangeIter = m.cbegin();
  for (const auto& [key, value] : m) {
    EXPECT_EQ(key, rangeIter->first);
    EXPECT_EQ(value, rangeIter->second);
    rangeIter++;
  }
  EXPECT_EQ(m.end(), m.cend());
}

TEST(AtomicUnorderedInsertMap, loadFactor) {
  AtomicUnorderedInsertMap<int, bool> m(5000, 0.5f);

  // we should be able to put in much more than 5000 things because of
  // our load factor request
  for (int i = 0; i < 10000; ++i) {
    m.emplace(i, true);
  }
}

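// The map is insert-only and never rehashes, so emplace() throws
// std::bad_alloc once its fixed capacity is exhausted.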
TEST(AtomicUnorderedInsertMap, capacityExceeded) {
  AtomicUnorderedInsertMap<int, bool> m(5000, 1.0f);

  EXPECT_THROW(
      {
        for (int i = 0; i < 6000; ++i) {
          m.emplace(i, false);
        }
      },
      std::bad_alloc);
}

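// The map never erases or overwrites entries, but wrapping the value in
// MutableAtom (or MutableData for the non-atomic variant below) exposes a
// mutable .data member so values can still be updated in place.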
TYPED_TEST(AtomicUnorderedInsertMapTest, valueMutation) {
  UIM<int, MutableAtom<int>, TypeParam> m(100);

  for (int i = 0; i < 50; ++i) {
    m.emplace(i, i);
  }

  m.find(1)->second.data++;
}

TEST(UnorderedInsertMap, valueMutation) {
  UIM<int, MutableData<int>, uint32_t, non_atomic> m(100);

  for (int i = 0; i < 50; ++i) {
    m.emplace(i, i);
  }

  m.find(1)->second.data++;
  EXPECT_EQ(m.find(1)->second.data, 2);
}

// This test is too expensive to run automatically.  On my dev server it
// takes about 10 minutes for a dbg build and 2 minutes for opt.
TEST(AtomicUnorderedInsertMap, DISABLED_MegaMap) {
  size_t capacity = 2000000000;
  AtomicUnorderedInsertMap64<size_t, size_t> big(capacity);
  for (size_t i = 0; i < capacity * 2; i += 2) {
    big.emplace(i, i * 10);
  }
  for (size_t i = 0; i < capacity * 3; i += capacity / 1000 + 1) {
    auto iter = big.find(i);
    if ((i & 1) == 0 && i < capacity * 2) {
      EXPECT_EQ(iter->second, i * 10);
    } else {
      EXPECT_TRUE(iter == big.cend());
    }
  }
}

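// Measures successful lookups: the map is pre-populated under
// BENCHMARK_SUSPEND with every multiple of 3 below 3 * capacity, and the
// probe keys are drawn from that same set, so every find() should hit.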
BENCHMARK(lookup_int_int_hit, iters) {
  std::unique_ptr<AtomicUnorderedInsertMap<int, size_t>> ptr = {};

  size_t capacity = 100000;

  BENCHMARK_SUSPEND {
    ptr = std::make_unique<AtomicUnorderedInsertMap<int, size_t>>(capacity);
    for (size_t i = 0; i < capacity; ++i) {
      auto k = 3 * ((5641 * i) % capacity);
      ptr->emplace(k, k + 1);
      EXPECT_EQ(ptr->find(k)->second, k + 1);
    }
  }

  for (size_t i = 0; i < iters; ++i) {
    size_t k = 3 * (((i * 7919) ^ (i * 4001)) % capacity);
    auto iter = ptr->find(k);
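    // If the lookup unexpectedly misses or disagrees with the expected
    // value, repeat it so the failing EXPECT below is easier to diagnose.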
    if (iter == ptr->cend() || iter->second != k + 1) {
      auto jter = ptr->find(k);
      EXPECT_TRUE(iter == jter);
    }
    EXPECT_EQ(iter->second, k + 1);
  }

  BENCHMARK_SUSPEND {
    ptr.reset(nullptr);
  }
}

struct PairHash {
  size_t operator()(const std::pair<uint64_t, uint64_t>& pr) const {
    return pr.first ^ pr.second;
  }
};

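// Contended read/write benchmark driver: numThreads threads each perform
// itersPerThread operations against a shared map, issuing roughly
// readsPerWrite lookups per insert and capping inserts so the map stays
// within its fixed capacity.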
void contendedRW(
    size_t itersPerThread,
    size_t capacity,
    size_t numThreads,
    size_t readsPerWrite) {
  typedef std::pair<uint64_t, uint64_t> Key;
  typedef AtomicUnorderedInsertMap<Key, MutableAtom<uint32_t>, PairHash> Map;

  std::unique_ptr<Map> ptr = {};
  std::atomic<bool> go{false};
  std::vector<std::thread> threads;

  BENCHMARK_SUSPEND {
    ptr = std::make_unique<Map>(capacity);
    while (threads.size() < numThreads) {
      threads.emplace_back([&]() {
        while (!go) {
          std::this_thread::yield();
        }

        size_t reads = 0;
        size_t writes = 0;
        while (reads + writes < itersPerThread) {
          auto r = Random::rand32();
          Key key(reads + writes, r);
          if (reads < writes * readsPerWrite ||
              writes >= capacity / numThreads) {
            // read needed
            ++reads;
            auto iter = ptr->find(key);
            EXPECT_TRUE(
                iter == ptr->cend() ||
                iter->second.data.load(std::memory_order_acquire) >= key.first);
          } else {
            ++writes;
            try {
              auto pr = ptr->emplace(key, key.first);
              if (!pr.second) {
                pr.first->second.data++;
              }
            } catch (std::bad_alloc&) {
              LOG(INFO) << "bad alloc";
            }
          }
        }
      });
    }
  }

  go = true;

  for (auto& thr : threads) {
    thr.join();
  }

  BENCHMARK_SUSPEND {
    ptr.reset(nullptr);
  }
}

// clang-format off
// sudo nice -n -20 ~/fbcode/_bin/common/concurrency/experimental/atomic_unordered_map --benchmark --bm_min_iters=1000000
//
// without MAP_HUGETLB (default)
//
// ============================================================================
// common/concurrency/experimental/AtomicUnorderedMapTest.cpp    relative  time/iter  iters/s
// ============================================================================
// lookup_int_int_hit                                          20.05ns   49.89M
// contendedRW(small_32thr_99pct)                              70.36ns   14.21M
// contendedRW(large_32thr_99pct)                             164.23ns    6.09M
// contendedRW(large_32thr_99_9pct)                           158.81ns    6.30M
// ============================================================================
//
// with MAP_HUGETLB hacked in
// ============================================================================
// lookup_int_int_hit                                          19.67ns   50.84M
// contendedRW(small_32thr_99pct)                              62.46ns   16.01M
// contendedRW(large_32thr_99pct)                             119.41ns    8.37M
// contendedRW(large_32thr_99_9pct)                           111.23ns    8.99M
// ============================================================================
// clang-format on
BENCHMARK_NAMED_PARAM(contendedRW, small_32thr_99pct, 100000, 32, 99)
BENCHMARK_NAMED_PARAM(contendedRW, large_32thr_99pct, 100000000, 32, 99)
BENCHMARK_NAMED_PARAM(contendedRW, large_32thr_99_9pct, 100000000, 32, 999)

BENCHMARK_DRAW_LINE();

// clang-format off
// sudo nice -n -20 ~/fbcode/_build/opt/site_integrity/quasar/experimental/atomic_unordered_map_test --benchmark --bm_min_iters=10000
// Single-threaded benchmarks to see how much faster we are than
// std::unordered_map and what the cost of atomic operations is in the
// uncontended case.
// ============================================================================
// std_map                                                      1.20ms   832.58
// atomic_fast_map                                            511.35us    1.96K
// fast_map                                                   196.28us    5.09K
// ============================================================================
// clang-format on

BENCHMARK(std_map) {
  std::unordered_map<long, long> m;
  m.reserve(10000);
  for (int i = 0; i < 10000; ++i) {
    m.emplace(i, i);
  }

  for (int i = 0; i < 10000; ++i) {
    auto a = m.find(i);
    folly::doNotOptimizeAway(&*a);
  }
}

BENCHMARK(atomic_fast_map) {
  UIM<long, long, uint32_t, std::atomic> m(10000);
  for (int i = 0; i < 10000; ++i) {
    m.emplace(i, i);
  }

  for (int i = 0; i < 10000; ++i) {
    auto a = m.find(i);
    folly::doNotOptimizeAway(&*a);
  }
}

BENCHMARK(fast_map) {
  UIM<long, long, uint32_t, non_atomic> m(10000);
  for (int i = 0; i < 10000; ++i) {
    m.emplace(i, i);
  }

  for (int i = 0; i < 10000; ++i) {
    auto a = m.find(i);
    folly::doNotOptimizeAway(&*a);
  }
}

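// Same as the *_map benchmarks above but with a 64-bit slot index, which
// increases per-slot overhead in exchange for supporting far larger
// capacities.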
BENCHMARK(atomic_fast_map_64) {
  UIM<long, long, uint64_t, std::atomic> m(10000);
  for (int i = 0; i < 10000; ++i) {
    m.emplace(i, i);
  }

  for (int i = 0; i < 10000; ++i) {
    auto a = m.find(i);
    folly::doNotOptimizeAway(&*a);
  }
}

BENCHMARK(fast_map_64) {
  UIM<long, long, uint64_t, non_atomic> m(10000);
  for (int i = 0; i < 10000; ++i) {
    m.emplace(i, i);
  }

  for (int i = 0; i < 10000; ++i) {
    auto a = m.find(i);
    folly::doNotOptimizeAway(&*a);
  }
}

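// Run the gtest suite first; the benchmarks execute only if the --benchmark
// flag was passed (folly::runBenchmarksOnFlag).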
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  int rv = RUN_ALL_TESTS();
  folly::runBenchmarksOnFlag();
  return rv;
}