/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <algorithm>
#include <array>
#include <atomic>
#include <cassert>
#include <functional>
#include <limits>
#include <string>
#include <type_traits>
#include <vector>

#include <folly/Likely.h>
#include <folly/Portability.h>
#include <folly/lang/Align.h>
#include <folly/synchronization/AtomicRef.h>

namespace folly {

// This file contains several classes that might be useful if you are
// trying to dynamically optimize cache locality: CacheLocality reads
// cache sharing information from sysfs to determine how CPUs should be
// grouped to minimize contention, Getcpu provides fast access to the
// current CPU via __vdso_getcpu, and AccessSpreader uses these two to
// optimally spread accesses among a predetermined number of stripes.
//
// AccessSpreader<>::current(n) microbenchmarks at 22 nanos, which is
// substantially less than the cost of a cache miss.  This means that we
// can effectively use it to reduce cache line ping-pong on striped data
// structures such as IndexedMemPool or statistics counters.
//
// Because CacheLocality looks at all of the cache levels, it can be
// used for different levels of optimization.  AccessSpreader(2) does
// per-chip spreading on a dual socket system.  AccessSpreader(numCpus)
// does perfect per-cpu spreading.  AccessSpreader(numCpus / 2) does
// perfect L1 spreading in a system with hyperthreading enabled.

struct CacheLocality { … };

/// Knows how to derive a function pointer to the VDSO implementation of
/// getcpu(2), if available
struct Getcpu { … };

struct SequentialThreadId { … };

struct HashingThreadId { … };

/// A class that lazily binds a unique (for each implementation of Atom)
/// identifier to a thread.  This is a fallback mechanism for the access
/// spreader if __vdso_getcpu can't be loaded
template <typename ThreadId>
struct FallbackGetcpu { … };

typedef FallbackGetcpu<…> FallbackGetcpuType;

namespace detail {

class AccessSpreaderBase { … };

} // namespace detail

/// AccessSpreader arranges access to a striped data structure in such a
/// way that concurrently executing threads are likely to be accessing
/// different stripes.  It does NOT guarantee uncontended access.
/// Your underlying algorithm must be thread-safe without spreading; this
/// is merely an optimization.  AccessSpreader::current(n) is typically
/// much faster than a cache miss (12 nanos on my dev box, tested fast
/// in both 2.6 and 3.2 kernels).
///
/// If available (and not using the deterministic testing implementation)
/// AccessSpreader uses the getcpu system call via VDSO and the
/// precise locality information retrieved from sysfs by CacheLocality.
/// This provides optimal anti-sharing at a fraction of the cost of a
/// cache miss.
///
/// When there are not as many stripes as processors, we try to optimally
/// place the cache sharing boundaries.  This means that if you have 2
/// stripes and run on a dual-socket system, your 2 stripes will each get
/// all of the cores from a single socket.  If you have 16 stripes on a
/// 16 core system plus hyperthreading (32 cpus), each core will get its
/// own stripe and there will be no cache sharing at all.
///
/// AccessSpreader has a fallback mechanism for when __vdso_getcpu can't be
/// loaded, or for use during deterministic testing.  Using sched_getcpu
/// or the getcpu syscall would negate the performance advantages of
/// access spreading, so we use a thread-local value and a shared atomic
/// counter to spread access out.  On systems lacking both a fast getcpu()
/// and TLS, we hash the thread id to spread accesses.
///
/// AccessSpreader is templated on the template type that is used
/// to implement atomics, as a way to instantiate the underlying
/// heuristics differently for production use and deterministic unit
/// testing.  See DeterministicScheduler for more.  If you aren't using
/// DeterministicScheduler, you can just use the default template parameter
/// all of the time.
template <template <typename> class Atom = std::atomic>
struct AccessSpreader : private detail::AccessSpreaderBase { … };
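
/// Example (an illustrative sketch, not part of this header): using
/// AccessSpreader<>::current() to pick a stripe in an array of per-stripe
/// statistics counters.  The names kNumStripes, StripedCounter, and
/// incrementCounter are hypothetical; only AccessSpreader and
/// hardware_destructive_interference_size (from folly/lang/Align.h) are
/// folly names.
///
///   constexpr size_t kNumStripes = 16;
///
///   // One counter per stripe, padded so stripes never share a cache line.
///   struct alignas(folly::hardware_destructive_interference_size)
///   StripedCounter {
///     std::atomic<uint64_t> value{0};
///   };
///   std::array<StripedCounter, kNumStripes> counters;
///
///   void incrementCounter() {
///     // current(n) returns an index in [0, n): threads on cores that share
///     // caches tend to map to the same stripe, and threads on distant cores
///     // tend to map to different stripes, which reduces cache-line
///     // ping-pong on the hot counters.
///     auto stripe = folly::AccessSpreader<>::current(kNumStripes);
///     counters[stripe].value.fetch_add(1, std::memory_order_relaxed);
///   }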

/**
 * An allocator that can be used with AccessSpreader to allocate core-local
 * memory.
 *
 * There is actually nothing special about the memory itself (it is not bound
 * to NUMA nodes or anything), but the allocator guarantees that memory
 * allocated from the same stripe will only come from cache lines also
 * allocated to the same stripe, for the given numStripes.  This means
 * multiple things using AccessSpreader can allocate memory in
 * smaller-than-cacheline increments and be assured that it won't cause more
 * false sharing than it otherwise would.
 *
 * Note that allocation and deallocation take a per-size-class lock.
 *
 * Memory allocated with coreMalloc() must be freed with coreFree().
 */
void* coreMalloc(size_t size, size_t numStripes, size_t stripe);
void coreFree(void* ptr);

namespace detail {
void* coreMallocFromGuard(size_t size);
} // namespace detail

/**
 * A C++ allocator adapter for coreMalloc()/coreFree().  The allocator is
 * stateless, to avoid increasing the footprint of the container that uses it,
 * so the stripe needs to be passed out of band: allocate() can only be called
 * while there is an active CoreAllocatorGuard.  deallocate() can instead be
 * called at any point.
 *
 * This makes CoreAllocator unsuitable for containers that can grow; it is
 * meant for containers where all allocations happen at construction time.
 */
template <typename T>
class CoreAllocator : private std::allocator<T> { … };

class FOLLY_NODISCARD CoreAllocatorGuard { … };

} // namespace folly
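
// Example (an illustrative sketch, not part of this header): combining
// AccessSpreader with coreMalloc()/coreFree() so that each stripe's data
// lives on cache lines owned by that stripe.  kNumStripes and the slot setup
// are hypothetical; the coreMalloc/coreFree signatures are the ones declared
// above.
//
//   constexpr size_t kNumStripes = 8;
//   std::array<std::atomic<uint64_t>*, kNumStripes> slots;
//
//   for (size_t stripe = 0; stripe < kNumStripes; ++stripe) {
//     // Memory handed out for a given (numStripes, stripe) pair shares cache
//     // lines only with other allocations made for that same stripe.
//     void* mem = folly::coreMalloc(
//         sizeof(std::atomic<uint64_t>), kNumStripes, stripe);
//     slots[stripe] = new (mem) std::atomic<uint64_t>(0);
//   }
//
//   // Writers pick their slot with AccessSpreader, as in the earlier example:
//   //   slots[folly::AccessSpreader<>::current(kNumStripes)]->fetch_add(1);
//
//   for (auto* slot : slots) {
//     slot->~atomic();        // placement-new'd objects are destroyed manually
//     folly::coreFree(slot);  // coreMalloc'd memory must go back through coreFree
//   }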