/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <algorithm>
#include <array>
#include <atomic>
#include <cassert>
#include <functional>
#include <limits>
#include <string>
#include <type_traits>
#include <vector>

#include <folly/Likely.h>
#include <folly/Portability.h>
#include <folly/lang/Align.h>
#include <folly/synchronization/AtomicRef.h>

namespace folly {

// This file contains several classes that might be useful if you are
// trying to dynamically optimize cache locality: CacheLocality reads
// cache sharing information from sysfs to determine how CPUs should be
// grouped to minimize contention, Getcpu provides fast access to the
// current CPU via __vdso_getcpu, and AccessSpreader uses these two to
// optimally spread accesses among a predetermined number of stripes.
//
// AccessSpreader<>::current(n) microbenchmarks at 22 nanos, which is
// substantially less than the cost of a cache miss.  This means that we
// can effectively use it to reduce cache line ping-pong on striped data
// structures such as IndexedMemPool or statistics counters.
//
// Because CacheLocality looks at all of the cache levels, it can be
// used for different levels of optimization.  AccessSpreader(2) does
// per-chip spreading on a dual socket system.  AccessSpreader(numCpus)
// does perfect per-cpu spreading.  AccessSpreader(numCpus / 2) does
// perfect L1 spreading in a system with hyperthreading enabled.

struct CacheLocality { … };

/// Knows how to derive a function pointer to the VDSO implementation of
/// getcpu(2), if available
struct Getcpu { … };

struct SequentialThreadId { … };

struct HashingThreadId { … };

/// A class that lazily binds a unique (for each implementation of Atom)
/// identifier to a thread.  This is a fallback mechanism for the access
/// spreader if __vdso_getcpu can't be loaded
template <typename ThreadId>
struct FallbackGetcpu { … };

typedef FallbackGetcpu<…> FallbackGetcpuType;

namespace detail {

class AccessSpreaderBase { … };

} // namespace detail

/// AccessSpreader arranges access to a striped data structure in such a
/// way that concurrently executing threads are likely to be accessing
/// different stripes.  It does NOT guarantee uncontended access.
/// Your underlying algorithm must be thread-safe without spreading; this
/// is merely an optimization.  AccessSpreader::current(n) is typically
/// much faster than a cache miss (12 nanos on my dev box, tested fast
/// in both 2.6 and 3.2 kernels).
///
/// If available (and not using the deterministic testing implementation)
/// AccessSpreader uses the getcpu system call via VDSO and the
/// precise locality information retrieved from sysfs by CacheLocality.
/// This provides optimal anti-sharing at a fraction of the cost of a
/// cache miss.
///
/// When there are not as many stripes as processors, we try to optimally
/// place the cache sharing boundaries.  This means that if you have 2
/// stripes and run on a dual-socket system, your 2 stripes will each get
/// all of the cores from a single socket.  If you have 16 stripes on a
/// 16 core system plus hyperthreading (32 cpus), each core will get its
/// own stripe and there will be no cache sharing at all.
///
/// AccessSpreader has a fallback mechanism for when __vdso_getcpu can't be
/// loaded, or for use during deterministic testing.  Using sched_getcpu
/// or the getcpu syscall would negate the performance advantages of
/// access spreading, so we use a thread-local value and a shared atomic
/// counter to spread access out.  On systems lacking both a fast getcpu()
/// and TLS, we hash the thread id to spread accesses.
///
/// AccessSpreader is templated on the template type that is used
/// to implement atomics, as a way to instantiate the underlying
/// heuristics differently for production use and deterministic unit
/// testing.  See DeterministicScheduler for more.  If you aren't using
/// DeterministicScheduler, you can just use the default template parameter
/// all of the time.
template <template <typename> class Atom = std::atomic>
struct AccessSpreader : private detail::AccessSpreaderBase { … };
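
/// Example (an illustrative sketch, not part of this header): using
/// AccessSpreader<>::current() to pick a stripe in an array of per-stripe
/// statistics counters.  The names kNumStripes, StripedCounter, and
/// incrementCounter are hypothetical; only AccessSpreader and
/// hardware_destructive_interference_size (from folly/lang/Align.h) are
/// folly names.
///
///   constexpr size_t kNumStripes = 16;
///
///   // One counter per stripe, padded so stripes never share a cache line.
///   struct alignas(folly::hardware_destructive_interference_size)
///   StripedCounter {
///     std::atomic<uint64_t> value{0};
///   };
///   std::array<StripedCounter, kNumStripes> counters;
///
///   void incrementCounter() {
///     // current(n) returns an index in [0, n): threads on cores that share
///     // caches tend to map to the same stripe, and threads on distant cores
///     // tend to map to different stripes, which reduces cache-line
///     // ping-pong on the hot counters.
///     auto stripe = folly::AccessSpreader<>::current(kNumStripes);
///     counters[stripe].value.fetch_add(1, std::memory_order_relaxed);
///   }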

/**
 * An allocator that can be used with AccessSpreader to allocate core-local
 * memory.
 *
 * There is actually nothing special about the memory itself (it is not bound
 * to NUMA nodes or anything), but the allocator guarantees that memory
 * allocated from the same stripe will only come from cache lines also
 * allocated to the same stripe, for the given numStripes.  This means
 * multiple things using AccessSpreader can allocate memory in
 * smaller-than-cacheline increments and be assured that it won't cause more
 * false sharing than it otherwise would.
 *
 * Note that allocation and deallocation take a per-size-class lock.
 *
 * Memory allocated with coreMalloc() must be freed with coreFree().
 */
void* coreMalloc(size_t size, size_t numStripes, size_t stripe);
void coreFree(void* ptr);

namespace detail {
void* coreMallocFromGuard(size_t size);
} // namespace detail

/**
 * A C++ allocator adapter for coreMalloc()/coreFree().  The allocator is
 * stateless, to avoid increasing the footprint of the container that uses it,
 * so the stripe needs to be passed out of band: allocate() can only be called
 * while there is an active CoreAllocatorGuard.  deallocate() can instead be
 * called at any point.
 *
 * This makes CoreAllocator unsuitable for containers that can grow; it is
 * meant for containers where all allocations happen at construction time.
 */
template <typename T>
class CoreAllocator : private std::allocator<T> { … };

class FOLLY_NODISCARD CoreAllocatorGuard { … };

} // namespace folly
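
// Example (an illustrative sketch, not part of this header): combining
// AccessSpreader with coreMalloc()/coreFree() so that each stripe's data
// lives on cache lines owned by that stripe.  kNumStripes and the slot setup
// are hypothetical; the coreMalloc/coreFree signatures are the ones declared
// above.
//
//   constexpr size_t kNumStripes = 8;
//   std::array<std::atomic<uint64_t>*, kNumStripes> slots;
//
//   for (size_t stripe = 0; stripe < kNumStripes; ++stripe) {
//     // Memory handed out for a given (numStripes, stripe) pair shares cache
//     // lines only with other allocations made for that same stripe.
//     void* mem = folly::coreMalloc(
//         sizeof(std::atomic<uint64_t>), kNumStripes, stripe);
//     slots[stripe] = new (mem) std::atomic<uint64_t>(0);
//   }
//
//   // Writers pick their slot with AccessSpreader, as in the earlier example:
//   //   slots[folly::AccessSpreader<>::current(kNumStripes)]->fetch_add(1);
//
//   for (auto* slot : slots) {
//     slot->~atomic();        // placement-new'd objects are destroyed manually
//     folly::coreFree(slot);  // coreMalloc'd memory must go back through coreFree
//   }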