/* * Copyright (c) Meta Platforms, Inc. and affiliates. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include <stdint.h> #include <atomic> #include <chrono> #include <memory> #include <mutex> #include <shared_mutex> #include <thread> #include <type_traits> #include <utility> #include <folly/CPortability.h> #include <folly/CppAttributes.h> #include <folly/Likely.h> #include <folly/chrono/Hardware.h> #include <folly/concurrency/CacheLocality.h> #include <folly/detail/Futex.h> #include <folly/portability/Asm.h> #include <folly/synchronization/Lock.h> #include <folly/synchronization/RelaxedAtomic.h> #include <folly/synchronization/SanitizeThread.h> #include <folly/system/ThreadId.h> // SharedMutex is a reader-writer lock. It is small, very fast, scalable // on multi-core, and suitable for use when readers or writers may block. // Unlike most other reader-writer locks, its throughput with concurrent // readers scales linearly; it is able to acquire and release the lock // in shared mode without cache line ping-ponging. It is suitable for // a wide range of lock hold times because it starts with spinning, // proceeds to using sched_yield with a preemption heuristic, and then // waits using futex and precise wakeups. // // SharedMutex provides all of the methods of folly::RWSpinLock, // boost::shared_mutex, boost::upgrade_mutex, and C++14's // std::shared_timed_mutex. All operations that can block are available // in try, try-for, and try-until (system_clock or steady_clock) versions. // // SharedMutexReadPriority gives priority to readers, // SharedMutexWritePriority gives priority to writers. SharedMutex is an // alias for SharedMutexWritePriority, because writer starvation is more // likely than reader starvation for the read-heavy workloads targeted // by SharedMutex. // // In my tests SharedMutex is as good or better than the other // reader-writer locks in use at Facebook for almost all use cases, // sometimes by a wide margin. (If it is rare that there are actually // concurrent readers then RWSpinLock can be a few nanoseconds faster.) // I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64, // boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses // spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block. // (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.) // It is generally as good or better than the rest when evaluating size, // speed, scalability, or latency outliers. In the corner cases where // it is not the fastest (such as single-threaded use or heavy write // contention) it is never very much worse than the best. See the bottom // of folly/test/SharedMutexTest.cpp for lots of microbenchmark results. // // Comparison to folly::RWSpinLock: // // * SharedMutex is faster than RWSpinLock when there are actually // concurrent read accesses (sometimes much faster), and ~5 nanoseconds // slower when there is not actually any contention. SharedMutex is // faster in every (benchmarked) scenario where the shared mode of // the lock is actually useful. // // * Concurrent shared access to SharedMutex scales linearly, while total // RWSpinLock throughput drops as more threads try to access the lock // in shared mode. Under very heavy read contention SharedMutex can // be two orders of magnitude faster than RWSpinLock (or any reader // writer lock that doesn't use striping or deferral). // // * SharedMutex can safely protect blocking calls, because after an // initial period of spinning it waits using futex(). // // * RWSpinLock prioritizes readers, SharedMutex has both reader- and // writer-priority variants, but defaults to write priority. // // * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's // doesn't. Both semantics are reasonable. The boost documentation // doesn't explicitly talk about this behavior (except by omitting // any statement that those lock modes conflict), but the boost // implementations do allow new readers while the upgradeable mode // is held. See https://github.com/boostorg/thread/blob/master/ // include/boost/thread/pthread/shared_mutex.hpp // // Both SharedMutex and RWSpinLock provide "exclusive", "upgrade", // and "shared" modes. At all times num_threads_holding_exclusive + // num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive == // 0 || num_threads_holding_shared == 0. RWSpinLock has the additional // constraint that num_threads_holding_shared cannot increase while // num_threads_holding_upgrade is non-zero. // // Comparison to the internal RWLock: // // * SharedMutex doesn't allow a maximum reader count to be configured, // so it can't be used as a semaphore in the same way as RWLock. // // * SharedMutex is 4 bytes, RWLock is 256. // // * SharedMutex is as fast or faster than RWLock in all of my // microbenchmarks, and has positive rather than negative scalability. // // * RWLock and SharedMutex are both writer priority locks. // // * SharedMutex avoids latency outliers as well as RWLock. // // * SharedMutex uses different names (t != 0 below): // // RWLock::lock(0) => SharedMutex::lock() // // RWLock::lock(t) => SharedMutex::try_lock_for(milliseconds(t)) // // RWLock::tryLock() => SharedMutex::try_lock() // // RWLock::unlock() => SharedMutex::unlock() // // RWLock::enter(0) => SharedMutex::lock_shared() // // RWLock::enter(t) => // SharedMutex::try_lock_shared_for(milliseconds(t)) // // RWLock::tryEnter() => SharedMutex::try_lock_shared() // // RWLock::leave() => SharedMutex::unlock_shared() // // * RWLock allows the reader count to be adjusted by a value other // than 1 during enter() or leave(). SharedMutex doesn't currently // implement this feature. // // * RWLock's methods are marked const, SharedMutex's aren't. // // Reader-writer locks have the potential to allow concurrent access // to shared read-mostly data, but in practice they often provide no // improvement over a mutex. The problem is the cache coherence protocol // of modern CPUs. Coherence is provided by making sure that when a cache // line is written it is present in only one core's cache. Since a memory // write is required to acquire a reader-writer lock in shared mode, the // cache line holding the lock is invalidated in all of the other caches. // This leads to cache misses when another thread wants to acquire or // release the lock concurrently. When the RWLock is colocated with the // data it protects (common), cache misses can also continue occur when // a thread that already holds the lock tries to read the protected data. // // Ideally, a reader-writer lock would allow multiple cores to acquire // and release the lock in shared mode without incurring any cache misses. // This requires that each core records its shared access in a cache line // that isn't read or written by other read-locking cores. (Writers will // have to check all of the cache lines.) Typical server hardware when // this comment was written has 16 L1 caches and cache lines of 64 bytes, // so a lock striped over all L1 caches would occupy a prohibitive 1024 // bytes. Nothing says that we need a separate set of per-core memory // locations for each lock, however. Each SharedMutex instance is only // 4 bytes, but all locks together share a 2K area in which they make a // core-local record of lock acquisitions. // // SharedMutex's strategy of using a shared set of core-local stripes has // a potential downside, because it means that acquisition of any lock in // write mode can conflict with acquisition of any lock in shared mode. // If a lock instance doesn't actually experience concurrency then this // downside will outweight the upside of improved scalability for readers. // To avoid this problem we dynamically detect concurrent accesses to // SharedMutex, and don't start using the deferred mode unless we actually // observe concurrency. See kNumSharedToStartDeferring. // // It is explicitly allowed to call unlock_shared() from a different // thread than lock_shared(), so long as they are properly paired. // unlock_shared() needs to find the location at which lock_shared() // recorded the lock, which might be in the lock itself or in any of // the shared slots. If you can conveniently pass state from lock // acquisition to release then the fastest mechanism is to std::move // the std::shared_lock instance or an SharedMutex::Token (using // lock_shared(Token&) and unlock_shared(Token&)). The guard or token // will tell unlock_shared where in deferredReaders[] to look for the // deferred lock. The Token-less version of unlock_shared() works in all // cases, but is optimized for the common (no inter-thread handoff) case. // // In both read- and write-priority mode, a waiting lock() (exclusive mode) // only blocks readers after it has waited for an active upgrade lock to be // released; until the upgrade lock is released (or upgraded or downgraded) // readers will still be able to enter. Preferences about lock acquisition // are not guaranteed to be enforced perfectly (even if they were, there // is theoretically the chance that a thread could be arbitrarily suspended // between calling lock() and SharedMutex code actually getting executed). // // try_*_for methods always try at least once, even if the duration // is zero or negative. The duration type must be compatible with // std::chrono::steady_clock. try_*_until methods also always try at // least once. std::chrono::system_clock and std::chrono::steady_clock // are supported. // // If you have observed by profiling that your SharedMutex-s are getting // cache misses on deferredReaders[] due to another SharedMutex user, then // you can use the tag type to create your own instantiation of the type. // The contention threshold (see kNumSharedToStartDeferring) should make // this unnecessary in all but the most extreme cases. Make sure to check // that the increased icache and dcache footprint of the tagged result is // worth it. // SharedMutex's use of thread local storage is an optimization, so // for the case where thread local storage is not supported, define it // away. // Note about TSAN (ThreadSanitizer): the SharedMutexWritePriority version // (the default) of this mutex is annotated appropriately so that TSAN can // perform lock inversion analysis. However, the SharedMutexReadPriority version // is not annotated. This is because TSAN's lock order heuristic // assumes that two calls to lock_shared must be ordered, which leads // to too many false positives for the reader-priority case. // // Suppose thread A holds a SharedMutexWritePriority lock in shared mode and an // independent thread B is waiting for exclusive access. Then a thread C's // lock_shared can't proceed until A has released the lock. Discounting // situations that never use exclusive mode (so no lock is necessary at all) // this means that without higher-level reasoning it is not safe to ignore // reader <-> reader interactions. // // This reasoning does not apply to SharedMutexReadPriority, because there are // no actions by a thread B that can make C need to wait for A. Since the // overwhelming majority of SharedMutex instances use write priority, we // restrict the TSAN annotations to only SharedMutexWritePriority. namespace folly { struct SharedMutexToken { … }; struct SharedMutexPolicyDefault { … }; namespace shared_mutex_detail { struct PolicyTracked : SharedMutexPolicyDefault { … }; struct PolicySuppressTSAN : SharedMutexPolicyDefault { … }; // Returns a guard that gives permission for the current thread to // annotate, and adjust the annotation bits in, the SharedMutex at ptr. std::unique_lock<std::mutex> annotationGuard(void* ptr); constexpr uint32_t kMaxDeferredReadersAllocated = …; [[FOLLY_ATTR_GNU_COLD]] uint32_t getMaxDeferredReadersSlow( relaxed_atomic<uint32_t>& cache); long getCurrentThreadInvoluntaryContextSwitchCount(); // kMaxDeferredReaders FOLLY_EXPORT FOLLY_ALWAYS_INLINE uint32_t getMaxDeferredReaders() { … } class NopOwnershipTracker { … }; class ThreadIdOwnershipTracker { … }; } // namespace shared_mutex_detail template < bool ReaderPriority, typename Tag_ = void, template <typename> class Atom = std::atomic, typename Policy = SharedMutexPolicyDefault> class SharedMutexImpl : std::conditional_t< Policy::track_thread_id, shared_mutex_detail::ThreadIdOwnershipTracker, shared_mutex_detail::NopOwnershipTracker> { … }; SharedMutexReadPriority; SharedMutexWritePriority; SharedMutex; SharedMutexTracked; SharedMutexSuppressTSAN; // Prevent the compiler from instantiating these in other translation units. // They are instantiated once in SharedMutex.cpp extern template class SharedMutexImpl<true>; extern template class SharedMutexImpl<false>; template < bool ReaderPriority, typename Tag_, template <typename> class Atom, typename Policy> alignas(hardware_destructive_interference_size) typename SharedMutexImpl<ReaderPriority, Tag_, Atom, Policy>:: DeferredReaderSlot SharedMutexImpl<ReaderPriority, Tag_, Atom, Policy>::deferredReaders [shared_mutex_detail::kMaxDeferredReadersAllocated * kDeferredSeparationFactor] = …; template < bool ReaderPriority, typename Tag_, template <typename> class Atom, typename Policy> bool SharedMutexImpl<ReaderPriority, Tag_, Atom, Policy>:: tryUnlockTokenlessSharedDeferred() { … } template < bool ReaderPriority, typename Tag_, template <typename> class Atom, typename Policy> template <class WaitContext> bool SharedMutexImpl<ReaderPriority, Tag_, Atom, Policy>::lockSharedImpl( uint32_t& state, Token* token, WaitContext& ctx) { … } namespace shared_mutex_detail { [[noreturn]] void throwOperationNotPermitted(); [[noreturn]] void throwDeadlockWouldOccur(); } // namespace shared_mutex_detail } // namespace folly // std::shared_lock specialization for folly::SharedMutex to leverage tokenful // version of unlock_shared for faster unlocking. namespace std { shared_lock< ::folly::SharedMutexImpl<ReaderPriority, Tag_, Atom, Policy>>; } // namespace std