SharedMutex.h | Explore in Territory

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <stdint.h>

#include <atomic>
#include <chrono>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <thread>
#include <type_traits>
#include <utility>

#include <folly/CPortability.h>
#include <folly/CppAttributes.h>
#include <folly/Likely.h>
#include <folly/chrono/Hardware.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/detail/Futex.h>
#include <folly/portability/Asm.h>
#include <folly/synchronization/Lock.h>
#include <folly/synchronization/RelaxedAtomic.h>
#include <folly/synchronization/SanitizeThread.h>
#include <folly/system/ThreadId.h>

// SharedMutex is a reader-writer lock.  It is small, very fast, scalable
// on multi-core, and suitable for use when readers or writers may block.
// Unlike most other reader-writer locks, its throughput with concurrent
// readers scales linearly; it is able to acquire and release the lock
// in shared mode without cache line ping-ponging.  It is suitable for
// a wide range of lock hold times because it starts with spinning,
// proceeds to using sched_yield with a preemption heuristic, and then
// waits using futex and precise wakeups.
//
// SharedMutex provides all of the methods of folly::RWSpinLock,
// boost::shared_mutex, boost::upgrade_mutex, and C++14's
// std::shared_timed_mutex.  All operations that can block are available
// in try, try-for, and try-until (system_clock or steady_clock) versions.
//
// SharedMutexReadPriority gives priority to readers,
// SharedMutexWritePriority gives priority to writers.  SharedMutex is an
// alias for SharedMutexWritePriority, because writer starvation is more
// likely than reader starvation for the read-heavy workloads targeted
// by SharedMutex.
//
// In my tests SharedMutex is as good or better than the other
// reader-writer locks in use at Facebook for almost all use cases,
// sometimes by a wide margin.  (If it is rare that there are actually
// concurrent readers then RWSpinLock can be a few nanoseconds faster.)
// I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
// boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses
// spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
// (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
// It is generally as good or better than the rest when evaluating size,
// speed, scalability, or latency outliers.  In the corner cases where
// it is not the fastest (such as single-threaded use or heavy write
// contention) it is never very much worse than the best.  See the bottom
// of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
//
// Comparison to folly::RWSpinLock:
//
//  * SharedMutex is faster than RWSpinLock when there are actually
//    concurrent read accesses (sometimes much faster), and ~5 nanoseconds
//    slower when there is not actually any contention.  SharedMutex is
//    faster in every (benchmarked) scenario where the shared mode of
//    the lock is actually useful.
//
//  * Concurrent shared access to SharedMutex scales linearly, while total
//    RWSpinLock throughput drops as more threads try to access the lock
//    in shared mode.  Under very heavy read contention SharedMutex can
//    be two orders of magnitude faster than RWSpinLock (or any reader
//    writer lock that doesn't use striping or deferral).
//
//  * SharedMutex can safely protect blocking calls, because after an
//    initial period of spinning it waits using futex().
//
//  * RWSpinLock prioritizes readers, SharedMutex has both reader- and
//    writer-priority variants, but defaults to write priority.
//
//  * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
//    doesn't.  Both semantics are reasonable.  The boost documentation
//    doesn't explicitly talk about this behavior (except by omitting
//    any statement that those lock modes conflict), but the boost
//    implementations do allow new readers while the upgradeable mode
//    is held.  See https://github.com/boostorg/thread/blob/master/
//      include/boost/thread/pthread/shared_mutex.hpp
//
// Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
// and "shared" modes.  At all times num_threads_holding_exclusive +
// num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
// 0 || num_threads_holding_shared == 0.  RWSpinLock has the additional
// constraint that num_threads_holding_shared cannot increase while
// num_threads_holding_upgrade is non-zero.
//
// Comparison to the internal RWLock:
//
//  * SharedMutex doesn't allow a maximum reader count to be configured,
//    so it can't be used as a semaphore in the same way as RWLock.
//
//  * SharedMutex is 4 bytes, RWLock is 256.
//
//  * SharedMutex is as fast or faster than RWLock in all of my
//    microbenchmarks, and has positive rather than negative scalability.
//
//  * RWLock and SharedMutex are both writer priority locks.
//
//  * SharedMutex avoids latency outliers as well as RWLock.
//
//  * SharedMutex uses different names (t != 0 below):
//
//    RWLock::lock(0)    => SharedMutex::lock()
//
//    RWLock::lock(t)    => SharedMutex::try_lock_for(milliseconds(t))
//
//    RWLock::tryLock()  => SharedMutex::try_lock()
//
//    RWLock::unlock()   => SharedMutex::unlock()
//
//    RWLock::enter(0)   => SharedMutex::lock_shared()
//
//    RWLock::enter(t)   =>
//        SharedMutex::try_lock_shared_for(milliseconds(t))
//
//    RWLock::tryEnter() => SharedMutex::try_lock_shared()
//
//    RWLock::leave()    => SharedMutex::unlock_shared()
//
//  * RWLock allows the reader count to be adjusted by a value other
//    than 1 during enter() or leave(). SharedMutex doesn't currently
//    implement this feature.
//
//  * RWLock's methods are marked const, SharedMutex's aren't.
//
// Reader-writer locks have the potential to allow concurrent access
// to shared read-mostly data, but in practice they often provide no
// improvement over a mutex.  The problem is the cache coherence protocol
// of modern CPUs.  Coherence is provided by making sure that when a cache
// line is written it is present in only one core's cache.  Since a memory
// write is required to acquire a reader-writer lock in shared mode, the
// cache line holding the lock is invalidated in all of the other caches.
// This leads to cache misses when another thread wants to acquire or
// release the lock concurrently.  When the RWLock is colocated with the
// data it protects (common), cache misses can also continue occur when
// a thread that already holds the lock tries to read the protected data.
//
// Ideally, a reader-writer lock would allow multiple cores to acquire
// and release the lock in shared mode without incurring any cache misses.
// This requires that each core records its shared access in a cache line
// that isn't read or written by other read-locking cores.  (Writers will
// have to check all of the cache lines.)  Typical server hardware when
// this comment was written has 16 L1 caches and cache lines of 64 bytes,
// so a lock striped over all L1 caches would occupy a prohibitive 1024
// bytes.  Nothing says that we need a separate set of per-core memory
// locations for each lock, however.  Each SharedMutex instance is only
// 4 bytes, but all locks together share a 2K area in which they make a
// core-local record of lock acquisitions.
//
// SharedMutex's strategy of using a shared set of core-local stripes has
// a potential downside, because it means that acquisition of any lock in
// write mode can conflict with acquisition of any lock in shared mode.
// If a lock instance doesn't actually experience concurrency then this
// downside will outweight the upside of improved scalability for readers.
// To avoid this problem we dynamically detect concurrent accesses to
// SharedMutex, and don't start using the deferred mode unless we actually
// observe concurrency.  See kNumSharedToStartDeferring.
//
// It is explicitly allowed to call unlock_shared() from a different
// thread than lock_shared(), so long as they are properly paired.
// unlock_shared() needs to find the location at which lock_shared()
// recorded the lock, which might be in the lock itself or in any of
// the shared slots.  If you can conveniently pass state from lock
// acquisition to release then the fastest mechanism is to std::move
// the std::shared_lock instance or an SharedMutex::Token (using
// lock_shared(Token&) and unlock_shared(Token&)).  The guard or token
// will tell unlock_shared where in deferredReaders[] to look for the
// deferred lock.  The Token-less version of unlock_shared() works in all
// cases, but is optimized for the common (no inter-thread handoff) case.
//
// In both read- and write-priority mode, a waiting lock() (exclusive mode)
// only blocks readers after it has waited for an active upgrade lock to be
// released; until the upgrade lock is released (or upgraded or downgraded)
// readers will still be able to enter.  Preferences about lock acquisition
// are not guaranteed to be enforced perfectly (even if they were, there
// is theoretically the chance that a thread could be arbitrarily suspended
// between calling lock() and SharedMutex code actually getting executed).
//
// try_*_for methods always try at least once, even if the duration
// is zero or negative.  The duration type must be compatible with
// std::chrono::steady_clock.  try_*_until methods also always try at
// least once.  std::chrono::system_clock and std::chrono::steady_clock
// are supported.
//
// If you have observed by profiling that your SharedMutex-s are getting
// cache misses on deferredReaders[] due to another SharedMutex user, then
// you can use the tag type to create your own instantiation of the type.
// The contention threshold (see kNumSharedToStartDeferring) should make
// this unnecessary in all but the most extreme cases.  Make sure to check
// that the increased icache and dcache footprint of the tagged result is
// worth it.

// SharedMutex's use of thread local storage is an optimization, so
// for the case where thread local storage is not supported, define it
// away.

// Note about TSAN (ThreadSanitizer): the SharedMutexWritePriority version
// (the default) of this mutex is annotated appropriately so that TSAN can
// perform lock inversion analysis. However, the SharedMutexReadPriority version
// is not annotated.  This is because TSAN's lock order heuristic
// assumes that two calls to lock_shared must be ordered, which leads
// to too many false positives for the reader-priority case.
//
// Suppose thread A holds a SharedMutexWritePriority lock in shared mode and an
// independent thread B is waiting for exclusive access. Then a thread C's
// lock_shared can't proceed until A has released the lock. Discounting
// situations that never use exclusive mode (so no lock is necessary at all)
// this means that without higher-level reasoning it is not safe to ignore
// reader <-> reader interactions.
//
// This reasoning does not apply to SharedMutexReadPriority, because there are
// no actions by a thread B that can make C need to wait for A. Since the
// overwhelming majority of SharedMutex instances use write priority, we
// restrict the TSAN annotations to only SharedMutexWritePriority.

namespace folly {

struct SharedMutexToken { … };

struct SharedMutexPolicyDefault { … };

namespace shared_mutex_detail {

struct PolicyTracked : SharedMutexPolicyDefault { … };
struct PolicySuppressTSAN : SharedMutexPolicyDefault { … };

// Returns a guard that gives permission for the current thread to
// annotate, and adjust the annotation bits in, the SharedMutex at ptr.
std::unique_lock<std::mutex> annotationGuard(void* ptr);

constexpr uint32_t kMaxDeferredReadersAllocated = …;

[[FOLLY_ATTR_GNU_COLD]] uint32_t getMaxDeferredReadersSlow(
    relaxed_atomic<uint32_t>& cache);

long getCurrentThreadInvoluntaryContextSwitchCount();

// kMaxDeferredReaders
FOLLY_EXPORT FOLLY_ALWAYS_INLINE uint32_t getMaxDeferredReaders() { … }

class NopOwnershipTracker { … };

class ThreadIdOwnershipTracker { … };
} // namespace shared_mutex_detail

template <
    bool ReaderPriority,
    typename Tag_ = void,
    template <typename> class Atom = std::atomic,
    typename Policy = SharedMutexPolicyDefault>
class SharedMutexImpl : std::conditional_t<
                            Policy::track_thread_id,
                            shared_mutex_detail::ThreadIdOwnershipTracker,
                            shared_mutex_detail::NopOwnershipTracker> { … };

SharedMutexReadPriority;
SharedMutexWritePriority;
SharedMutex;
SharedMutexTracked;
SharedMutexSuppressTSAN;

// Prevent the compiler from instantiating these in other translation units.
// They are instantiated once in SharedMutex.cpp
extern template class SharedMutexImpl<true>;
extern template class SharedMutexImpl<false>;

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename>
    class Atom,
    typename Policy>
alignas(hardware_destructive_interference_size)
    typename SharedMutexImpl<ReaderPriority, Tag_, Atom, Policy>::
        DeferredReaderSlot
    SharedMutexImpl<ReaderPriority, Tag_, Atom, Policy>::deferredReaders
        [shared_mutex_detail::kMaxDeferredReadersAllocated *
         kDeferredSeparationFactor] = …;

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename>
    class Atom,
    typename Policy>
bool SharedMutexImpl<ReaderPriority, Tag_, Atom, Policy>::
    tryUnlockTokenlessSharedDeferred() { … }

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename>
    class Atom,
    typename Policy>
template <class WaitContext>
bool SharedMutexImpl<ReaderPriority, Tag_, Atom, Policy>::lockSharedImpl(
    uint32_t& state, Token* token, WaitContext& ctx) { … }

namespace shared_mutex_detail {

[[noreturn]] void throwOperationNotPermitted();

[[noreturn]] void throwDeadlockWouldOccur();

} // namespace shared_mutex_detail

} // namespace folly

// std::shared_lock specialization for folly::SharedMutex to leverage tokenful
// version of unlock_shared for faster unlocking.
namespace std {

shared_lock< ::folly::SharedMutexImpl<ReaderPriority, Tag_, Atom, Policy>>;

} // namespace std
folly/folly/SharedMutex.h