folly/folly/synchronization/RWSpinLock.h

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * N.B. You most likely do _not_ want to use RWSpinLock or any other
 * kind of spinlock.  Use SharedMutex instead.
 *
 * In short, spinlocks in preemptive multi-tasking operating systems
 * have serious problems and fast mutexes like SharedMutex are almost
 * certainly the better choice, because letting the OS scheduler put a
 * thread to sleep is better for system responsiveness and throughput
 * than wasting a timeslice repeatedly querying a lock held by a
 * thread that's blocked, and you can't prevent userspace
 * programs blocking.
 *
 * Spinlocks in an operating system kernel make much more sense than
 * they do in userspace.
 *
 * -------------------------------------------------------------------
 *
 * Two Read-Write spin lock implementations.
 *
 *  Ref: http://locklessinc.com/articles/locks
 *
 *  Both locks here are faster than pthread_rwlock and have very low
 *  overhead (usually 20-30ns).  They don't use any system mutexes and
 *  are very compact (4/8 bytes), so are suitable for per-instance
 *  based locking, particularly when contention is not expected.
 *
 *  For a spinlock, RWSpinLock is a reasonable choice.  (See the note
 *  about for why a spin lock is frequently a bad idea generally.)
 *  RWSpinLock has minimal overhead, and comparable contention
 *  performance when the number of competing threads is less than or
 *  equal to the number of logical CPUs.  Even as the number of
 *  threads gets larger, RWSpinLock can still be very competitive in
 *  READ, although it is slower on WRITE, and also inherently unfair
 *  to writers.
 *
 *  RWTicketSpinLock shows more balanced READ/WRITE performance.  If
 *  your application really needs a lot more threads, and a
 *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 *  Caveats:
 *
 *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *    based systems.
 *
 *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *    readers and writers.
 *
 *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *    readers and writers.
 *
 *    RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
 *    writer priority) is NOT reentrant, even for lock_shared().
 *
 *    The lock will not grant any new shared (read) accesses while a thread
 *    attempting to acquire the lock in write mode is blocked. (That is,
 *    if the lock is held in shared mode by N threads, and a thread attempts
 *    to acquire it in write mode, no one else can acquire it in shared mode
 *    until these N threads release the lock and then the blocked thread
 *    acquires and releases the exclusive lock.) This also applies for
 *    attempts to reacquire the lock in shared mode by threads that already
 *    hold it in shared mode, making the lock non-reentrant.
 *
 *    RWSpinLock handles 2^30 - 1 concurrent readers.
 */

#pragma once

/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU  L5630  @ 2.13GHz)  8 cores(16 HTs)
========================================================================

------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                    Iters   Total t    t/iter iter/sec
-------------------------------------------------------------------------------
*      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
+30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
+85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
+96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
+85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
+96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
+ 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
+ 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M

------------------------------------------------------------------------------
2. Contention Benchmark      90% read  10% write
Benchmark                    hits       average    min       max        sigma
------------------------------------------------------------------------------
---------- 8  threads ------------
RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us

---------- 16 threads ------------
RWSpinLock       Write       124236     237ns      78ns      261us      801ns
RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us

---------- 50 threads ------------
RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
RWTicketSpinLock Read        744133     386ns      78ns        11ms     31.4us
pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us

*/

#include <folly/Portability.h>
#include <folly/portability/Asm.h>

#if defined(__GNUC__) && (defined(__i386) || FOLLY_X64 || defined(ARCH_K8))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#elif defined(_MSC_VER) && defined(FOLLY_X64)
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#elif FOLLY_AARCH64
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

// iOS doesn't define _mm_cvtsi64_si128 and friends
#if (FOLLY_SSE >= 2) && !FOLLY_MOBILE && FOLLY_X64
#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#else
#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#endif

#include <algorithm>
#include <atomic>
#include <thread>

#include <folly/Likely.h>
#include <folly/synchronization/Lock.h>

namespace folly {

/*
 * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance.  If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 * interfaces.
 */
class RWSpinLock {};

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented based on GCC intrinsics.

namespace detail {
template <size_t kBitWidth>
struct RWTicketIntTrait {};

template <>
struct RWTicketIntTrait<64> {};

template <>
struct RWTicketIntTrait<32> {};
} // namespace detail

template <size_t kBitWidth, bool kFavorWriter = false>
class RWTicketSpinLockT {};

RWTicketSpinLock32;
RWTicketSpinLock64;

#endif // RW_SPINLOCK_USE_X86_INTRINSIC_

} // namespace folly

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif