folly/folly/concurrency/UnboundedQueue.h

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <atomic>
#include <chrono>
#include <memory>

#include <glog/logging.h>

#include <folly/ConstexprMath.h>
#include <folly/Optional.h>
#include <folly/Traits.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/lang/Align.h>
#include <folly/synchronization/Hazptr.h>
#include <folly/synchronization/SaturatingSemaphore.h>
#include <folly/synchronization/WaitOptions.h>
#include <folly/synchronization/detail/Spin.h>

namespace folly {

/// UnboundedQueue supports a variety of options for unbounded
/// dynamically expanding an shrinking queues, including variations of:
/// - Single vs. multiple producers
/// - Single vs. multiple consumers
/// - Blocking vs. spin-waiting
/// - Non-waiting, timed, and waiting consumer operations.
/// Producer operations never wait or fail (unless out-of-memory).
///
/// Template parameters:
/// - T: element type
/// - SingleProducer: true if there can be only one producer at a
///   time.
/// - SingleConsumer: true if there can be only one consumer at a
///   time.
/// - MayBlock: true if consumers may block, false if they only
///   spin. A performance tuning parameter.
/// - LgSegmentSize (default 8): Log base 2 of number of elements per
///   segment. A performance tuning parameter. See below.
/// - LgAlign (default 7): Log base 2 of alignment directive; can be
///   used to balance scalability (avoidance of false sharing) with
///   memory efficiency.
///
/// When to use UnboundedQueue:
/// - If a small bound may lead to deadlock or performance degradation
///   under bursty patterns.
/// - If there is no risk of the queue growing too much.
///
/// When not to use UnboundedQueue:
/// - If there is risk of the queue growing too much and a large bound
///   is acceptable, then use DynamicBoundedQueue.
/// - If the queue must not allocate on enqueue or it must have a
///   small bound, then use fixed-size MPMCQueue or (if non-blocking
///   SPSC) ProducerConsumerQueue.
///
/// Template Aliases:
///   USPSCQueue<T, MayBlock, LgSegmentSize, LgAlign>
///   UMPSCQueue<T, MayBlock, LgSegmentSize, LgAlign>
///   USPMCQueue<T, MayBlock, LgSegmentSize, LgAlign>
///   UMPMCQueue<T, MayBlock, LgSegmentSize, LgAlign>
///
/// Functions:
///   Producer operations never wait or fail (unless OOM)
///     void enqueue(const T&);
///     void enqueue(T&&);
///         Adds an element to the end of the queue.
///
///   Consumer operations:
///     void dequeue(T&);
///     T dequeue();
///         Extracts an element from the front of the queue. Waits
///         until an element is available if needed.
///     bool try_dequeue(T&);
///     folly::Optional<T> try_dequeue();
///         Tries to extract an element from the front of the queue
///         if available.
///     bool try_dequeue_until(T&, time_point& deadline);
///     folly::Optional<T> try_dequeue_until(time_point& deadline);
///         Tries to extract an element from the front of the queue
///         if available until the specified deadline.
///     bool try_dequeue_for(T&, duration&);
///     folly::Optional<T> try_dequeue_for(duration&);
///         Tries to extract an element from the front of the queue if
///         available until the expiration of the specified duration.
///     const T* try_peek();
///         Returns pointer to the element at the front of the queue
///         if available, or nullptr if the queue is empty. Only for
///         SPSC and MPSC.
///
///   Secondary functions:
///     size_t size();
///         Returns an estimate of the size of the queue.
///     bool empty();
///         Returns true only if the queue was empty during the call.
///     Note: size() and empty() are guaranteed to be accurate only if
///     the queue is not changed concurrently.
///
/// Usage examples:
/// @code
///   /* UMPSC, doesn't block, 1024 int elements per segment */
///   UMPSCQueue<int, false, 10> q;
///   q.enqueue(1);
///   q.enqueue(2);
///   q.enqueue(3);
///   ASSERT_FALSE(q.empty());
///   ASSERT_EQ(q.size(), 3);
///   int v;
///   q.dequeue(v);
///   ASSERT_EQ(v, 1);
///   ASSERT_TRUE(try_dequeue(v));
///   ASSERT_EQ(v, 2);
///   ASSERT_TRUE(try_dequeue_until(v, now() + seconds(1)));
///   ASSERT_EQ(v, 3);
///   ASSERT_TRUE(q.empty());
///   ASSERT_EQ(q.size(), 0);
///   ASSERT_FALSE(try_dequeue(v));
///   ASSERT_FALSE(try_dequeue_for(v, microseconds(100)));
/// @endcode
///
/// Design:
/// - The queue is composed of one or more segments. Each segment has
///   a fixed size of 2^LgSegmentSize entries. Each segment is used
///   exactly once.
/// - Each entry is composed of a futex and a single element.
/// - Each segment's array of entries is strided to avoid false sharing.
///   I.e., to reduce any cacheline contention that might be induced by
///   concurrent mutations to the queue that might happen to affect
///   otherwise-adjacent locations that might happen to share cacheline.
/// - The queue contains two 64-bit ticket variables. The producer
///   ticket counts the number of producer tickets issued so far, and
///   the same for the consumer ticket. Each ticket number corresponds
///   to a specific entry in a specific segment.
/// - The queue maintains two pointers, head and tail. Head points to
///   the segment that corresponds to the current consumer
///   ticket. Similarly, tail pointer points to the segment that
///   corresponds to the producer ticket.
/// - Segments are organized as a singly linked list.
/// - The producer with the first ticket in the current producer
///   segment has primary responsibility for allocating and linking
///   the next segment. Other producers and connsumers may help do so
///   when needed if that thread is delayed.
/// - The producer with the last ticket in the current producer
///   segment is primarily responsible for advancing the tail pointer
///   to the next segment. Other producers and consumers may help do
///   so when needed if that thread is delayed.
/// - Similarly, the consumer with the last ticket in the current
///   consumer segment is primarily responsible for advancing the head
///   pointer to the next segment. Other consumers may help do so when
///   needed if that thread is delayed.
/// - The tail pointer must not lag behind the head pointer.
///   Otherwise, the algorithm cannot be certain about the removal of
///   segment and would have to incur higher costs to ensure safe
///   reclamation.  Consumers must ensure that head never overtakes
///   tail.
///
/// Memory Usage:
/// - An empty queue contains one segment. A nonempty queue contains
///   one or two more segment than fits its contents.
/// - Removed segments are not reclaimed until there are no threads,
///   producers or consumers, with references to them or their
///   predecessors. That is, a lagging thread may delay the reclamation
///   of a chain of removed segments.
/// - The template parameter LgAlign can be used to reduce memory usage
///   at the cost of increased chance of false sharing.
///
/// Performance considerations:
/// - All operations take constant time, excluding the costs of
///   allocation, reclamation, interference from other threads, and
///   waiting for actions by other threads.
/// - In general, using the single producer and or single consumer
///   variants yield better performance than the MP and MC
///   alternatives.
/// - SPSC without blocking is the fastest configuration. It doesn't
///   include any read-modify-write atomic operations, full fences, or
///   system calls in the critical path.
/// - MP adds a fetch_add to the critical path of each producer operation.
/// - MC adds a fetch_add or compare_exchange to the critical path of
///   each consumer operation.
/// - The possibility of consumers blocking, even if they never do,
///   adds a compare_exchange to the critical path of each producer
///   operation.
/// - MPMC, SPMC, MPSC require the use of a deferred reclamation
///   mechanism to guarantee that segments removed from the linked
///   list, i.e., unreachable from the head pointer, are reclaimed
///   only after they are no longer needed by any lagging producers or
///   consumers.
/// - The overheads of segment allocation and reclamation are intended
///   to be mostly out of the critical path of the queue's throughput.
/// - If the template parameter LgSegmentSize is changed, it should be
///   set adequately high to keep the amortized cost of allocation and
///   reclamation low.
/// - It is recommended to measure performance with different variants
///   when applicable, e.g., UMPMC vs UMPSC. Depending on the use
///   case, sometimes the variant with the higher sequential overhead
///   may yield better results due to, for example, more favorable
///   producer-consumer balance or favorable timing for avoiding
///   costly blocking.
///
/// Guarantees:
/// - The queues are linearizable:
///   - For two enqueue operations q(A) and q(B), if q(A) < q(B) in
///     the happens-before relation, then A precedes B in the queue.
///   - For two dequeue operations d(A) and d(B), if d(A) < d(B) in
///     the happens-before relation, then A preceded B in the queue.

template <
    typename T,
    bool SingleProducer,
    bool SingleConsumer,
    bool MayBlock,
    size_t LgSegmentSize = 8,
    size_t LgAlign = constexpr_log2(hardware_destructive_interference_size),
    template <typename> class Atom = std::atomic>
class UnboundedQueue {}; // UnboundedQueue

/* Aliases */

USPSCQueue;

UMPSCQueue;

USPMCQueue;

UMPMCQueue;

} // namespace folly