// SPDX-License-Identifier: GPL-2.0
/* kernel/rwsem.c: R/W semaphores, public implementation
 *
 * Written by David Howells ([email protected]).
 * Derived from asm-i386/semaphore.h
 *
 * Writer lock-stealing by Alex Shi <[email protected]>
 * and Michel Lespinasse <[email protected]>
 *
 * Optimistic spinning by Tim Chen <[email protected]>
 * and Davidlohr Bueso <[email protected]>. Based on mutexes.
 *
 * Rwsem count bit fields re-definition and rwsem rearchitecture by
 * Waiman Long <[email protected]> and
 * Peter Zijlstra <[email protected]>.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/signal.h>
#include <linux/sched/clock.h>
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>

#include <trace/events/lock.h>

#ifndef CONFIG_PREEMPT_RT
#include "lock_events.h"

/*
 * The least significant 2 bits of the owner value have the following
 * meanings when set.
 *  - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
 *  - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
 *
 * When the rwsem is reader-owned and a spinning writer has timed out,
 * the nonspinnable bit will be set to disable optimistic spinning.
 *
 * When a writer acquires a rwsem, it puts its task_struct pointer
 * into the owner field. It is cleared after an unlock.
 *
 * When a reader acquires a rwsem, it will also put its task_struct
 * pointer into the owner field with the RWSEM_READER_OWNED bit set.
 * On unlock, the owner field will largely be left untouched. So
 * for a free or reader-owned rwsem, the owner value may contain
 * information about the last reader that acquired the rwsem.
 *
 * That information may be helpful in debugging cases where the system
 * seems to hang on a reader-owned rwsem, especially if only one reader
 * is involved. Ideally we would like to track all the readers that own
 * a rwsem, but the overhead is simply too big.
 *
 * Fast-path reader optimistic lock stealing is supported when the rwsem
 * was previously owned by a writer and the following conditions are met:
 *  - the rwsem is not currently writer owned
 *  - the handoff bit isn't set.
 */
#define RWSEM_READER_OWNED	…
#define RWSEM_NONSPINNABLE	…
#define RWSEM_OWNER_FLAGS_MASK	…

#ifdef CONFIG_DEBUG_RWSEMS
#define DEBUG_RWSEMS_WARN_ON(c, sem)	…
#else
#define DEBUG_RWSEMS_WARN_ON	…
#endif

/*
 * On 64-bit architectures, the bit definitions of the count are:
 *
 * Bit  0    - writer locked bit
 * Bit  1    - waiters present bit
 * Bit  2    - lock handoff bit
 * Bits 3-7  - reserved
 * Bits 8-62 - 55-bit reader count
 * Bit  63   - read fail bit
 *
 * On 32-bit architectures, the bit definitions of the count are:
 *
 * Bit  0    - writer locked bit
 * Bit  1    - waiters present bit
 * Bit  2    - lock handoff bit
 * Bits 3-7  - reserved
 * Bits 8-30 - 23-bit reader count
 * Bit  31   - read fail bit
 *
 * It is not likely that the most significant bit (read fail bit) will ever
 * be set. This guard bit is still checked anyway in the down_read() fastpath
 * just in case we need to use up more of the reader bits for other purposes
 * in the future.
 *
 * atomic_long_fetch_add() is used to obtain the reader lock, whereas
 * atomic_long_cmpxchg() is used to obtain the writer lock.
 */
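/*
 * Illustrative sketch only: the helpers below decode a count word laid out
 * as described above (bit 0 writer locked, bit 1 waiters, bit 2 handoff,
 * reader count starting at bit 8, read-fail guard in the top bit). The
 * EXAMPLE_* and example_* names are hypothetical stand-ins; the real
 * RWSEM_* definitions follow further below with their values elided.
 */
#define EXAMPLE_WRITER_LOCKED	(1UL << 0)
#define EXAMPLE_FLAG_WAITERS	(1UL << 1)
#define EXAMPLE_FLAG_HANDOFF	(1UL << 2)
#define EXAMPLE_READER_SHIFT	8
#define EXAMPLE_READ_FAIL	(1UL << (BITS_PER_LONG - 1))

/* True if a writer currently holds the lock. */
static inline bool example_write_locked(long count)
{
	return count & EXAMPLE_WRITER_LOCKED;
}

/* Number of readers recorded in the count (guard bit masked off). */
static inline long example_reader_count(long count)
{
	return (count & ~EXAMPLE_READ_FAIL) >> EXAMPLE_READER_SHIFT;
}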
/*
 * There are three places where the lock handoff bit may be set or cleared.
 * 1) rwsem_mark_wake() for readers		-- set, clear
 * 2) rwsem_try_write_lock() for writers	-- set, clear
 * 3) rwsem_del_waiter()			-- clear
 *
 * For all the above cases, wait_lock will be held. A writer must also
 * be the first one in the wait_list to be eligible for setting the handoff
 * bit. So concurrent setting/clearing of the handoff bit is not possible.
 */
#define RWSEM_WRITER_LOCKED	…
#define RWSEM_FLAG_WAITERS	…
#define RWSEM_FLAG_HANDOFF	…
#define RWSEM_FLAG_READFAIL	…

#define RWSEM_READER_SHIFT	…
#define RWSEM_READER_BIAS	…
#define RWSEM_READER_MASK	…
#define RWSEM_WRITER_MASK	…
#define RWSEM_LOCK_MASK		…
#define RWSEM_READ_FAILED_MASK	…

/*
 * All writes to owner are protected by WRITE_ONCE() to make sure that
 * store tearing can't happen as optimistic spinners may read and use
 * the owner value concurrently without taking the lock. Reads from owner,
 * however, may not need READ_ONCE() as long as the pointer value is only
 * used for comparison and isn't being dereferenced.
 *
 * Both rwsem_{set,clear}_owner() functions should be in the same
 * preempt disable section as the atomic op that changes sem->count.
 */
static inline void rwsem_set_owner(struct rw_semaphore *sem) { … }

static inline void rwsem_clear_owner(struct rw_semaphore *sem) { … }

/*
 * Test the flags in the owner field.
 */
static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) { … }

/*
 * The task_struct pointer of the last owning reader will be left in
 * the owner field.
 *
 * Note that the owner value just indicates that the task has owned the rwsem
 * previously; it may not be the real owner or one of the real owners
 * anymore when that field is examined, so take it with a grain of salt.
 *
 * The reader non-spinnable bit is preserved.
 */
static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
					    struct task_struct *owner) { … }

static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) { … }

/*
 * Return true if the rwsem is owned by a reader.
 */
static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) { … }

#ifdef CONFIG_DEBUG_RWSEMS
/*
 * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
 * is a task pointer in the owner field of a reader-owned rwsem, it is the
 * real owner or one of the real owners. The only exception is when the
 * unlock is done by up_read_non_owner().
 */
static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { … }
#else
static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
{
}
#endif

/*
 * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag
 * remains set. Otherwise, the operation will be aborted.
 */
static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) { … }

static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) { … }

static inline bool rwsem_write_trylock(struct rw_semaphore *sem) { … }

/*
 * Return just the real task structure pointer of the owner
 */
static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) { … }

/*
 * Return the real task structure pointer of the owner and the embedded
 * flags in the owner. pflags must be non-NULL.
 */
static inline struct task_struct *
rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags) { … }
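/*
 * Illustrative sketch only: given the two low owner bits documented at the
 * top of this file (bit 0 = reader-owned hint, bit 1 = nonspinnable),
 * splitting the owner word into a task pointer plus flags, and composing a
 * reader-owned word that preserves the nonspinnable hint, look roughly like
 * this. The example_* and EXAMPLE_* names are hypothetical; the real
 * (elided) helpers are rwsem_owner()/rwsem_owner_flags() and
 * __rwsem_set_reader_owned() above.
 */
#define EXAMPLE_OWNER_FLAGS	(3UL)	/* bits 0-1 of the owner word */

static inline struct task_struct *
example_split_owner(unsigned long owner, unsigned long *pflags)
{
	*pflags = owner & EXAMPLE_OWNER_FLAGS;
	return (struct task_struct *)(owner & ~EXAMPLE_OWNER_FLAGS);
}

static inline unsigned long
example_reader_owner_word(struct task_struct *tsk, unsigned long old_owner)
{
	/* Tag as reader-owned (bit 0) and keep the nonspinnable hint (bit 1). */
	return (unsigned long)tsk | (1UL << 0) | (old_owner & (1UL << 1));
}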
/*
 * Guide to the rw_semaphore's count field.
 *
 * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
 * by a writer.
 *
 * The lock is owned by readers when
 * (1) the RWSEM_WRITER_LOCKED bit isn't set in count,
 * (2) some of the reader bits are set in count, and
 * (3) the owner field has the RWSEM_READER_OWNED bit set.
 *
 * Having some reader bits set is not enough to guarantee a reader-owned
 * lock, as the readers may be in the process of backing out from the count
 * and a writer has just released the lock. So another writer may steal
 * the lock immediately after that.
 */

/*
 * Initialize an rwsem:
 */
void __init_rwsem(struct rw_semaphore *sem, const char *name,
		  struct lock_class_key *key) { … }
EXPORT_SYMBOL(…);

enum rwsem_waiter_type { … };

struct rwsem_waiter { … };

#define rwsem_first_waiter(sem)	…

enum rwsem_wake_type { … };

/*
 * The typical HZ value is either 250 or 1000. So set the minimum waiting
 * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
 * queue before initiating the handoff protocol.
 */
#define RWSEM_WAIT_TIMEOUT	…

/*
 * Magic number to batch-wakeup waiting readers, even when writers are
 * also present in the queue. This both limits the amount of work the
 * waking thread must do and also prevents any potential counter overflow,
 * however unlikely.
 */
#define MAX_READERS_WAKEUP	…

static inline void rwsem_add_waiter(struct rw_semaphore *sem,
				    struct rwsem_waiter *waiter) { … }

/*
 * Remove a waiter from the wait_list and clear flags.
 *
 * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
 * this function. Modify with care.
 *
 * Return: true if wait_list isn't empty and false otherwise
 */
static inline bool rwsem_del_waiter(struct rw_semaphore *sem,
				    struct rwsem_waiter *waiter) { … }

/*
 * Handle lock release when processes blocked on it can now run.
 * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
 *   have been set.
 * - there must be someone on the queue
 * - the wait_lock must be held by the caller
 * - tasks are marked for wakeup; the caller must later invoke wake_up_q()
 *   to actually wake up the blocked task(s) and drop the reference count,
 *   preferably when the wait_lock is released
 * - woken process blocks are discarded from the list after having task zeroed
 * - writers are only marked woken if downgrading is false
 *
 * Implies rwsem_del_waiter() for all woken readers.
 */
static void rwsem_mark_wake(struct rw_semaphore *sem,
			    enum rwsem_wake_type wake_type,
			    struct wake_q_head *wake_q) { … }

/*
 * Remove a waiter and try to wake up other waiters in the wait queue.
 * This function is called from the out_nolock path of both the reader and
 * writer slowpaths with wait_lock held. It releases the wait_lock and
 * optionally wakes up waiters before it returns.
 */
static inline void
rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
		      struct wake_q_head *wake_q)
		      __releases(&sem->wait_lock) { … }

/*
 * This function must be called with the sem->wait_lock held to prevent
 * race conditions between checking the rwsem wait list and setting the
 * sem->count accordingly.
 *
 * Implies rwsem_del_waiter() on success.
 */
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
					struct rwsem_waiter *waiter) { … }

/*
 * The rwsem_spin_on_owner() function returns the following four values
 * depending on the lock owner state.
 *   OWNER_NULL  : owner is currently NULL
 *   OWNER_WRITER: when owner changes and is a writer
 *   OWNER_READER: when owner changes and the new owner may be a reader.
 *   OWNER_NONSPINNABLE:
 *		   when optimistic spinning has to stop because either the
 *		   owner stops running, is unknown, or its timeslice has
 *		   been used up.
 */
enum owner_state { … };
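/*
 * Illustrative sketch only: one classification consistent with the owner
 * state descriptions above, driven by the low owner flag bits.
 * example_owner_state() is a hypothetical name; the kernel's own (elided)
 * rwsem_owner_state() below serves this purpose.
 */
static inline enum owner_state
example_owner_state(struct task_struct *owner, unsigned long flags)
{
	if (flags & RWSEM_NONSPINNABLE)	/* spinning has been disabled */
		return OWNER_NONSPINNABLE;
	if (flags & RWSEM_READER_OWNED)	/* owner word is only a reader hint */
		return OWNER_READER;
	return owner ? OWNER_WRITER : OWNER_NULL;
}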
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
 * Try to acquire the write lock before the writer has been put on the
 * wait queue.
 */
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { … }

static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { … }

#define OWNER_SPINNABLE		…

static inline enum owner_state
rwsem_owner_state(struct task_struct *owner, unsigned long flags) { … }

static noinline enum owner_state
rwsem_spin_on_owner(struct rw_semaphore *sem) { … }

/*
 * Calculate the reader-owned rwsem spinning threshold for a writer.
 *
 * The more readers own the rwsem, the longer it will take for them to
 * wind down and free the rwsem. So the empirical formula used to
 * determine the actual spinning time limit here is:
 *
 *   Spinning threshold = (10 + nr_readers/2)us
 *
 * The limit is capped to a maximum of 25us (30 readers). This is just
 * a heuristic and is subject to change in the future. (An illustrative
 * version of this formula is sketched after the #endif below.)
 */
static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) { … }

static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { … }

/*
 * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
 * only be called when the reader count reaches 0.
 */
static inline void clear_nonspinnable(struct rw_semaphore *sem) { … }

#else
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
	return false;
}

static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
	return false;
}

static inline void clear_nonspinnable(struct rw_semaphore *sem) { }

static inline enum owner_state
rwsem_spin_on_owner(struct rw_semaphore *sem)
{
	return OWNER_NONSPINNABLE;
}
#endif
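/*
 * Illustrative sketch only: the spinning-threshold formula documented above
 * ((10 + nr_readers/2)us, capped at 25us for 30 or more readers) expressed
 * in nanoseconds. example_rspin_threshold_ns() is a hypothetical name and
 * takes the reader count directly rather than a semaphore pointer.
 */
static inline u64 example_rspin_threshold_ns(long nr_readers)
{
	long readers = nr_readers;

	if (readers > 30)		/* cap: 30 readers == 25us */
		readers = 30;

	/* (20 + readers) * 0.5us == (10 + readers/2)us, in nanoseconds */
	return (20 + readers) * (NSEC_PER_USEC / 2);
}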
/*
 * Prepare to wake up waiter(s) in the wait queue by putting them into the
 * given wake_q if the rwsem lock owner isn't a writer. If the rwsem is
 * likely reader-owned, wake up the read lock waiters at the front of the
 * queue, otherwise wake up any front waiter.
 *
 * This is called from both the reader and writer slow paths.
 */
static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
					  struct wake_q_head *wake_q) { … }

/*
 * Wait for the read lock to be granted
 */
static struct rw_semaphore __sched *
rwsem_down_read_slowpath(struct rw_semaphore *sem, long count,
			 unsigned int state) { … }

/*
 * Wait until we successfully acquire the write lock
 */
static struct rw_semaphore __sched *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { … }

/*
 * Handle waking up a waiter on the semaphore.
 * - up_read/up_write has decremented the active part of count if we come here
 */
static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { … }

/*
 * Downgrade a write lock into a read lock.
 * - the caller incremented the waiting part of count and discovered it still
 *   negative
 * - just wake up any readers at the front of the queue
 */
static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { … }

/*
 * lock for reading
 */
static __always_inline int __down_read_common(struct rw_semaphore *sem,
					      int state) { … }

static __always_inline void __down_read(struct rw_semaphore *sem) { … }

static __always_inline int __down_read_interruptible(struct rw_semaphore *sem) { … }

static __always_inline int __down_read_killable(struct rw_semaphore *sem) { … }

static inline int __down_read_trylock(struct rw_semaphore *sem) { … }

/*
 * lock for writing
 */
static __always_inline int __down_write_common(struct rw_semaphore *sem,
					       int state) { … }

static __always_inline void __down_write(struct rw_semaphore *sem) { … }

static __always_inline int __down_write_killable(struct rw_semaphore *sem) { … }

static inline int __down_write_trylock(struct rw_semaphore *sem) { … }

/*
 * unlock after reading
 */
static inline void __up_read(struct rw_semaphore *sem) { … }

/*
 * unlock after writing
 */
static inline void __up_write(struct rw_semaphore *sem) { … }

/*
 * downgrade write lock to read lock
 */
static inline void __downgrade_write(struct rw_semaphore *sem) { … }

#else /* !CONFIG_PREEMPT_RT */

#define RT_MUTEX_BUILD_MUTEX
#include "rtmutex.c"

#define rwbase_set_and_save_current_state	…
#define rwbase_restore_current_state		…
#define rwbase_rtmutex_lock_state		…
#define rwbase_rtmutex_slowlock_locked		…
#define rwbase_rtmutex_unlock			…
#define rwbase_rtmutex_trylock			…
#define rwbase_signal_pending_state		…
#define rwbase_pre_schedule			…
#define rwbase_schedule				…
#define rwbase_post_schedule			…

#include "rwbase_rt.c"

void __init_rwsem(struct rw_semaphore *sem, const char *name,
		  struct lock_class_key *key)
{
	init_rwbase_rt(&(sem)->rwbase);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
	lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
#endif
}
EXPORT_SYMBOL(__init_rwsem);

static inline void __down_read(struct rw_semaphore *sem)
{
	rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
}

static inline int __down_read_interruptible(struct rw_semaphore *sem)
{
	return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
}

static inline int __down_read_killable(struct rw_semaphore *sem)
{
	return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
}

static inline int __down_read_trylock(struct rw_semaphore *sem)
{
	return rwbase_read_trylock(&sem->rwbase);
}

static inline void __up_read(struct rw_semaphore *sem)
{
	rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
}

static inline void __sched __down_write(struct rw_semaphore *sem)
{
	rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
}
static inline int __sched __down_write_killable(struct rw_semaphore *sem)
{
	return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
}

static inline int __down_write_trylock(struct rw_semaphore *sem)
{
	return rwbase_write_trylock(&sem->rwbase);
}

static inline void __up_write(struct rw_semaphore *sem)
{
	rwbase_write_unlock(&sem->rwbase);
}

static inline void __downgrade_write(struct rw_semaphore *sem)
{
	rwbase_write_downgrade(&sem->rwbase);
}

/* Debug stubs for the common API */
#define DEBUG_RWSEMS_WARN_ON	…

static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
					    struct task_struct *owner)
{
}

static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
{
	int count = atomic_read(&sem->rwbase.readers);

	return count < 0 && count != READER_BIAS;
}

#endif /* CONFIG_PREEMPT_RT */

/*
 * lock for reading
 */
void __sched down_read(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

int __sched down_read_interruptible(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

int __sched down_read_killable(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

/*
 * trylock for reading -- returns 1 if successful, 0 if contention
 */
int down_read_trylock(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

/*
 * lock for writing
 */
void __sched down_write(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

/*
 * lock for writing
 */
int __sched down_write_killable(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

/*
 * trylock for writing -- returns 1 if successful, 0 if contention
 */
int down_write_trylock(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

/*
 * release a read lock
 */
void up_read(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

/*
 * release a write lock
 */
void up_write(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

/*
 * downgrade write lock to read lock
 */
void downgrade_write(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

#ifdef CONFIG_DEBUG_LOCK_ALLOC

void down_read_nested(struct rw_semaphore *sem, int subclass) { … }
EXPORT_SYMBOL(…);

int down_read_killable_nested(struct rw_semaphore *sem, int subclass) { … }
EXPORT_SYMBOL(…);

void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { … }
EXPORT_SYMBOL(…);

void down_read_non_owner(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

void down_write_nested(struct rw_semaphore *sem, int subclass) { … }
EXPORT_SYMBOL(…);

int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) { … }
EXPORT_SYMBOL(…);

void up_read_non_owner(struct rw_semaphore *sem) { … }
EXPORT_SYMBOL(…);

#endif
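/*
 * Illustrative usage sketch (not part of the rwsem implementation above):
 * a typical pattern for protecting shared data with the public rwsem API
 * declared in this file. The example_* names and the example data are
 * hypothetical.
 */
static DECLARE_RWSEM(example_rwsem);
static int example_shared_value;

static inline int example_read_value(void)
{
	int val;

	down_read(&example_rwsem);	/* shared: many readers may enter */
	val = example_shared_value;
	up_read(&example_rwsem);

	return val;
}

static inline void example_write_value(int val)
{
	down_write(&example_rwsem);	/* exclusive: excludes readers and writers */
	example_shared_value = val;

	/* Let readers back in while still holding the lock for reading. */
	downgrade_write(&example_rwsem);
	/* ... read-side work ... */
	up_read(&example_rwsem);
}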