sem.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0
/*
 * linux/ipc/sem.c
 * Copyright (C) 1992 Krishna Balasubramanian
 * Copyright (C) 1995 Eric Schenk, Bruno Haible
 *
 * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <[email protected]>
 *
 * SMP-threaded, sysctl's added
 * (c) 1999 Manfred Spraul <[email protected]>
 * Enforced range limit on SEM_UNDO
 * (c) 2001 Red Hat Inc
 * Lockless wakeup
 * (c) 2003 Manfred Spraul <[email protected]>
 * (c) 2016 Davidlohr Bueso <[email protected]>
 * Further wakeup optimizations, documentation
 * (c) 2010 Manfred Spraul <[email protected]>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <[email protected]>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <[email protected]>
 *
 * Implementation notes: (May 2010)
 * This file implements System V semaphores.
 *
 * User space visible behavior:
 * - FIFO ordering for semop() operations (just FIFO, not starvation
 *   protection)
 * - multiple semaphore operations that alter the same semaphore in
 *   one semop() are handled.
 * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
 *   SETALL calls.
 * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
 * - undo adjustments at process exit are limited to 0..SEMVMX.
 * - namespace are supported.
 * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
 *   to /proc/sys/kernel/sem.
 * - statistics about the usage are reported in /proc/sysvipc/sem.
 *
 * Internals:
 * - scalability:
 *   - all global variables are read-mostly.
 *   - semop() calls and semctl(RMID) are synchronized by RCU.
 *   - most operations do write operations (actually: spin_lock calls) to
 *     the per-semaphore array structure.
 *   Thus: Perfect SMP scaling between independent semaphore arrays.
 *         If multiple semaphores in one array are used, then cache line
 *         trashing on the semaphore array spinlock will limit the scaling.
 * - semncnt and semzcnt are calculated on demand in count_semcnt()
 * - the task that performs a successful semop() scans the list of all
 *   sleeping tasks and completes any pending operations that can be fulfilled.
 *   Semaphores are actively given to waiting tasks (necessary for FIFO).
 *   (see update_queue())
 * - To improve the scalability, the actual wake-up calls are performed after
 *   dropping all locks. (see wake_up_sem_queue_prepare())
 * - All work is done by the waker, the woken up task does not have to do
 *   anything - not even acquiring a lock or dropping a refcount.
 * - A woken up task may not even touch the semaphore array anymore, it may
 *   have been destroyed already by a semctl(RMID).
 * - UNDO values are stored in an array (one per process and per
 *   semaphore array, lazily allocated). For backwards compatibility, multiple
 *   modes for the UNDO variables are supported (per process, per thread)
 *   (see copy_semundo, CLONE_SYSVSEM)
 * - There are two lists of the pending operations: a per-array list
 *   and per-semaphore list (stored in the array). This allows to achieve FIFO
 *   ordering without always scanning all pending operations.
 *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
 */

#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/time.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/sched/wake_q.h>
#include <linux/nospec.h>
#include <linux/rhashtable.h>

#include <linux/uaccess.h>
#include "util.h"

/* One semaphore structure for each semaphore in the system. */
struct sem { … } ____cacheline_aligned_in_smp;

/* One sem_array data structure for each set of semaphores in the system. */
struct sem_array { … } __randomize_layout;

/* One queue for each sleeping process in the system. */
struct sem_queue { … };

/* Each task has a list of undo requests. They are executed automatically
 * when the process exits.
 */
struct sem_undo { … };

/* sem_undo_list controls shared access to the list of sem_undo structures
 * that may be shared among all a CLONE_SYSVSEM task group.
 */
struct sem_undo_list { … };


#define sem_ids(ns) …

static int newary(struct ipc_namespace *, struct ipc_params *);
static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
#ifdef CONFIG_PROC_FS
static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#endif

#define SEMMSL_FAST …
#define SEMOPM_FAST …

/*
 * Switching from the mode suitable for simple ops
 * to the mode for complex ops is costly. Therefore:
 * use some hysteresis
 */
#define USE_GLOBAL_LOCK_HYSTERESIS …

/*
 * Locking:
 * a) global sem_lock() for read/write
 *	sem_undo.id_next,
 *	sem_array.complex_count,
 *	sem_array.pending{_alter,_const},
 *	sem_array.sem_undo
 *
 * b) global or semaphore sem_lock() for read/write:
 *	sem_array.sems[i].pending_{const,alter}:
 *
 * c) special:
 *	sem_undo_list.list_proc:
 *	* undo_list->lock for write
 *	* rcu for read
 *	use_global_lock:
 *	* global sem_lock() for write
 *	* either local or global sem_lock() for read.
 *
 * Memory ordering:
 * Most ordering is enforced by using spin_lock() and spin_unlock().
 *
 * Exceptions:
 * 1) use_global_lock: (SEM_BARRIER_1)
 * Setting it from non-zero to 0 is a RELEASE, this is ensured by
 * using smp_store_release(): Immediately after setting it to 0,
 * a simple op can start.
 * Testing if it is non-zero is an ACQUIRE, this is ensured by using
 * smp_load_acquire().
 * Setting it from 0 to non-zero must be ordered with regards to
 * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
 * is inside a spin_lock() and after a write from 0 to non-zero a
 * spin_lock()+spin_unlock() is done.
 * To prevent the compiler/cpu temporarily writing 0 to use_global_lock,
 * READ_ONCE()/WRITE_ONCE() is used.
 *
 * 2) queue.status: (SEM_BARRIER_2)
 * Initialization is done while holding sem_lock(), so no further barrier is
 * required.
 * Setting it to a result code is a RELEASE, this is ensured by both a
 * smp_store_release() (for case a) and while holding sem_lock()
 * (for case b).
 * The ACQUIRE when reading the result code without holding sem_lock() is
 * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep().
 * (case a above).
 * Reading the result code while holding sem_lock() needs no further barriers,
 * the locks inside sem_lock() enforce ordering (case b above)
 *
 * 3) current->state:
 * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock().
 * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may
 * happen immediately after calling wake_q_add. As wake_q_add_safe() is called
 * when holding sem_lock(), no further barriers are required.
 *
 * See also ipc/mqueue.c for more details on the covered races.
 */

#define sc_semmsl …
#define sc_semmns …
#define sc_semopm …
#define sc_semmni …

void sem_init_ns(struct ipc_namespace *ns)
{ … }

#ifdef CONFIG_IPC_NS
void sem_exit_ns(struct ipc_namespace *ns)
{ … }
#endif

void __init sem_init(void)
{ … }

/**
 * unmerge_queues - unmerge queues, if possible.
 * @sma: semaphore array
 *
 * The function unmerges the wait queues if complex_count is 0.
 * It must be called prior to dropping the global semaphore array lock.
 */
static void unmerge_queues(struct sem_array *sma)
{ … }

/**
 * merge_queues - merge single semop queues into global queue
 * @sma: semaphore array
 *
 * This function merges all per-semaphore queues into the global queue.
 * It is necessary to achieve FIFO ordering for the pending single-sop
 * operations when a multi-semop operation must sleep.
 * Only the alter operations must be moved, the const operations can stay.
 */
static void merge_queues(struct sem_array *sma)
{ … }

static void sem_rcu_free(struct rcu_head *head)
{ … }

/*
 * Enter the mode suitable for non-simple operations:
 * Caller must own sem_perm.lock.
 */
static void complexmode_enter(struct sem_array *sma)
{ … }

/*
 * Try to leave the mode that disallows simple operations:
 * Caller must own sem_perm.lock.
 */
static void complexmode_tryleave(struct sem_array *sma)
{ … }

#define SEM_GLOBAL_LOCK …
/*
 * If the request contains only one semaphore operation, and there are
 * no complex transactions pending, lock only the semaphore involved.
 * Otherwise, lock the entire semaphore array, since we either have
 * multiple semaphores in our own semops, or we need to look at
 * semaphores from other pending complex operations.
 */
static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
			      int nsops)
{ … }

static inline void sem_unlock(struct sem_array *sma, int locknum)
{ … }

/*
 * sem_lock_(check_) routines are called in the paths where the rwsem
 * is not held.
 *
 * The caller holds the RCU read lock.
 */
static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
{ … }

static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns,
							int id)
{ … }

static inline void sem_lock_and_putref(struct sem_array *sma)
{ … }

static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
{ … }

static struct sem_array *sem_alloc(size_t nsems)
{ … }

/**
 * newary - Create a new semaphore set
 * @ns: namespace
 * @params: ptr to the structure that contains key, semflg and nsems
 *
 * Called with sem_ids.rwsem held (as a writer)
 */
static int newary(struct ipc_namespace *ns, struct ipc_params *params)
{ … }


/*
 * Called with sem_ids.rwsem and ipcp locked.
 */
static int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params)
{ … }

long ksys_semget(key_t key, int nsems, int semflg)
{ … }

SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
{ … }

/**
 * perform_atomic_semop[_slow] - Attempt to perform semaphore
 *                               operations on a given array.
 * @sma: semaphore array
 * @q: struct sem_queue that describes the operation
 *
 * Caller blocking are as follows, based the value
 * indicated by the semaphore operation (sem_op):
 *
 *  (1) >0 never blocks.
 *  (2)  0 (wait-for-zero operation): semval is non-zero.
 *  (3) <0 attempting to decrement semval to a value smaller than zero.
 *
 * Returns 0 if the operation was possible.
 * Returns 1 if the operation is impossible, the caller must sleep.
 * Returns <0 for error codes.
 */
static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q)
{ … }

static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
{ … }

static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
					     struct wake_q_head *wake_q)
{ … }

static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
{ … }

/** check_restart(sma, q)
 * @sma: semaphore array
 * @q: the operation that just completed
 *
 * update_queue is O(N^2) when it restarts scanning the whole queue of
 * waiting operations. Therefore this function checks if the restart is
 * really necessary. It is called after a previously waiting operation
 * modified the array.
 * Note that wait-for-zero operations are handled without restart.
 */
static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
{ … }

/**
 * wake_const_ops - wake up non-alter tasks
 * @sma: semaphore array.
 * @semnum: semaphore that was modified.
 * @wake_q: lockless wake-queue head.
 *
 * wake_const_ops must be called after a semaphore in a semaphore array
 * was set to 0. If complex const operations are pending, wake_const_ops must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * The tasks that must be woken up are added to @wake_q. The return code
 * is stored in q->pid.
 * The function returns 1 if at least one operation was completed successfully.
 */
static int wake_const_ops(struct sem_array *sma, int semnum,
			  struct wake_q_head *wake_q)
{ … }

/**
 * do_smart_wakeup_zero - wakeup all wait for zero tasks
 * @sma: semaphore array
 * @sops: operations that were performed
 * @nsops: number of operations
 * @wake_q: lockless wake-queue head
 *
 * Checks all required queue for wait-for-zero operations, based
 * on the actual changes that were performed on the semaphore array.
 * The function returns 1 if at least one operation was completed successfully.
 */
static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
				int nsops, struct wake_q_head *wake_q)
{ … }


/**
 * update_queue - look for tasks that can be completed.
 * @sma: semaphore array.
 * @semnum: semaphore that was modified.
 * @wake_q: lockless wake-queue head.
 *
 * update_queue must be called after a semaphore in a semaphore array
 * was modified. If multiple semaphores were modified, update_queue must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * The tasks that must be woken up are added to @wake_q. The return code
 * is stored in q->pid.
 * The function internally checks if const operations can now succeed.
 *
 * The function return 1 if at least one semop was completed successfully.
 */
static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q)
{ … }

/**
 * set_semotime - set sem_otime
 * @sma: semaphore array
 * @sops: operations that modified the array, may be NULL
 *
 * sem_otime is replicated to avoid cache line trashing.
 * This function sets one instance to the current time.
 */
static void set_semotime(struct sem_array *sma, struct sembuf *sops)
{ … }

/**
 * do_smart_update - optimized update_queue
 * @sma: semaphore array
 * @sops: operations that were performed
 * @nsops: number of operations
 * @otime: force setting otime
 * @wake_q: lockless wake-queue head
 *
 * do_smart_update() does the required calls to update_queue and wakeup_zero,
 * based on the actual changes that were performed on the semaphore array.
 * Note that the function does not do the actual wake-up: the caller is
 * responsible for calling wake_up_q().
 * It is safe to perform this call after dropping all locks.
 */
static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
			    int otime, struct wake_q_head *wake_q)
{ … }

/*
 * check_qop: Test if a queued operation sleeps on the semaphore semnum
 */
static int check_qop(struct sem_array *sma, int semnum, struct sem_queue *q,
			bool count_zero)
{ … }

/* The following counts are associated to each semaphore:
 *   semncnt        number of tasks waiting on semval being nonzero
 *   semzcnt        number of tasks waiting on semval being zero
 *
 * Per definition, a task waits only on the semaphore of the first semop
 * that cannot proceed, even if additional operation would block, too.
 */
static int count_semcnt(struct sem_array *sma, ushort semnum,
			bool count_zero)
{ … }

/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
 * as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem
 * remains locked on exit.
 */
static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
{ … }

static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
{ … }

static time64_t get_semotime(struct sem_array *sma)
{ … }

static int semctl_stat(struct ipc_namespace *ns, int semid,
			 int cmd, struct semid64_ds *semid64)
{ … }

static int semctl_info(struct ipc_namespace *ns, int semid,
			 int cmd, void __user *p)
{ … }

static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
		int val)
{ … }

static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
		int cmd, void __user *p)
{ … }

static inline unsigned long
copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
{ … }

/*
 * This function handles some semctl commands which require the rwsem
 * to be held in write mode.
 * NOTE: no locks must be held, the rwsem is taken inside this function.
 */
static int semctl_down(struct ipc_namespace *ns, int semid,
		       int cmd, struct semid64_ds *semid64)
{ … }

static long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg, int version)
{ … }

SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
{ … }

#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
long ksys_old_semctl(int semid, int semnum, int cmd, unsigned long arg)
{
	int version = ipc_parse_version(&cmd);

	return ksys_semctl(semid, semnum, cmd, arg, version);
}

SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
{
	return ksys_old_semctl(semid, semnum, cmd, arg);
}
#endif

#ifdef CONFIG_COMPAT

struct compat_semid_ds { … };

static int copy_compat_semid_from_user(struct semid64_ds *out, void __user *buf,
					int version)
{ … }

static int copy_compat_semid_to_user(void __user *buf, struct semid64_ds *in,
					int version)
{ … }

static long compat_ksys_semctl(int semid, int semnum, int cmd, int arg, int version)
{ … }

COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg)
{ … }

#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION
long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg)
{ … }

COMPAT_SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, int, arg)
{ … }
#endif
#endif

/* If the task doesn't already have a undo_list, then allocate one
 * here.  We guarantee there is only one thread using this undo list,
 * and current is THE ONE
 *
 * If this allocation and assignment succeeds, but later
 * portions of this code fail, there is no need to free the sem_undo_list.
 * Just let it stay associated with the task, and it'll be freed later
 * at exit time.
 *
 * This can block, so callers must hold no locks.
 */
static inline int get_undo_list(struct sem_undo_list **undo_listp)
{ … }

static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
{ … }

static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
{ … }

/**
 * find_alloc_undo - lookup (and if not present create) undo array
 * @ns: namespace
 * @semid: semaphore array id
 *
 * The function looks up (and if not present creates) the undo structure.
 * The size of the undo structure depends on the size of the semaphore
 * array, thus the alloc path is not that straightforward.
 * Lifetime-rules: sem_undo is rcu-protected, on success, the function
 * performs a rcu_read_lock().
 */
static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
{ … }

long __do_semtimedop(int semid, struct sembuf *sops,
		unsigned nsops, const struct timespec64 *timeout,
		struct ipc_namespace *ns)
{ … }

static long do_semtimedop(int semid, struct sembuf __user *tsops,
		unsigned nsops, const struct timespec64 *timeout)
{ … }

long ksys_semtimedop(int semid, struct sembuf __user *tsops,
		     unsigned int nsops, const struct __kernel_timespec __user *timeout)
{ … }

SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
		unsigned int, nsops, const struct __kernel_timespec __user *, timeout)
{ … }

#ifdef CONFIG_COMPAT_32BIT_TIME
long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems,
			    unsigned int nsops,
			    const struct old_timespec32 __user *timeout)
{ … }

SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems,
		       unsigned int, nsops,
		       const struct old_timespec32 __user *, timeout)
{ … }
#endif

SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
		unsigned, nsops)
{ … }

/* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
 * parent and child tasks.
 */

int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
{ … }

/*
 * add semadj values to semaphores, free undo structures.
 * undo structures are not freed when semaphore arrays are destroyed
 * so some of them may be out of date.
 * IMPLEMENTATION NOTE: There is some confusion over whether the
 * set of adjustments that needs to be done should be done in an atomic
 * manner or not. That is, if we are attempting to decrement the semval
 * should we queue up and wait until we can do so legally?
 * The original implementation attempted to do this (queue and wait).
 * The current implementation does not do so. The POSIX standard
 * and SVID should be consulted to determine what behavior is mandated.
 */
void exit_sem(struct task_struct *tsk)
{ … }

#ifdef CONFIG_PROC_FS
static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
{ … }
#endif