// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/seccomp.c * * Copyright 2004-2005 Andrea Arcangeli <[email protected]> * * Copyright (C) 2012 Google, Inc. * Will Drewry <[email protected]> * * This defines a simple but solid secure-computing facility. * * Mode 1 uses a fixed list of allowed system calls. * Mode 2 allows user-defined system call filters in the form * of Berkeley Packet Filters/Linux Socket Filters. */ #define pr_fmt(fmt) … #include <linux/refcount.h> #include <linux/audit.h> #include <linux/compat.h> #include <linux/coredump.h> #include <linux/kmemleak.h> #include <linux/nospec.h> #include <linux/prctl.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/seccomp.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/sysctl.h> /* Not exposed in headers: strictly internal use only. */ #define SECCOMP_MODE_DEAD … #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include <asm/syscall.h> #endif #ifdef CONFIG_SECCOMP_FILTER #include <linux/file.h> #include <linux/filter.h> #include <linux/pid.h> #include <linux/ptrace.h> #include <linux/capability.h> #include <linux/uaccess.h> #include <linux/anon_inodes.h> #include <linux/lockdep.h> /* * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the * wrong direction flag in the ioctl number. This is the broken one, * which the kernel needs to keep supporting until all userspaces stop * using the wrong command number. */ #define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR … enum notify_state { … }; struct seccomp_knotif { … }; /** * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages * * @file: A reference to the file to install in the other task * @fd: The fd number to install it at. If the fd number is -1, it means the * installing process should allocate the fd as normal. * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC * is allowed. * @ioctl_flags: The flags used for the seccomp_addfd ioctl. 
* @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd * @ret: The return value of the installing process. It is set to the fd num * upon success (>= 0). * @completion: Indicates that the installing process has completed fd * installation, or gone away (either due to successful * reply, or signal) * @list: list_head for chaining seccomp_kaddfd together. * */ struct seccomp_kaddfd { … }; /** * struct notification - container for seccomp userspace notifications. Since * most seccomp filters will not have notification listeners attached and this * structure is fairly large, we store the notification-specific stuff in a * separate structure. * * @requests: A semaphore that users of this notification can wait on for * changes. Actual reads and writes are still controlled with * filter->notify_lock. * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. */ struct notification { … }; #ifdef SECCOMP_ARCH_NATIVE /** * struct action_cache - per-filter cache of seccomp actions per * arch/syscall pair * * @allow_native: A bitmap where each bit represents whether the * filter will always allow the syscall, for the * native architecture. * @allow_compat: A bitmap where each bit represents whether the * filter will always allow the syscall, for the * compat architecture. */ struct action_cache { … }; #else /* * No SECCOMP_ARCH_NATIVE: the action cache is compiled out. The struct is * empty, the lookup stub always reports "not cached" (false) and the * prepare stub does nothing. */ struct action_cache { }; static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, const struct seccomp_data *sd) { return false; } static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter) { } #endif /* SECCOMP_ARCH_NATIVE */ /** * struct seccomp_filter - container for seccomp BPF programs * * @refs: Reference count to manage the object lifetime. * A filter's reference count is incremented for each directly * attached task, once for the dependent filter, and if * requested for the user notifier.
When @refs reaches zero, * the filter can be freed. * @users: A filter's @users count is incremented for each directly * attached task (filter installation, fork(), thread_sync), * and once for the dependent filter (tracked in filter->prev). * When it reaches zero it indicates that no direct or indirect * users of that filter exist. No new tasks can get associated with * this filter after reaching 0. The @users count is always smaller * than or equal to @refs. Hence, reaching 0 for @users does not mean * the filter can be freed. * @cache: cache of arch/syscall mappings to actions * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @wait_killable_recv: Put notifying process in killable state once the * notification is received by the userspace listener. * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @notif: the struct that holds all notification related information * @notify_lock: A lock for all notification-related accesses. * @wqh: A wait queue for poll if a notifier is in use. * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting * with current->seccomp.filter, the most recently attached or inherited filter. * However, multiple filters may share a @prev node, by way of fork(), which * results in a unidirectional tree existing in memory. This is similar to * how namespaces work. * * seccomp_filter objects should never be modified after being attached * to a task_struct (other than the @refs and @users counters). */ struct seccomp_filter { … }; /* Limit any path through the tree to 256KB worth of instructions. */ #define MAX_INSNS_PER_PATH … /* * Endianness is explicitly ignored and left for BPF program authors to manage * as per the specific architecture.
*/ static void populate_seccomp_data(struct seccomp_data *sd) { … } /** * seccomp_check_filter - verify seccomp filter code * @filter: filter to verify * @flen: length of filter * * Takes a previously checked filter (by bpf_check_classic) and * redirects all filter code that loads struct sk_buff data * and related data through seccomp_bpf_load. It also * enforces length and alignment checking of those loads. * * Returns 0 if the rule set is legal or -EINVAL if not. */ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) { … } #ifdef SECCOMP_ARCH_NATIVE static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap, size_t bitmap_size, int syscall_nr) { … } /** * seccomp_cache_check_allow - lookup seccomp cache * @sfilter: The seccomp filter * @sd: The seccomp data to lookup the cache with * * Returns true if the seccomp_data is cached and allowed. */ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, const struct seccomp_data *sd) { … } #endif /* SECCOMP_ARCH_NATIVE */ #define ACTION_ONLY(ret) … /** * seccomp_run_filters - evaluates all seccomp filters against @sd * @sd: optional seccomp data to be passed to filters * @match: stores struct seccomp_filter that resulted in the return value, * unless filter returned SECCOMP_RET_ALLOW, in which case it will * be unchanged. * * Returns valid seccomp BPF response codes. */ static u32 seccomp_run_filters(const struct seccomp_data *sd, struct seccomp_filter **match) { … } #endif /* CONFIG_SECCOMP_FILTER */ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) { … } void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { … } static inline void seccomp_assign_mode(struct task_struct *task, unsigned long seccomp_mode, unsigned long flags) { … } #ifdef CONFIG_SECCOMP_FILTER /* Returns 1 if the parent is an ancestor of the child. 
*/ static int is_ancestor(struct seccomp_filter *parent, struct seccomp_filter *child) { … } /** * seccomp_can_sync_threads: checks if all threads can be synchronized * * Expects sighand and cred_guard_mutex locks to be held. * * Returns 0 on success, -ve on error, or the pid of a thread which was * either not in the correct seccomp mode or did not have an ancestral * seccomp filter. */ static inline pid_t seccomp_can_sync_threads(void) { … } static inline void seccomp_filter_free(struct seccomp_filter *filter) { … } static void __seccomp_filter_orphan(struct seccomp_filter *orig) { … } static void __put_seccomp_filter(struct seccomp_filter *orig) { … } static void __seccomp_filter_release(struct seccomp_filter *orig) { … } /** * seccomp_filter_release - Detach the task from its filter tree, * drop its reference count, and notify * about unused filters * * @tsk: task the filter should be released from. * * This function should only be called when the task is exiting as * it detaches it from its filter tree. PF_EXITING has to be set * for the task. */ void seccomp_filter_release(struct task_struct *tsk) { … } /** * seccomp_sync_threads: sets all threads to use current's filter * * @flags: SECCOMP_FILTER_FLAG_* flags to set during sync. * * Expects sighand and cred_guard_mutex locks to be held, and for * seccomp_can_sync_threads() to have returned success already * without dropping the locks. * */ static inline void seccomp_sync_threads(unsigned long flags) { … } /** * seccomp_prepare_filter: Prepares a seccomp filter for use. * @fprog: BPF program to install * * Returns filter on success or an ERR_PTR on failure. */ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { … } /** * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog * @user_filter: pointer to the user data containing a sock_fprog. * * Returns a struct seccomp_filter pointer, not an integer status: the * prepared filter on success and, presumably as for * seccomp_prepare_filter(), an ERR_PTR on failure (body not visible * here; confirm).
*/ static struct seccomp_filter * seccomp_prepare_user_filter(const char __user *user_filter) { … } #ifdef SECCOMP_ARCH_NATIVE /** * seccomp_is_const_allow - check if filter is constant allow with given data * @fprog: The BPF program * @sd: The seccomp data to check against, only syscall number and arch * number are considered constant. */ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, struct seccomp_data *sd) { … } static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter, void *bitmap, const void *bitmap_prev, size_t bitmap_size, int arch) { … } /** * seccomp_cache_prepare - emulate the filter to find cacheable syscalls * @sfilter: The seccomp filter * * Returns nothing: the function is void, so no status is reported to the * caller. */ static void seccomp_cache_prepare(struct seccomp_filter *sfilter) { … } #endif /* SECCOMP_ARCH_NATIVE */ /** * seccomp_attach_filter: validate and attach filter * @flags: flags to change filter behavior * @filter: seccomp filter to add to the current process * * Caller must be holding current->sighand->siglock lock.
* * Returns 0 on success, -ve on error, or * - in TSYNC mode: the pid of a thread which was either not in the correct * seccomp mode or did not have an ancestral seccomp filter * - in NEW_LISTENER mode: the fd of the new listener */ static long seccomp_attach_filter(unsigned int flags, struct seccomp_filter *filter) { … } static void __get_seccomp_filter(struct seccomp_filter *filter) { … } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { … } #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ #define SECCOMP_LOG_KILL_PROCESS … #define SECCOMP_LOG_KILL_THREAD … #define SECCOMP_LOG_TRAP … #define SECCOMP_LOG_ERRNO … #define SECCOMP_LOG_TRACE … #define SECCOMP_LOG_LOG … #define SECCOMP_LOG_ALLOW … #define SECCOMP_LOG_USER_NOTIF … static u32 seccomp_actions_logged = …; static inline void seccomp_log(unsigned long syscall, long signr, u32 action, bool requested) { … } /* * Secure computing mode 1 allows only read/write/exit/sigreturn. * To be fully secure this must be combined with rlimit * to limit the stack allocations too. 
*/ static const int mode1_syscalls[] = …; static void __secure_computing_strict(int this_syscall) { … } #ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER /** * secure_computing_strict - apply current's seccomp mode to a syscall * @this_syscall: value passed through to __secure_computing_strict() * * Only built when CONFIG_HAVE_ARCH_SECCOMP_FILTER is not defined. Returns * without checking anything when PT_SUSPEND_SECCOMP is set in * current->ptrace (honoured only if CONFIG_CHECKPOINT_RESTORE is enabled) * or when the mode is SECCOMP_MODE_DISABLED. In SECCOMP_MODE_STRICT the * check is delegated to __secure_computing_strict(); any other mode hits * BUG(). */ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; /* Skip all checks while seccomp is suspended through ptrace. */ if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return; if (mode == SECCOMP_MODE_DISABLED) return; else if (mode == SECCOMP_MODE_STRICT) __secure_computing_strict(this_syscall); else /* No other mode is handled in this configuration. */ BUG(); } #else #ifdef CONFIG_SECCOMP_FILTER static u64 seccomp_next_notify_id(struct seccomp_filter *filter) { … } static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n) { … } static bool should_sleep_killable(struct seccomp_filter *match, struct seccomp_knotif *n) { … } static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) { … } static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { … } #else /* Stub used when CONFIG_SECCOMP_FILTER is not set: calls BUG() and then returns -1. */ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { BUG(); return -1; } #endif int __secure_computing(const struct seccomp_data *sd) { … } #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ long prctl_get_seccomp(void) { … } /** * seccomp_set_mode_strict: internal function for setting strict seccomp * * Once current->seccomp.mode is non-zero, it may not be changed. * * Returns 0 on success or -EINVAL on failure.
*/ static long seccomp_set_mode_strict(void) { … } #ifdef CONFIG_SECCOMP_FILTER static void seccomp_notify_free(struct seccomp_filter *filter) { … } static void seccomp_notify_detach(struct seccomp_filter *filter) { … } static int seccomp_notify_release(struct inode *inode, struct file *file) { … } /* must be called with filter->notify_lock held */ static inline struct seccomp_knotif * find_notification(struct seccomp_filter *filter, u64 id) { … } static int recv_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { … } static int recv_wait_event(struct seccomp_filter *filter) { … } static long seccomp_notify_recv(struct seccomp_filter *filter, void __user *buf) { … } static long seccomp_notify_send(struct seccomp_filter *filter, void __user *buf) { … } static long seccomp_notify_id_valid(struct seccomp_filter *filter, void __user *buf) { … } static long seccomp_notify_set_flags(struct seccomp_filter *filter, unsigned long flags) { … } static long seccomp_notify_addfd(struct seccomp_filter *filter, struct seccomp_notif_addfd __user *uaddfd, unsigned int size) { … } static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { … } static __poll_t seccomp_notify_poll(struct file *file, struct poll_table_struct *poll_tab) { … } static const struct file_operations seccomp_notify_ops = …; static struct file *init_listener(struct seccomp_filter *filter) { … } /* * Does @new_child have a listener while an ancestor also has a listener? * If so, we'll want to reject this filter. * This only has to be tested for the current process, even in the TSYNC case, * because TSYNC installs @new_child with the same parent on all threads. * Note that @new_child is not hooked up to its parent at this point yet, so * we use current->seccomp.filter.
*/ static bool has_duplicate_listener(struct seccomp_filter *new_child) { … } /** * seccomp_set_mode_filter: internal function for setting seccomp filter * @flags: flags to change filter behavior * @filter: user pointer to a struct sock_fprog containing the filter * * This function may be called repeatedly to install additional filters. * Every filter successfully installed will be evaluated (in reverse order) * for each system call the task makes. * * Once current->seccomp.mode is non-zero, it may not be changed. * * Returns 0 on success or -EINVAL on failure. */ static long seccomp_set_mode_filter(unsigned int flags, const char __user *filter) { … } #else /* Filter support is compiled out in this branch: always fail with -EINVAL. */ static inline long seccomp_set_mode_filter(unsigned int flags, const char __user *filter) { return -EINVAL; } #endif static long seccomp_get_action_avail(const char __user *uaction) { … } static long seccomp_get_notif_sizes(void __user *usizes) { … } /* Common entry point for both prctl and syscall. */ static long do_seccomp(unsigned int op, unsigned int flags, void __user *uargs) { … } SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, void __user *, uargs) { … } /** * prctl_set_seccomp: configures current->seccomp.mode * @seccomp_mode: requested mode to use * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER * * Returns 0 on success or -EINVAL on failure.
*/ long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter) { … } #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) static struct seccomp_filter *get_nth_filter(struct task_struct *task, unsigned long filter_off) { … } long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, void __user *data) { … } long seccomp_get_metadata(struct task_struct *task, unsigned long size, void __user *data) { … } #endif #ifdef CONFIG_SYSCTL /* Human readable action names for friendly sysctl interaction */ #define SECCOMP_RET_KILL_PROCESS_NAME … #define SECCOMP_RET_KILL_THREAD_NAME … #define SECCOMP_RET_TRAP_NAME … #define SECCOMP_RET_ERRNO_NAME … #define SECCOMP_RET_USER_NOTIF_NAME … #define SECCOMP_RET_TRACE_NAME … #define SECCOMP_RET_LOG_NAME … #define SECCOMP_RET_ALLOW_NAME … static const char seccomp_actions_avail[] = …; struct seccomp_log_name { … }; static const struct seccomp_log_name seccomp_log_names[] = …; static bool seccomp_names_from_actions_logged(char *names, size_t size, u32 actions_logged, const char *sep) { … } static bool seccomp_action_logged_from_name(u32 *action_logged, const char *name) { … } static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) { … } static int read_actions_logged(const struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos) { … } static int write_actions_logged(const struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos, u32 *actions_logged) { … } static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, int ret) { … } static int seccomp_actions_logged_handler(const struct ctl_table *ro_table, int write, void *buffer, size_t *lenp, loff_t *ppos) { … } static struct ctl_table seccomp_sysctl_table[] = …; static int __init seccomp_sysctl_init(void) { … } device_initcall(…) … #endif /* CONFIG_SYSCTL */ #ifdef CONFIG_SECCOMP_CACHE_DEBUG /* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */ static void 
proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name, const void *bitmap, size_t bitmap_size) { … } int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { … } #endif /* CONFIG_SECCOMP_CACHE_DEBUG */