// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/namespace.c * * (C) Copyright Al Viro 2000, 2001 * * Based on code from fs/super.c, copyright Linus Torvalds and others. * Heavily rewritten. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/capability.h> #include <linux/mnt_namespace.h> #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/idr.h> #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/file.h> #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> #include <linux/nospec.h> #include "pnode.h" #include "internal.h" /* Maximum number of mounts in a mount namespace */ static unsigned int sysctl_mount_max __read_mostly = …; static unsigned int m_hash_mask __ro_after_init; static unsigned int m_hash_shift __ro_after_init; static unsigned int mp_hash_mask __ro_after_init; static unsigned int mp_hash_shift __ro_after_init; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) { … } __setup(…); static __initdata unsigned long mphash_entries; static int __init set_mphash_entries(char *str) { … } __setup(…); static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); /* Don't allow confusion with old 32bit mount ID */ #define MNT_UNIQUE_ID_OFFSET … static atomic64_t mnt_id_ctr = …; static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static DEFINE_RWLOCK(mnt_ns_tree_lock); static struct rb_root mnt_ns_tree = …; /* protected by mnt_ns_tree_lock */ struct mount_kattr { … }; /* /sys/fs */ struct kobject *fs_kobj __ro_after_init; EXPORT_SYMBOL_GPL(…); /* * vfsmount lock may be taken for read to prevent changes to the * vfsmount hash, ie. during mountpoint lookups or walking back * up the tree. * * It should be taken for write in all cases where the vfsmount * tree or hash is modified or when a vfsmount structure is modified. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(…); static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns) { … } static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { … } static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b) { … } static void mnt_ns_tree_add(struct mnt_namespace *ns) { … } static void mnt_ns_release(struct mnt_namespace *ns) { … } DEFINE_FREE(…) static void mnt_ns_tree_remove(struct mnt_namespace *ns) { … } /* * Returns the mount namespace which either has the specified id, or has the * next smallest id afer the specified one. */ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) { … } /* * Lookup a mount namespace by id and take a passive reference count. Taking a * passive reference means the mount namespace can be emptied if e.g., the last * task holding an active reference exits. To access the mounts of the * namespace the @namespace_sem must first be acquired. 
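 * An illustrative caller sketch (not part of the original file; error
 * handling trimmed, and it assumes mnt_ns_release() is the matching
 * helper that drops the passive reference):
 *
 *	struct mnt_namespace *ns = lookup_mnt_ns(mnt_ns_id);
 *	if (ns) {
 *		down_read(&namespace_sem);
 *		// walk the namespace's mount rbtree; it may already be empty
 *		up_read(&namespace_sem);
 *		mnt_ns_release(ns);	// drop the passive reference
 *	}
 *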
 * If the namespace has already shut down before acquiring @namespace_sem,
 * {list,stat}mount() will see that the mount rbtree of the namespace is empty.
 */
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { … }

static inline void lock_mount_hash(void) { … }

static inline void unlock_mount_hash(void) { … }

static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { … }

static inline struct hlist_head *mp_hash(struct dentry *dentry) { … }

static int mnt_alloc_id(struct mount *mnt) { … }

static void mnt_free_id(struct mount *mnt) { … }

/*
 * Allocate a new peer group ID
 */
static int mnt_alloc_group_id(struct mount *mnt) { … }

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt) { … }

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n) { … }

/*
 * vfsmount lock must be held for write
 */
int mnt_get_count(struct mount *mnt) { … }

static struct mount *alloc_vfsmnt(const char *name) { … }

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*. This cannot and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
bool __mnt_is_readonly(struct vfsmount *mnt) { … }
EXPORT_SYMBOL_GPL(…);

static inline void mnt_inc_writers(struct mount *mnt) { … }

static inline void mnt_dec_writers(struct mount *mnt) { … }

static unsigned int mnt_get_writers(struct mount *mnt) { … }

static int mnt_is_readonly(struct vfsmount *mnt) { … }

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink(). We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * mnt_get_write_access - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt is read-write) before
 * returning success. This operation does not protect against the filesystem
 * being frozen. When the write operation is finished, mnt_put_write_access()
 * must be called. This is effectively a refcount.
 */
int mnt_get_write_access(struct vfsmount *m) { … }
EXPORT_SYMBOL_GPL(…);

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success. When the write operation is
 * finished, mnt_drop_write() must be called. This is effectively a refcount.
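 *
 * A minimal caller sketch (illustrative, not part of the original file;
 * do_modify_something() is a hypothetical helper standing in for the
 * actual write operation):
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	err = do_modify_something(path->dentry);
 *	mnt_drop_write(path->mnt);
 *	return err;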
 */
int mnt_want_write(struct vfsmount *m) { … }
EXPORT_SYMBOL_GPL(…);

/**
 * mnt_get_write_access_file - get write access to a file's mount
 * @file: the file whose mount is to be written to
 *
 * This is like mnt_get_write_access, but if @file is already open for write it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the check for emergency r/o remounts. This must be
 * paired with mnt_put_write_access_file.
 */
int mnt_get_write_access_file(struct file *file) { … }

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount is to be written to
 *
 * This is like mnt_want_write, but if the file is already open for writing it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the freeze protection and the check for emergency r/o
 * remounts. This must be paired with mnt_drop_write_file.
 */
int mnt_want_write_file(struct file *file) { … }
EXPORT_SYMBOL_GPL(…);

/**
 * mnt_put_write_access - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it. Must be matched with
 * mnt_get_write_access() call above.
 */
void mnt_put_write_access(struct vfsmount *mnt) { … }
EXPORT_SYMBOL_GPL(…);

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows the filesystem to be frozen again. Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt) { … }
EXPORT_SYMBOL_GPL(…);

void mnt_put_write_access_file(struct file *file) { … }

void mnt_drop_write_file(struct file *file) { … }
EXPORT_SYMBOL(…);

/**
 * mnt_hold_writers - prevent write access to the given mount
 * @mnt: mnt to prevent write access to
 *
 * Prevents write access to @mnt if there are no active writers for @mnt.
 * This function needs to be called and return successfully before changing
 * properties of @mnt that need to remain stable for callers with write access
 * to @mnt.
 *
 * After this function has been called successfully, callers must pair it with
 * a call to mnt_unhold_writers() in order to stop preventing write access to
 * @mnt.
 *
 * Context: This function expects lock_mount_hash() to be held serializing
 * setting MNT_WRITE_HOLD.
 * Return: On success 0 is returned.
 * On error, -EBUSY is returned.
 */
static inline int mnt_hold_writers(struct mount *mnt) { … }

/**
 * mnt_unhold_writers - stop preventing write access to the given mount
 * @mnt: mnt to stop preventing write access to
 *
 * Stop preventing write access to @mnt allowing callers to gain write access
 * to @mnt again.
 *
 * This function can only be called after a successful call to
 * mnt_hold_writers().
 *
 * Context: This function expects lock_mount_hash() to be held.
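 *
 * An illustrative pairing with mnt_hold_writers() (a sketch, not part of
 * the original file), based on the documented -EBUSY semantics above:
 *
 *	lock_mount_hash();
 *	err = mnt_hold_writers(mnt);	// -EBUSY if writers are active
 *	if (!err) {
 *		// change properties that must stay stable for writers
 *		mnt_unhold_writers(mnt);
 *	}
 *	unlock_mount_hash();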
*/ static inline void mnt_unhold_writers(struct mount *mnt) { … } static int mnt_make_readonly(struct mount *mnt) { … } int sb_prepare_remount_readonly(struct super_block *sb) { … } static void free_vfsmnt(struct mount *mnt) { … } static void delayed_free_vfsmnt(struct rcu_head *head) { … } /* call under rcu_read_lock */ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) { … } /* call under rcu_read_lock */ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { … } /** * __lookup_mnt - find first child mount * @mnt: parent mount * @dentry: mountpoint * * If @mnt has a child mount @c mounted @dentry find and return it. * * Note that the child mount @c need not be unique. There are cases * where shadow mounts are created. For example, during mount * propagation when a source mount @mnt whose root got overmounted by a * mount @o after path lookup but before @namespace_sem could be * acquired gets copied and propagated. So @mnt gets copied including * @o. When @mnt is propagated to a destination mount @d that already * has another mount @n mounted at the same mountpoint then the source * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt * on @dentry. * * Return: The first child of @mnt mounted @dentry or NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { … } /* * lookup_mnt - Return the first child mount mounted at path * * "First" means first mounted chronologically. If you create the * following mounts: * * mount /dev/sda1 /mnt * mount /dev/sda2 /mnt * mount /dev/sda3 /mnt * * Then lookup_mnt() on the base /mnt dentry in the root mount will * return successively the root dentry and vfsmount of /dev/sda1, then * /dev/sda2, then /dev/sda3, then NULL. * * lookup_mnt takes a reference to the found vfsmount. */ struct vfsmount *lookup_mnt(const struct path *path) { … } /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. * * The common case is dentries are not mountpoints at all and that * test is handled inline. For the slow case when we are actually * dealing with a mountpoint of some kind, walk through all of the * mounts in the current mount namespace and test to see if the dentry * is a mountpoint. * * The mount_hashtable is not usable in the context because we * need to identify all mounts that may be in the current mount * namespace not just a mount that happens to have some specified * parent mount. */ bool __is_local_mountpoint(struct dentry *dentry) { … } static struct mountpoint *lookup_mountpoint(struct dentry *dentry) { … } static struct mountpoint *get_mountpoint(struct dentry *dentry) { … } /* * vfsmount lock must be held. Additionally, the caller is responsible * for serializing calls for given disposal list. 
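 *
 * Illustrative disposal-list pattern (a sketch, not part of the original
 * file): the final dput() of the mountpoint's dentry may be deferred onto
 * @list so it can happen after the locks are dropped:
 *
 *	LIST_HEAD(list);
 *
 *	lock_mount_hash();
 *	__put_mountpoint(mp, &list);	// may queue the dentry on @list
 *	unlock_mount_hash();
 *	// dentries collected on @list are released only after unlocking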
*/ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) { … } /* called with namespace_lock and vfsmount lock */ static void put_mountpoint(struct mountpoint *mp) { … } static inline int check_mnt(struct mount *mnt) { … } /* * vfsmount lock must be held for write */ static void touch_mnt_namespace(struct mnt_namespace *ns) { … } /* * vfsmount lock must be held for write */ static void __touch_mnt_namespace(struct mnt_namespace *ns) { … } /* * vfsmount lock must be held for write */ static struct mountpoint *unhash_mnt(struct mount *mnt) { … } /* * vfsmount lock must be held for write */ static void umount_mnt(struct mount *mnt) { … } /* * vfsmount lock must be held for write */ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) { … } /** * mnt_set_mountpoint_beneath - mount a mount beneath another one * * @new_parent: the source mount * @top_mnt: the mount beneath which @new_parent is mounted * @new_mp: the new mountpoint of @top_mnt on @new_parent * * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and * parent @top_mnt->mnt_parent and mount it on top of @new_parent at * @new_mp. And mount @new_parent on the old parent and old * mountpoint of @top_mnt. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void mnt_set_mountpoint_beneath(struct mount *new_parent, struct mount *top_mnt, struct mountpoint *new_mp) { … } static void __attach_mnt(struct mount *mnt, struct mount *parent) { … } /** * attach_mnt - mount a mount, attach to @mount_hashtable and parent's * list of child mounts * @parent: the parent * @mnt: the new mount * @mp: the new mountpoint * @beneath: whether to mount @mnt beneath or on top of @parent * * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt * to @parent's child mount list and to @mount_hashtable. * * If @beneath is true, remove @mnt from its current parent and * mountpoint and mount it on @mp on @parent, and mount @parent on the * old parent and old mountpoint of @mnt. Finally, attach @parent to * @mnt_hashtable and @parent->mnt_parent->mnt_mounts. * * Note, when __attach_mnt() is called @mnt->mnt_parent already points * to the correct parent. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void attach_mnt(struct mount *mnt, struct mount *parent, struct mountpoint *mp, bool beneath) { … } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { … } static inline struct mount *node_to_mount(struct rb_node *node) { … } static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) { … } /* * vfsmount lock must be held for write */ static void commit_tree(struct mount *mnt) { … } static struct mount *next_mnt(struct mount *p, struct mount *root) { … } static struct mount *skip_mnt_tree(struct mount *p) { … } /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached * * Create a mount to an already configured superblock. If necessary, the * caller should invoke vfs_get_tree() before calling this. * * Note that this does not attach the mount to anything. 
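 *
 * Illustrative call sequence (a sketch, not part of the original file;
 * fc_mount() below follows essentially the same pattern):
 *
 *	struct fs_context *fc = fs_context_for_mount(type, sb_flags);
 *	if (IS_ERR(fc))
 *		return ERR_CAST(fc);
 *	ret = vfs_get_tree(fc);		// create/configure the superblock
 *	mnt = ret ? ERR_PTR(ret) : vfs_create_mount(fc);
 *	put_fs_context(fc);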
*/ struct vfsmount *vfs_create_mount(struct fs_context *fc) { … } EXPORT_SYMBOL(…); struct vfsmount *fc_mount(struct fs_context *fc) { … } EXPORT_SYMBOL(…); struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { … } EXPORT_SYMBOL_GPL(…); struct vfsmount * vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, const char *name, void *data) { … } EXPORT_SYMBOL_GPL(…); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { … } static void cleanup_mnt(struct mount *mnt) { … } static void __cleanup_mnt(struct rcu_head *head) { … } static LLIST_HEAD(delayed_mntput_list); static void delayed_mntput(struct work_struct *unused) { … } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { … } void mntput(struct vfsmount *mnt) { … } EXPORT_SYMBOL(…); struct vfsmount *mntget(struct vfsmount *mnt) { … } EXPORT_SYMBOL(…); /* * Make a mount point inaccessible to new lookups. * Because there may still be current users, the caller MUST WAIT * for an RCU grace period before destroying the mount point. */ void mnt_make_shortterm(struct vfsmount *mnt) { … } /** * path_is_mountpoint() - Check if path is a mount in the current namespace. * @path: path to check * * d_mountpoint() can only be used reliably to establish if a dentry is * not mounted in any namespace and that common case is handled inline. * d_mountpoint() isn't aware of the possibility there may be multiple * mounts using a given dentry in a different namespace. This function * checks if the passed in path is a mountpoint rather than the dentry * alone. */ bool path_is_mountpoint(const struct path *path) { … } EXPORT_SYMBOL(…); struct vfsmount *mnt_clone_internal(const struct path *path) { … } /* * Returns the mount which either has the specified mnt_id, or has the next * smallest id afer the specified one. */ static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) { … } /* * Returns the mount which either has the specified mnt_id, or has the next * greater id before the specified one. */ static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id) { … } #ifdef CONFIG_PROC_FS /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { … } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { … } static void m_stop(struct seq_file *m, void *v) { … } static int m_show(struct seq_file *m, void *v) { … } const struct seq_operations mounts_op = …; #endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy * @m: root of mount tree * * This is called to check if a tree of mounts has any * open files, pwds, chroots or sub mounts that are * busy. */ int may_umount_tree(struct vfsmount *m) { … } EXPORT_SYMBOL(…); /** * may_umount - check if a mount point is busy * @mnt: root of mount * * This is called to check if a mount point has any * open files, pwds, chroots or sub mounts. If the * mount has sub mounts this will return busy * regardless of whether the sub mounts are busy. * * Doesn't take quota and stuff into account. IOW, in some cases it will * give false negatives. The main reason why it's here is that we need * a non-destructive way to look for easily umountable filesystems. 
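 *
 * Illustrative use (a sketch, not part of the original file), in the
 * spirit of an automounter deciding whether a mount may be expired:
 *
 *	if (!may_umount(mnt))
 *		return -EBUSY;	// still pinned by open files, pwds or submounts
 *	// otherwise @mnt is a reasonable candidate for expiry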
 */
int may_umount(struct vfsmount *mnt) { … }
EXPORT_SYMBOL(…);

static void namespace_unlock(void) { … }

static inline void namespace_lock(void) { … }

enum umount_tree_flags { … };

static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) { … }

/*
 * mount_lock must be held
 * namespace_sem must be held for write
 */
static void umount_tree(struct mount *mnt, enum umount_tree_flags how) { … }

static void shrink_submounts(struct mount *mnt);

static int do_umount_root(struct super_block *sb) { … }

static int do_umount(struct mount *mnt, int flags) { … }

/*
 * __detach_mounts - lazily unmount all mounts on the specified dentry
 *
 * During unlink, rmdir, and d_drop it is possible to lose the path
 * to an existing mountpoint, and wind up leaking the mount.
 * detach_mounts allows lazily unmounting those mounts instead of
 * leaking them.
 *
 * The caller may hold dentry->d_inode->i_mutex.
 */
void __detach_mounts(struct dentry *dentry) { … }

/*
 * Is the caller allowed to modify its namespace?
 */
bool may_mount(void) { … }

static void warn_mandlock(void) { … }

static int can_umount(const struct path *path, int flags) { … }

// caller is responsible for flags being sane
int path_umount(struct path *path, int flags) { … }

static int ksys_umount(char __user *name, int flags) { … }

SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { … }

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 * The 2.0 compatible umount. No flags.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name) { … }

#endif

static bool is_mnt_ns_file(struct dentry *dentry) { … }

static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { … }

struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) { … }

static bool mnt_ns_loop(struct dentry *dentry) { … }

struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, int flag) { … }

/* Caller should check returned pointer for errors */
struct vfsmount *collect_mounts(const struct path *path) { … }

static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);

void dissolve_on_fput(struct vfsmount *mnt) { … }

void drop_collected_mounts(struct vfsmount *mnt) { … }

bool has_locked_children(struct mount *mnt, struct dentry *dentry) { … }

/**
 * clone_private_mount - create a private clone of a path
 * @path: path to clone
 *
 * This creates a new vfsmount, which will be the clone of @path. The new mount
 * will not be attached anywhere in the namespace and will be private (i.e.
 * changes to the originating mount won't be propagated into this).
 *
 * Release with mntput().
 */
struct vfsmount *clone_private_mount(const struct path *path) { … }
EXPORT_SYMBOL_GPL(…);

int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { … }

static void lock_mnt_tree(struct mount *mnt) { … }

static void cleanup_group_ids(struct mount *mnt, struct mount *end) { … }

static int invent_group_ids(struct mount *mnt, bool recurse) { … }

int count_mounts(struct mnt_namespace *ns, struct mount *mnt) { … }

enum mnt_tree_flags_t { … };

/**
 * attach_recursive_mnt - attach a source mount tree
 * @source_mnt: mount tree to be attached
 * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath
 * @dest_mp: the mountpoint @source_mnt will be mounted at
 * @flags: modify how @source_mnt is supposed to be attached
 *
 * NOTE: the table below explains the semantics when a source mount
 * of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 *       tree of the destination mount and the cloned mount is added to
 *       the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 *       source mount.
 *
 * ---------------------------------------------------------------------------
 * |         MOVE MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)   the mount is moved to the destination. And is then propagated to
 *       all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++) the mount is moved to the destination and is then propagated to
 *       all the mounts belonging to the destination mount's propagation tree.
 *       the mount is marked as 'shared and slave'.
 * (*)   the mount continues to be a slave at the new location.
 *
 * If the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 *
 * Context: The function expects namespace_lock() to be held.
 * Return: If @source_mnt was successfully attached 0 is returned.
 *         Otherwise a negative error code is returned.
 */
static int attach_recursive_mnt(struct mount *source_mnt, struct mount *top_mnt, struct mountpoint *dest_mp, enum mnt_tree_flags_t flags) { … }

/**
 * do_lock_mount - lock mount and mountpoint
 * @path: target path
 * @beneath: whether the intention is to mount beneath @path
 *
 * Follow the mount stack on @path until the top mount @mnt is found. If
 * the initial @path->{mnt,dentry} is a mountpoint lookup the first
 * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
 * until nothing is stacked on top of it anymore.
 *
 * Acquire the inode_lock() on the top mount's ->mnt_root to protect
 * against concurrent removal of the new mountpoint from another mount
 * namespace.
 *
 * If @beneath is requested, the inode_lock() of @mnt's mountpoint
 * @mp on @mnt->mnt_parent must be acquired.
This protects against a * concurrent unlink of @mp->mnt_dentry from another mount namespace * where @mnt doesn't have a child mount mounted @mp. A concurrent * removal of @mnt->mnt_root doesn't matter as nothing will be mounted * on top of it for @beneath. * * In addition, @beneath needs to make sure that @mnt hasn't been * unmounted or moved from its current mountpoint in between dropping * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt * being unmounted would be detected later by e.g., calling * check_mnt(mnt) in the function it's called from. For the @beneath * case however, it's useful to detect it directly in do_lock_mount(). * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL. * * Return: Either the target mountpoint on the top mount or the top * mount's mountpoint. */ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) { … } static inline struct mountpoint *lock_mount(struct path *path) { … } static void unlock_mount(struct mountpoint *where) { … } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) { … } /* * Sanity check the flags to change_mnt_propagation. */ static int flags_to_propagation_type(int ms_flags) { … } /* * recursively change the type of the mountpoint. */ static int do_change_type(struct path *path, int ms_flags) { … } static struct mount *__do_loopback(struct path *old_path, int recurse) { … } /* * do loopback mount. */ static int do_loopback(struct path *path, const char *old_name, int recurse) { … } static struct file *open_detached_copy(struct path *path, bool recursive) { … } SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) { … } /* * Don't allow locked mount flags to be cleared. * * No locks need to be held here while testing the various MNT_LOCK * flags because those flags can never be cleared once they are set. */ static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags) { … } static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) { … } static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { … } static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) { … } /* * Handle reconfiguration of the mountpoint only without alteration of the * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) { … } /* * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ static int do_remount(struct path *path, int ms_flags, int sb_flags, int mnt_flags, void *data) { … } static inline int tree_contains_unbindable(struct mount *mnt) { … } /* * Check that there aren't references to earlier/same mount namespaces in the * specified subtree. Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. */ static bool check_for_nsfs_mounts(struct mount *subtree) { … } static int do_set_group(struct path *from_path, struct path *to_path) { … } /** * path_overmounted - check if path is overmounted * @path: path to check * * Check if path is overmounted, i.e., if there's a mount on top of * @path->mnt with @path->dentry as mountpoint. 
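 *
 * A minimal sketch of such a check (illustrative; the elided body may
 * differ in details), reusing __lookup_mnt() documented earlier in this
 * file:
 *
 *	rcu_read_lock();
 *	overmounted = __lookup_mnt(path->mnt, path->dentry) != NULL;
 *	rcu_read_unlock();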
* * Context: This function expects namespace_lock() to be held. * Return: If path is overmounted true is returned, false if not. */ static inline bool path_overmounted(const struct path *path) { … } /** * can_move_mount_beneath - check that we can mount beneath the top mount * @from: mount to mount beneath * @to: mount under which to mount * @mp: mountpoint of @to * * - Make sure that @to->dentry is actually the root of a mount under * which we can mount another mount. * - Make sure that nothing can be mounted beneath the caller's current * root or the rootfs of the namespace. * - Make sure that the caller can unmount the topmost mount ensuring * that the caller could reveal the underlying mountpoint. * - Ensure that nothing has been mounted on top of @from before we * grabbed @namespace_sem to avoid creating pointless shadow mounts. * - Prevent mounting beneath a mount if the propagation relationship * between the source mount, parent mount, and top mount would lead to * nonsensical mount trees. * * Context: This function expects namespace_lock() to be held. * Return: On success 0, and on error a negative error code is returned. */ static int can_move_mount_beneath(const struct path *from, const struct path *to, const struct mountpoint *mp) { … } static int do_move_mount(struct path *old_path, struct path *new_path, bool beneath) { … } static int do_move_mount_old(struct path *path, const char *old_name) { … } /* * add a mount into a namespace's mount tree */ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, const struct path *path, int mnt_flags) { … } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); /* * Create a new mount using a superblock configuration and request it * be added to the namespace tree. */ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { … } /* * create a new mount for userspace and request it to be added into the * namespace's tree */ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { … } int finish_automount(struct vfsmount *m, const struct path *path) { … } /** * mnt_set_expiry - Put a mount on an expiration list * @mnt: The mount to list. * @expiry_list: The list to add the mount to. */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { … } EXPORT_SYMBOL(…); /* * process a list of expirable mountpoints with the intent of discarding any * mountpoints that aren't in use and haven't been touched since last we came * here */ void mark_mounts_for_expiry(struct list_head *mounts) { … } EXPORT_SYMBOL_GPL(…); /* * Ripoff of 'select_parent()' * * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ static int select_submounts(struct mount *parent, struct list_head *graveyard) { … } /* * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint * * mount_lock must be held for write */ static void shrink_submounts(struct mount *mnt) { … } static void *copy_mount_options(const void __user * data) { … } static char *copy_mount_string(const void __user *data) { … } /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). * * data is a (void *) that can point to any structure up to * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent * information (or be NULL). 
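 *
 * For example, a classic userspace mount(2) call passes fs-independent
 * flags together with an fs-specific option string as data (illustrative
 * sketch, not part of this file):
 *
 *	mount("/dev/sda1", "/mnt", "ext4", MS_RDONLY | MS_NOSUID,
 *	      "errors=remount-ro");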
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page) { … }

long do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { … }

static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) { … }

static void dec_mnt_namespaces(struct ucounts *ucounts) { … }

static void free_mnt_ns(struct mnt_namespace *ns) { … }

/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops. A 64bit
 * number incrementing once per nanosecond would take over 580 years
 * to wrap, and namespaces are created far more slowly than that, so
 * we can ignore the possibility.
 */
static atomic64_t mnt_ns_seq = …;

static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { … }

__latent_entropy
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { … }

struct dentry *mount_subtree(struct vfsmount *m, const char *name) { … }
EXPORT_SYMBOL(…);

SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { … }

#define FSMOUNT_VALID_FLAGS …
#define MOUNT_SETATTR_VALID_FLAGS …
#define MOUNT_SETATTR_PROPAGATION_FLAGS …

static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) { … }

/*
 * Create a kernel mount representation for a new, prepared superblock
 * (specified by fs_fd) and attach to an open_tree-like file descriptor.
 */
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int, attr_flags) { … }

/*
 * Move a mount from one place to another. In combination with
 * fsopen()/fsmount() this is used to install a new mount and in combination
 * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
 * a mount subtree.
 *
 * Note the flags value is a combination of MOVE_MOUNT_* flags.
 */
SYSCALL_DEFINE5(move_mount, int, from_dfd, const char __user *, from_pathname, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { … }

/*
 * Return true if path is reachable from root
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) { … }

bool path_is_under(const struct path *path1, const struct path *path2) { … }
EXPORT_SYMBOL(…);

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
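 *
 * Illustrative userspace sketch (not part of this file) of the classic
 * root-switch sequence, invoking pivot_root(2) via syscall() since libc
 * may not provide a wrapper:
 *
 *	chdir("/new_root");
 *	syscall(SYS_pivot_root, ".", "./old_root");	// "./old_root" must exist
 *	chroot(".");
 *	chdir("/");
 *	umount2("/old_root", MNT_DETACH);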
 * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
 * when the current root is on rootfs.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { … }

static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) { … }

static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { … }

/**
 * mnt_allow_writers() - check whether the attribute change allows writers
 * @kattr: the new mount attributes
 * @mnt: the mount to which @kattr will be applied
 *
 * Check whether the new mount attributes in @kattr allow concurrent writers.
 *
 * Return: true if the change keeps writers allowed (no need to hold them
 * off), false if writers must be held while applying @kattr.
 */
static inline bool mnt_allow_writers(const struct mount_kattr *kattr, const struct mount *mnt) { … }

static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) { … }

static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { … }

static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) { … }

static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) { … }

static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr, unsigned int flags) { … }

static int build_mount_kattr(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr, unsigned int flags) { … }

static void finish_mount_kattr(struct mount_kattr *kattr) { … }

SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, unsigned int, flags, struct mount_attr __user *, uattr, size_t, usize) { … }

int show_path(struct seq_file *m, struct dentry *root) { … }

static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns) { … }

struct kstatmount { … };

static u64 mnt_to_attr_flags(struct vfsmount *mnt) { … }

static u64 mnt_to_propagation_flags(struct mount *m) { … }

static void statmount_sb_basic(struct kstatmount *s) { … }

static void statmount_mnt_basic(struct kstatmount *s) { … }

static void statmount_propagate_from(struct kstatmount *s) { … }

static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq) { … }

static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq) { … }

static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) { … }

static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { … }

static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) { … }

static int statmount_string(struct kstatmount *s, u64 flag) { … }

static int copy_statmount_to_user(struct kstatmount *s) { … }

static struct mount *listmnt_next(struct mount *curr, bool reverse) { … }

static int grab_requested_root(struct mnt_namespace *ns, struct path *root) { … }

static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { … }

static inline bool retry_statmount(const long ret, size_t *seq_size) { … }

#define STATMOUNT_STRING_REQ …

static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, size_t seq_size) { … }

static int copy_mnt_id_req(const struct mnt_id_req __user *req, struct
mnt_id_req *kreq) { … } /* * If the user requested a specific mount namespace id, look that up and return * that, or if not simply grab a passive reference on our mount namespace and * return that. */ static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id) { … } SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, struct statmount __user *, buf, size_t, bufsize, unsigned int, flags) { … } static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, bool reverse) { … } SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { … } static void __init init_mount_tree(void) { … } void __init mnt_init(void) { … } void put_mnt_ns(struct mnt_namespace *ns) { … } struct vfsmount *kern_mount(struct file_system_type *type) { … } EXPORT_SYMBOL_GPL(…); void kern_unmount(struct vfsmount *mnt) { … } EXPORT_SYMBOL(…); void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) { … } EXPORT_SYMBOL(…); bool our_mnt(struct vfsmount *mnt) { … } bool current_chrooted(void) { … } static bool mnt_already_visible(struct mnt_namespace *ns, const struct super_block *sb, int *new_mnt_flags) { … } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { … } bool mnt_may_suid(struct vfsmount *mnt) { … } static struct ns_common *mntns_get(struct task_struct *task) { … } static void mntns_put(struct ns_common *ns) { … } static int mntns_install(struct nsset *nsset, struct ns_common *ns) { … } static struct user_namespace *mntns_owner(struct ns_common *ns) { … } const struct proc_ns_operations mntns_operations = …; #ifdef CONFIG_SYSCTL static struct ctl_table fs_namespace_sysctls[] = …; static int __init init_fs_namespace_sysctls(void) { … } fs_initcall(init_fs_namespace_sysctls); #endif /* CONFIG_SYSCTL */
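
/*
 * Illustrative userspace sketch (not part of the original file): the
 * fsmount() and move_mount() syscalls defined above are typically used
 * together with fsopen()/fsconfig() (implemented in fs/fsopen.c) to
 * create and attach a new mount; the wrappers are assumed to come from a
 * recent glibc or to be invoked via syscall():
 *
 *	int sfd = fsopen("ext4", FSOPEN_CLOEXEC);
 *	fsconfig(sfd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0);
 *	fsconfig(sfd, FSCONFIG_SET_FLAG, "ro", NULL, 0);
 *	fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
 *	int mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_NODEV);
 *	move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
 */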