// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/namespace.c * * (C) Copyright Al Viro 2000, 2001 * * Based on code from fs/super.c, copyright Linus Torvalds and others. * Heavily rewritten. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/capability.h> #include <linux/mnt_namespace.h> #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/idr.h> #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/file.h> #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> #include <linux/nospec.h> #include "pnode.h" #include "internal.h" /* Maximum number of mounts in a mount namespace */ static unsigned int sysctl_mount_max __read_mostly = …; static unsigned int m_hash_mask __ro_after_init; static unsigned int m_hash_shift __ro_after_init; static unsigned int mp_hash_mask __ro_after_init; static unsigned int mp_hash_shift __ro_after_init; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) { … } __setup(…); static __initdata unsigned long mphash_entries; static int __init set_mphash_entries(char *str) { … } __setup(…); static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); /* Don't allow confusion with old 32bit mount ID */ #define MNT_UNIQUE_ID_OFFSET … static atomic64_t mnt_id_ctr = …; static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static DEFINE_RWLOCK(mnt_ns_tree_lock); static struct rb_root mnt_ns_tree = …; /* protected by mnt_ns_tree_lock */ struct mount_kattr { … }; /* /sys/fs */ struct kobject *fs_kobj __ro_after_init; EXPORT_SYMBOL_GPL(…); /* * vfsmount lock may be taken for read to prevent changes to the * vfsmount hash, ie. during mountpoint lookups or walking back * up the tree. * * It should be taken for write in all cases where the vfsmount * tree or hash is modified or when a vfsmount structure is modified. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(…); static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns) { … } static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { … } static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b) { … } static void mnt_ns_tree_add(struct mnt_namespace *ns) { … } static void mnt_ns_release(struct mnt_namespace *ns) { … } DEFINE_FREE(…) static void mnt_ns_tree_remove(struct mnt_namespace *ns) { … } /* * Returns the mount namespace which either has the specified id, or has the * next smallest id afer the specified one. */ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) { … } /* * Lookup a mount namespace by id and take a passive reference count. Taking a * passive reference means the mount namespace can be emptied if e.g., the last * task holding an active reference exits. To access the mounts of the * namespace the @namespace_sem must first be acquired. 
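 * An illustrative caller sketch (not part of the original file; error
 * handling trimmed, and it assumes mnt_ns_release() is the matching
 * helper that drops the passive reference):
 *
 *	struct mnt_namespace *ns = lookup_mnt_ns(mnt_ns_id);
 *	if (ns) {
 *		down_read(&namespace_sem);
 *		// walk the namespace's mount rbtree; it may already be empty
 *		up_read(&namespace_sem);
 *		mnt_ns_release(ns);	// drop the passive reference
 *	}
 *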
 * If the namespace has already shut down before acquiring @namespace_sem,
 * {list,stat}mount() will see that the mount rbtree of the namespace is empty.
 */
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { … }

static inline void lock_mount_hash(void) { … }

static inline void unlock_mount_hash(void) { … }

static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { … }

static inline struct hlist_head *mp_hash(struct dentry *dentry) { … }

static int mnt_alloc_id(struct mount *mnt) { … }

static void mnt_free_id(struct mount *mnt) { … }

/*
 * Allocate a new peer group ID
 */
static int mnt_alloc_group_id(struct mount *mnt) { … }

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt) { … }

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n) { … }

/*
 * vfsmount lock must be held for write
 */
int mnt_get_count(struct mount *mnt) { … }

static struct mount *alloc_vfsmnt(const char *name) { … }

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*. This cannot and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
bool __mnt_is_readonly(struct vfsmount *mnt) { … }
EXPORT_SYMBOL_GPL(…);

static inline void mnt_inc_writers(struct mount *mnt) { … }

static inline void mnt_dec_writers(struct mount *mnt) { … }

static unsigned int mnt_get_writers(struct mount *mnt) { … }

static int mnt_is_readonly(struct vfsmount *mnt) { … }

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink(). We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * mnt_get_write_access - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt is read-write) before
 * returning success. This operation does not protect against the filesystem
 * being frozen. When the write operation is finished, mnt_put_write_access()
 * must be called. This is effectively a refcount.
 */
int mnt_get_write_access(struct vfsmount *m) { … }
EXPORT_SYMBOL_GPL(…);

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success. When the write operation is
 * finished, mnt_drop_write() must be called. This is effectively a refcount.
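 *
 * A minimal caller sketch (illustrative, not part of the original file;
 * do_modify_something() is a hypothetical helper standing in for the
 * actual write operation):
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	err = do_modify_something(path->dentry);
 *	mnt_drop_write(path->mnt);
 *	return err;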
 */
int mnt_want_write(struct vfsmount *m) { … }
EXPORT_SYMBOL_GPL(…);

/**
 * mnt_get_write_access_file - get write access to a file's mount
 * @file: the file whose mount is to be written to
 *
 * This is like mnt_get_write_access, but if @file is already open for write it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the check for emergency r/o remounts. This must be
 * paired with mnt_put_write_access_file.
 */
int mnt_get_write_access_file(struct file *file) { … }

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount is to be written to
 *
 * This is like mnt_want_write, but if the file is already open for writing it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the freeze protection and the check for emergency r/o
 * remounts. This must be paired with mnt_drop_write_file.
 */
int mnt_want_write_file(struct file *file) { … }
EXPORT_SYMBOL_GPL(…);

/**
 * mnt_put_write_access - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it. Must be matched with
 * mnt_get_write_access() call above.
 */
void mnt_put_write_access(struct vfsmount *mnt) { … }
EXPORT_SYMBOL_GPL(…);

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows the filesystem to be frozen again. Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt) { … }
EXPORT_SYMBOL_GPL(…);

void mnt_put_write_access_file(struct file *file) { … }

void mnt_drop_write_file(struct file *file) { … }
EXPORT_SYMBOL(…);

/**
 * mnt_hold_writers - prevent write access to the given mount
 * @mnt: mnt to prevent write access to
 *
 * Prevents write access to @mnt if there are no active writers for @mnt.
 * This function needs to be called and return successfully before changing
 * properties of @mnt that need to remain stable for callers with write access
 * to @mnt.
 *
 * After this function has been called successfully, callers must pair it with
 * a call to mnt_unhold_writers() in order to stop preventing write access to
 * @mnt.
 *
 * Context: This function expects lock_mount_hash() to be held serializing
 * setting MNT_WRITE_HOLD.
 * Return: On success 0 is returned.
 * On error, -EBUSY is returned.
 */
static inline int mnt_hold_writers(struct mount *mnt) { … }

/**
 * mnt_unhold_writers - stop preventing write access to the given mount
 * @mnt: mnt to stop preventing write access to
 *
 * Stop preventing write access to @mnt allowing callers to gain write access
 * to @mnt again.
 *
 * This function can only be called after a successful call to
 * mnt_hold_writers().
 *
 * Context: This function expects lock_mount_hash() to be held.
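 *
 * An illustrative pairing with mnt_hold_writers() (a sketch, not part of
 * the original file), based on the documented -EBUSY semantics above:
 *
 *	lock_mount_hash();
 *	err = mnt_hold_writers(mnt);	// -EBUSY if writers are active
 *	if (!err) {
 *		// change properties that must stay stable for writers
 *		mnt_unhold_writers(mnt);
 *	}
 *	unlock_mount_hash();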
*/ static inline void mnt_unhold_writers(struct mount *mnt) { … } static int mnt_make_readonly(struct mount *mnt) { … } int sb_prepare_remount_readonly(struct super_block *sb) { … } static void free_vfsmnt(struct mount *mnt) { … } static void delayed_free_vfsmnt(struct rcu_head *head) { … } /* call under rcu_read_lock */ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) { … } /* call under rcu_read_lock */ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { … } /** * __lookup_mnt - find first child mount * @mnt: parent mount * @dentry: mountpoint * * If @mnt has a child mount @c mounted @dentry find and return it. * * Note that the child mount @c need not be unique. There are cases * where shadow mounts are created. For example, during mount * propagation when a source mount @mnt whose root got overmounted by a * mount @o after path lookup but before @namespace_sem could be * acquired gets copied and propagated. So @mnt gets copied including * @o. When @mnt is propagated to a destination mount @d that already * has another mount @n mounted at the same mountpoint then the source * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt * on @dentry. * * Return: The first child of @mnt mounted @dentry or NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { … } /* * lookup_mnt - Return the first child mount mounted at path * * "First" means first mounted chronologically. If you create the * following mounts: * * mount /dev/sda1 /mnt * mount /dev/sda2 /mnt * mount /dev/sda3 /mnt * * Then lookup_mnt() on the base /mnt dentry in the root mount will * return successively the root dentry and vfsmount of /dev/sda1, then * /dev/sda2, then /dev/sda3, then NULL. * * lookup_mnt takes a reference to the found vfsmount. */ struct vfsmount *lookup_mnt(const struct path *path) { … } /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. * * The common case is dentries are not mountpoints at all and that * test is handled inline. For the slow case when we are actually * dealing with a mountpoint of some kind, walk through all of the * mounts in the current mount namespace and test to see if the dentry * is a mountpoint. * * The mount_hashtable is not usable in the context because we * need to identify all mounts that may be in the current mount * namespace not just a mount that happens to have some specified * parent mount. */ bool __is_local_mountpoint(struct dentry *dentry) { … } static struct mountpoint *lookup_mountpoint(struct dentry *dentry) { … } static struct mountpoint *get_mountpoint(struct dentry *dentry) { … } /* * vfsmount lock must be held. Additionally, the caller is responsible * for serializing calls for given disposal list. 
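 *
 * Illustrative disposal-list pattern (a sketch, not part of the original
 * file): the final dput() of the mountpoint's dentry may be deferred onto
 * @list so it can happen after the locks are dropped:
 *
 *	LIST_HEAD(list);
 *
 *	lock_mount_hash();
 *	__put_mountpoint(mp, &list);	// may queue the dentry on @list
 *	unlock_mount_hash();
 *	// dentries collected on @list are released only after unlocking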
*/ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) { … } /* called with namespace_lock and vfsmount lock */ static void put_mountpoint(struct mountpoint *mp) { … } static inline int check_mnt(struct mount *mnt) { … } /* * vfsmount lock must be held for write */ static void touch_mnt_namespace(struct mnt_namespace *ns) { … } /* * vfsmount lock must be held for write */ static void __touch_mnt_namespace(struct mnt_namespace *ns) { … } /* * vfsmount lock must be held for write */ static struct mountpoint *unhash_mnt(struct mount *mnt) { … } /* * vfsmount lock must be held for write */ static void umount_mnt(struct mount *mnt) { … } /* * vfsmount lock must be held for write */ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) { … } /** * mnt_set_mountpoint_beneath - mount a mount beneath another one * * @new_parent: the source mount * @top_mnt: the mount beneath which @new_parent is mounted * @new_mp: the new mountpoint of @top_mnt on @new_parent * * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and * parent @top_mnt->mnt_parent and mount it on top of @new_parent at * @new_mp. And mount @new_parent on the old parent and old * mountpoint of @top_mnt. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void mnt_set_mountpoint_beneath(struct mount *new_parent, struct mount *top_mnt, struct mountpoint *new_mp) { … } static void __attach_mnt(struct mount *mnt, struct mount *parent) { … } /** * attach_mnt - mount a mount, attach to @mount_hashtable and parent's * list of child mounts * @parent: the parent * @mnt: the new mount * @mp: the new mountpoint * @beneath: whether to mount @mnt beneath or on top of @parent * * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt * to @parent's child mount list and to @mount_hashtable. * * If @beneath is true, remove @mnt from its current parent and * mountpoint and mount it on @mp on @parent, and mount @parent on the * old parent and old mountpoint of @mnt. Finally, attach @parent to * @mnt_hashtable and @parent->mnt_parent->mnt_mounts. * * Note, when __attach_mnt() is called @mnt->mnt_parent already points * to the correct parent. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void attach_mnt(struct mount *mnt, struct mount *parent, struct mountpoint *mp, bool beneath) { … } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { … } static inline struct mount *node_to_mount(struct rb_node *node) { … } static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) { … } /* * vfsmount lock must be held for write */ static void commit_tree(struct mount *mnt) { … } static struct mount *next_mnt(struct mount *p, struct mount *root) { … } static struct mount *skip_mnt_tree(struct mount *p) { … } /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached * * Create a mount to an already configured superblock. If necessary, the * caller should invoke vfs_get_tree() before calling this. * * Note that this does not attach the mount to anything. 
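 *
 * Illustrative call sequence (a sketch, not part of the original file;
 * fc_mount() below follows essentially the same pattern):
 *
 *	struct fs_context *fc = fs_context_for_mount(type, sb_flags);
 *	if (IS_ERR(fc))
 *		return ERR_CAST(fc);
 *	ret = vfs_get_tree(fc);		// create/configure the superblock
 *	mnt = ret ? ERR_PTR(ret) : vfs_create_mount(fc);
 *	put_fs_context(fc);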
*/ struct vfsmount *vfs_create_mount(struct fs_context *fc) { … } EXPORT_SYMBOL(…); struct vfsmount *fc_mount(struct fs_context *fc) { … } EXPORT_SYMBOL(…); struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { … } EXPORT_SYMBOL_GPL(…); struct vfsmount * vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, const char *name, void *data) { … } EXPORT_SYMBOL_GPL(…); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { … } static void cleanup_mnt(struct mount *mnt) { … } static void __cleanup_mnt(struct rcu_head *head) { … } static LLIST_HEAD(delayed_mntput_list); static void delayed_mntput(struct work_struct *unused) { … } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { … } void mntput(struct vfsmount *mnt) { … } EXPORT_SYMBOL(…); struct vfsmount *mntget(struct vfsmount *mnt) { … } EXPORT_SYMBOL(…); /* * Make a mount point inaccessible to new lookups. * Because there may still be current users, the caller MUST WAIT * for an RCU grace period before destroying the mount point. */ void mnt_make_shortterm(struct vfsmount *mnt) { … } /** * path_is_mountpoint() - Check if path is a mount in the current namespace. * @path: path to check * * d_mountpoint() can only be used reliably to establish if a dentry is * not mounted in any namespace and that common case is handled inline. * d_mountpoint() isn't aware of the possibility there may be multiple * mounts using a given dentry in a different namespace. This function * checks if the passed in path is a mountpoint rather than the dentry * alone. */ bool path_is_mountpoint(const struct path *path) { … } EXPORT_SYMBOL(…); struct vfsmount *mnt_clone_internal(const struct path *path) { … } /* * Returns the mount which either has the specified mnt_id, or has the next * smallest id afer the specified one. */ static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) { … } /* * Returns the mount which either has the specified mnt_id, or has the next * greater id before the specified one. */ static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id) { … } #ifdef CONFIG_PROC_FS /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { … } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { … } static void m_stop(struct seq_file *m, void *v) { … } static int m_show(struct seq_file *m, void *v) { … } const struct seq_operations mounts_op = …; #endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy * @m: root of mount tree * * This is called to check if a tree of mounts has any * open files, pwds, chroots or sub mounts that are * busy. */ int may_umount_tree(struct vfsmount *m) { … } EXPORT_SYMBOL(…); /** * may_umount - check if a mount point is busy * @mnt: root of mount * * This is called to check if a mount point has any * open files, pwds, chroots or sub mounts. If the * mount has sub mounts this will return busy * regardless of whether the sub mounts are busy. * * Doesn't take quota and stuff into account. IOW, in some cases it will * give false negatives. The main reason why it's here is that we need * a non-destructive way to look for easily umountable filesystems. 
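 *
 * Illustrative use (a sketch, not part of the original file), in the
 * spirit of an automounter deciding whether a mount may be expired:
 *
 *	if (!may_umount(mnt))
 *		return -EBUSY;	// still pinned by open files, pwds or submounts
 *	// otherwise @mnt is a reasonable candidate for expiry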
 */
int may_umount(struct vfsmount *mnt) { … }
EXPORT_SYMBOL(…);

static void namespace_unlock(void) { … }

static inline void namespace_lock(void) { … }

enum umount_tree_flags { … };

static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) { … }

/*
 * mount_lock must be held
 * namespace_sem must be held for write
 */
static void umount_tree(struct mount *mnt, enum umount_tree_flags how) { … }

static void shrink_submounts(struct mount *mnt);

static int do_umount_root(struct super_block *sb) { … }

static int do_umount(struct mount *mnt, int flags) { … }

/*
 * __detach_mounts - lazily unmount all mounts on the specified dentry
 *
 * During unlink, rmdir, and d_drop it is possible to lose the path
 * to an existing mountpoint, and wind up leaking the mount.
 * detach_mounts allows lazily unmounting those mounts instead of
 * leaking them.
 *
 * The caller may hold dentry->d_inode->i_mutex.
 */
void __detach_mounts(struct dentry *dentry) { … }

/*
 * Is the caller allowed to modify its namespace?
 */
bool may_mount(void) { … }

static void warn_mandlock(void) { … }

static int can_umount(const struct path *path, int flags) { … }

// caller is responsible for flags being sane
int path_umount(struct path *path, int flags) { … }

static int ksys_umount(char __user *name, int flags) { … }

SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { … }

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 * The 2.0 compatible umount. No flags.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name) { … }

#endif

static bool is_mnt_ns_file(struct dentry *dentry) { … }

static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { … }

struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) { … }

static bool mnt_ns_loop(struct dentry *dentry) { … }

struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, int flag) { … }

/* Caller should check returned pointer for errors */
struct vfsmount *collect_mounts(const struct path *path) { … }

static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);

void dissolve_on_fput(struct vfsmount *mnt) { … }

void drop_collected_mounts(struct vfsmount *mnt) { … }

bool has_locked_children(struct mount *mnt, struct dentry *dentry) { … }

/**
 * clone_private_mount - create a private clone of a path
 * @path: path to clone
 *
 * This creates a new vfsmount, which will be the clone of @path. The new mount
 * will not be attached anywhere in the namespace and will be private (i.e.
 * changes to the originating mount won't be propagated into this).
 *
 * Release with mntput().
 */
struct vfsmount *clone_private_mount(const struct path *path) { … }
EXPORT_SYMBOL_GPL(…);

int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { … }

static void lock_mnt_tree(struct mount *mnt) { … }

static void cleanup_group_ids(struct mount *mnt, struct mount *end) { … }

static int invent_group_ids(struct mount *mnt, bool recurse) { … }

int count_mounts(struct mnt_namespace *ns, struct mount *mnt) { … }

enum mnt_tree_flags_t { … };

/**
 * attach_recursive_mnt - attach a source mount tree
 * @source_mnt: mount tree to be attached
 * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath
 * @dest_mp: the mountpoint @source_mnt will be mounted at
 * @flags: modify how @source_mnt is supposed to be attached
 *
 * NOTE: the table below explains the semantics when a source mount
 * of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 *       tree of the destination mount and the cloned mount is added to
 *       the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 *       source mount.
 *
 * ---------------------------------------------------------------------------
 * |         MOVE MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)   the mount is moved to the destination. And is then propagated to
 *       all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++) the mount is moved to the destination and is then propagated to
 *       all the mounts belonging to the destination mount's propagation tree.
 *       the mount is marked as 'shared and slave'.
 * (*)   the mount continues to be a slave at the new location.
 *
 * If the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 *
 * Context: The function expects namespace_lock() to be held.
 * Return: If @source_mnt was successfully attached 0 is returned.
 *         Otherwise a negative error code is returned.
 */
static int attach_recursive_mnt(struct mount *source_mnt, struct mount *top_mnt, struct mountpoint *dest_mp, enum mnt_tree_flags_t flags) { … }

/**
 * do_lock_mount - lock mount and mountpoint
 * @path: target path
 * @beneath: whether the intention is to mount beneath @path
 *
 * Follow the mount stack on @path until the top mount @mnt is found. If
 * the initial @path->{mnt,dentry} is a mountpoint lookup the first
 * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
 * until nothing is stacked on top of it anymore.
 *
 * Acquire the inode_lock() on the top mount's ->mnt_root to protect
 * against concurrent removal of the new mountpoint from another mount
 * namespace.
 *
 * If @beneath is requested, the inode_lock() of @mnt's mountpoint
 * @mp on @mnt->mnt_parent must be acquired.
This protects against a * concurrent unlink of @mp->mnt_dentry from another mount namespace * where @mnt doesn't have a child mount mounted @mp. A concurrent * removal of @mnt->mnt_root doesn't matter as nothing will be mounted * on top of it for @beneath. * * In addition, @beneath needs to make sure that @mnt hasn't been * unmounted or moved from its current mountpoint in between dropping * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt * being unmounted would be detected later by e.g., calling * check_mnt(mnt) in the function it's called from. For the @beneath * case however, it's useful to detect it directly in do_lock_mount(). * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL. * * Return: Either the target mountpoint on the top mount or the top * mount's mountpoint. */ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) { … } static inline struct mountpoint *lock_mount(struct path *path) { … } static void unlock_mount(struct mountpoint *where) { … } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) { … } /* * Sanity check the flags to change_mnt_propagation. */ static int flags_to_propagation_type(int ms_flags) { … } /* * recursively change the type of the mountpoint. */ static int do_change_type(struct path *path, int ms_flags) { … } static struct mount *__do_loopback(struct path *old_path, int recurse) { … } /* * do loopback mount. */ static int do_loopback(struct path *path, const char *old_name, int recurse) { … } static struct file *open_detached_copy(struct path *path, bool recursive) { … } SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) { … } /* * Don't allow locked mount flags to be cleared. * * No locks need to be held here while testing the various MNT_LOCK * flags because those flags can never be cleared once they are set. */ static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags) { … } static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) { … } static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { … } static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) { … } /* * Handle reconfiguration of the mountpoint only without alteration of the * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) { … } /* * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ static int do_remount(struct path *path, int ms_flags, int sb_flags, int mnt_flags, void *data) { … } static inline int tree_contains_unbindable(struct mount *mnt) { … } /* * Check that there aren't references to earlier/same mount namespaces in the * specified subtree. Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. */ static bool check_for_nsfs_mounts(struct mount *subtree) { … } static int do_set_group(struct path *from_path, struct path *to_path) { … } /** * path_overmounted - check if path is overmounted * @path: path to check * * Check if path is overmounted, i.e., if there's a mount on top of * @path->mnt with @path->dentry as mountpoint. 
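 *
 * A minimal sketch of such a check (illustrative; the elided body may
 * differ in details), reusing __lookup_mnt() documented earlier in this
 * file:
 *
 *	rcu_read_lock();
 *	overmounted = __lookup_mnt(path->mnt, path->dentry) != NULL;
 *	rcu_read_unlock();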
* * Context: This function expects namespace_lock() to be held. * Return: If path is overmounted true is returned, false if not. */ static inline bool path_overmounted(const struct path *path) { … } /** * can_move_mount_beneath - check that we can mount beneath the top mount * @from: mount to mount beneath * @to: mount under which to mount * @mp: mountpoint of @to * * - Make sure that @to->dentry is actually the root of a mount under * which we can mount another mount. * - Make sure that nothing can be mounted beneath the caller's current * root or the rootfs of the namespace. * - Make sure that the caller can unmount the topmost mount ensuring * that the caller could reveal the underlying mountpoint. * - Ensure that nothing has been mounted on top of @from before we * grabbed @namespace_sem to avoid creating pointless shadow mounts. * - Prevent mounting beneath a mount if the propagation relationship * between the source mount, parent mount, and top mount would lead to * nonsensical mount trees. * * Context: This function expects namespace_lock() to be held. * Return: On success 0, and on error a negative error code is returned. */ static int can_move_mount_beneath(const struct path *from, const struct path *to, const struct mountpoint *mp) { … } static int do_move_mount(struct path *old_path, struct path *new_path, bool beneath) { … } static int do_move_mount_old(struct path *path, const char *old_name) { … } /* * add a mount into a namespace's mount tree */ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, const struct path *path, int mnt_flags) { … } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); /* * Create a new mount using a superblock configuration and request it * be added to the namespace tree. */ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { … } /* * create a new mount for userspace and request it to be added into the * namespace's tree */ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { … } int finish_automount(struct vfsmount *m, const struct path *path) { … } /** * mnt_set_expiry - Put a mount on an expiration list * @mnt: The mount to list. * @expiry_list: The list to add the mount to. */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { … } EXPORT_SYMBOL(…); /* * process a list of expirable mountpoints with the intent of discarding any * mountpoints that aren't in use and haven't been touched since last we came * here */ void mark_mounts_for_expiry(struct list_head *mounts) { … } EXPORT_SYMBOL_GPL(…); /* * Ripoff of 'select_parent()' * * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ static int select_submounts(struct mount *parent, struct list_head *graveyard) { … } /* * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint * * mount_lock must be held for write */ static void shrink_submounts(struct mount *mnt) { … } static void *copy_mount_options(const void __user * data) { … } static char *copy_mount_string(const void __user *data) { … } /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). * * data is a (void *) that can point to any structure up to * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent * information (or be NULL). 
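 *
 * For example, a classic userspace mount(2) call passes fs-independent
 * flags together with an fs-specific option string as data (illustrative
 * sketch, not part of this file):
 *
 *	mount("/dev/sda1", "/mnt", "ext4", MS_RDONLY | MS_NOSUID,
 *	      "errors=remount-ro");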
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page) { … }

long do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { … }

static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) { … }

static void dec_mnt_namespaces(struct ucounts *ucounts) { … }

static void free_mnt_ns(struct mnt_namespace *ns) { … }

/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops. A 64bit
 * number incrementing once per nanosecond would take over 580 years
 * to wrap, and namespaces are created far more slowly than that, so
 * we can ignore the possibility.
 */
static atomic64_t mnt_ns_seq = …;

static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { … }

__latent_entropy
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { … }

struct dentry *mount_subtree(struct vfsmount *m, const char *name) { … }
EXPORT_SYMBOL(…);

SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { … }

#define FSMOUNT_VALID_FLAGS …
#define MOUNT_SETATTR_VALID_FLAGS …
#define MOUNT_SETATTR_PROPAGATION_FLAGS …

static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) { … }

/*
 * Create a kernel mount representation for a new, prepared superblock
 * (specified by fs_fd) and attach to an open_tree-like file descriptor.
 */
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int, attr_flags) { … }

/*
 * Move a mount from one place to another. In combination with
 * fsopen()/fsmount() this is used to install a new mount and in combination
 * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
 * a mount subtree.
 *
 * Note the flags value is a combination of MOVE_MOUNT_* flags.
 */
SYSCALL_DEFINE5(move_mount, int, from_dfd, const char __user *, from_pathname, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { … }

/*
 * Return true if path is reachable from root
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) { … }

bool path_is_under(const struct path *path1, const struct path *path2) { … }
EXPORT_SYMBOL(…);

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
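 *
 * Illustrative userspace sketch (not part of this file) of the classic
 * root-switch sequence, invoking pivot_root(2) via syscall() since libc
 * may not provide a wrapper:
 *
 *	chdir("/new_root");
 *	syscall(SYS_pivot_root, ".", "./old_root");	// "./old_root" must exist
 *	chroot(".");
 *	chdir("/");
 *	umount2("/old_root", MNT_DETACH);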
 * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
 * when the current root is on rootfs.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { … }

static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) { … }

static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { … }

/**
 * mnt_allow_writers() - check whether the attribute change allows writers
 * @kattr: the new mount attributes
 * @mnt: the mount to which @kattr will be applied
 *
 * Check whether the new mount attributes in @kattr allow concurrent writers.
 *
 * Return: true if the change keeps writers allowed (no need to hold them
 * off), false if writers must be held while applying @kattr.
 */
static inline bool mnt_allow_writers(const struct mount_kattr *kattr, const struct mount *mnt) { … }

static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) { … }

static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { … }

static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) { … }

static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) { … }

static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr, unsigned int flags) { … }

static int build_mount_kattr(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr, unsigned int flags) { … }

static void finish_mount_kattr(struct mount_kattr *kattr) { … }

SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, unsigned int, flags, struct mount_attr __user *, uattr, size_t, usize) { … }

int show_path(struct seq_file *m, struct dentry *root) { … }

static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns) { … }

struct kstatmount { … };

static u64 mnt_to_attr_flags(struct vfsmount *mnt) { … }

static u64 mnt_to_propagation_flags(struct mount *m) { … }

static void statmount_sb_basic(struct kstatmount *s) { … }

static void statmount_mnt_basic(struct kstatmount *s) { … }

static void statmount_propagate_from(struct kstatmount *s) { … }

static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq) { … }

static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq) { … }

static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) { … }

static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { … }

static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) { … }

static int statmount_string(struct kstatmount *s, u64 flag) { … }

static int copy_statmount_to_user(struct kstatmount *s) { … }

static struct mount *listmnt_next(struct mount *curr, bool reverse) { … }

static int grab_requested_root(struct mnt_namespace *ns, struct path *root) { … }

static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { … }

static inline bool retry_statmount(const long ret, size_t *seq_size) { … }

#define STATMOUNT_STRING_REQ …

static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, size_t seq_size) { … }

static int copy_mnt_id_req(const struct mnt_id_req __user *req, struct
mnt_id_req *kreq) { … } /* * If the user requested a specific mount namespace id, look that up and return * that, or if not simply grab a passive reference on our mount namespace and * return that. */ static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id) { … } SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, struct statmount __user *, buf, size_t, bufsize, unsigned int, flags) { … } static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, bool reverse) { … } SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { … } static void __init init_mount_tree(void) { … } void __init mnt_init(void) { … } void put_mnt_ns(struct mnt_namespace *ns) { … } struct vfsmount *kern_mount(struct file_system_type *type) { … } EXPORT_SYMBOL_GPL(…); void kern_unmount(struct vfsmount *mnt) { … } EXPORT_SYMBOL(…); void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) { … } EXPORT_SYMBOL(…); bool our_mnt(struct vfsmount *mnt) { … } bool current_chrooted(void) { … } static bool mnt_already_visible(struct mnt_namespace *ns, const struct super_block *sb, int *new_mnt_flags) { … } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { … } bool mnt_may_suid(struct vfsmount *mnt) { … } static struct ns_common *mntns_get(struct task_struct *task) { … } static void mntns_put(struct ns_common *ns) { … } static int mntns_install(struct nsset *nsset, struct ns_common *ns) { … } static struct user_namespace *mntns_owner(struct ns_common *ns) { … } const struct proc_ns_operations mntns_operations = …; #ifdef CONFIG_SYSCTL static struct ctl_table fs_namespace_sysctls[] = …; static int __init init_fs_namespace_sysctls(void) { … } fs_initcall(init_fs_namespace_sysctls); #endif /* CONFIG_SYSCTL */
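
/*
 * Illustrative userspace sketch (not part of the original file): the
 * fsmount() and move_mount() syscalls defined above are typically used
 * together with fsopen()/fsconfig() (implemented in fs/fsopen.c) to
 * create and attach a new mount; the wrappers are assumed to come from a
 * recent glibc or to be invoked via syscall():
 *
 *	int sfd = fsopen("ext4", FSOPEN_CLOEXEC);
 *	fsconfig(sfd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0);
 *	fsconfig(sfd, FSCONFIG_SET_FLAG, "ro", NULL, 0);
 *	fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
 *	int mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_NODEV);
 *	move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
 */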