vfs.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0
/*
 * File operations used by nfsd. Some of these have been ripped from
 * other parts of the kernel because they weren't exported, others
 * are partial duplicates with added or changed functionality.
 *
 * Note that several functions dget() the dentry upon which they want
 * to act, most notably those that create directory entries. Response
 * dentry's are dput()'d if necessary in the release callback.
 * So if you notice code paths that apparently fail to dput() the
 * dentry, don't worry--they have been taken care of.
 *
 * Copyright (C) 1995-1999 Olaf Kirch <[email protected]>
 * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <[email protected]>
 */

#include <linux/fs.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/falloc.h>
#include <linux/fcntl.h>
#include <linux/namei.h>
#include <linux/delay.h>
#include <linux/fsnotify.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
#include <linux/security.h>

#include "xdr3.h"

#ifdef CONFIG_NFSD_V4
#include "../internal.h"
#include "acl.h"
#include "idmap.h"
#include "xdr4.h"
#endif /* CONFIG_NFSD_V4 */

#include "nfsd.h"
#include "vfs.h"
#include "filecache.h"
#include "trace.h"

#define NFSDDBG_FACILITY …

/**
 * nfserrno - Map Linux errnos to NFS errnos
 * @errno: POSIX(-ish) error code to be mapped
 *
 * Returns the appropriate (net-endian) nfserr_* (or nfs_ok if errno is 0). If
 * it's an error we don't expect, log it once and return nfserr_io.
 */
__be32
nfserrno (int errno)
{ … }

/* 
 * Called from nfsd_lookup and encode_dirent. Check if we have crossed 
 * a mount point.
 * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged,
 *  or nfs_ok having possibly changed *dpp and *expp
 */
int
nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, 
		        struct svc_export **expp)
{ … }

static void follow_to_parent(struct path *path)
{ … }

static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp)
{ … }

/*
 * For nfsd purposes, we treat V4ROOT exports as though there was an
 * export at *every* directory.
 * We return:
 * '1' if this dentry *must* be an export point,
 * '2' if it might be, if there is really a mount here, and
 * '0' if there is no chance of an export point here.
 */
int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
{ … }

__be32
nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
		   const char *name, unsigned int len,
		   struct svc_export **exp_ret, struct dentry **dentry_ret)
{ … }

/**
 * nfsd_lookup - look up a single path component for nfsd
 *
 * @rqstp:   the request context
 * @fhp:     the file handle of the directory
 * @name:    the component name, or %NULL to look up parent
 * @len:     length of name to examine
 * @resfh:   pointer to pre-initialised filehandle to hold result.
 *
 * Look up one component of a pathname.
 * N.B. After this call _both_ fhp and resfh need an fh_put
 *
 * If the lookup would cross a mountpoint, and the mounted filesystem
 * is exported to the client with NFSEXP_NOHIDE, then the lookup is
 * accepted as it stands and the mounted directory is
 * returned. Otherwise the covered directory is returned.
 * NOTE: this mountpoint crossing is not supported properly by all
 *   clients and is explicitly disallowed for NFSv3
 *
 */
__be32
nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
	    unsigned int len, struct svc_fh *resfh)
{ … }

static void
commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst *rqstp,
			    int err)
{ … }

/*
 * Commit metadata changes to stable storage.
 */
static int
commit_inode_metadata(struct inode *inode)
{ … }

static int
commit_metadata(struct svc_fh *fhp)
{ … }

/*
 * Go over the attributes and take care of the small differences between
 * NFS semantics and what Linux expects.
 */
static void
nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
{ … }

static __be32
nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
		struct iattr *iap)
{ … }

static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
{ … }

/**
 * nfsd_setattr - Set various file attributes.
 * @rqstp: controlling RPC transaction
 * @fhp: filehandle of target
 * @attr: attributes to set
 * @guardtime: do not act if ctime.tv_sec does not match this timestamp
 *
 * This call may adjust the contents of @attr (in particular, this
 * call may change the bits in the na_iattr.ia_valid field).
 *
 * Returns nfs_ok on success, otherwise an NFS status code is
 * returned. Caller must release @fhp by calling fh_put in either
 * case.
 */
__be32
nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
	     struct nfsd_attrs *attr, const struct timespec64 *guardtime)
{ … }

#if defined(CONFIG_NFSD_V4)
/*
 * NFS junction information is stored in an extended attribute.
 */
#define NFSD_JUNCTION_XATTR_NAME …

/**
 * nfsd4_is_junction - Test if an object could be an NFS junction
 *
 * @dentry: object to test
 *
 * Returns 1 if "dentry" appears to contain NFS junction information.
 * Otherwise 0 is returned.
 */
int nfsd4_is_junction(struct dentry *dentry)
{ … }

static struct nfsd4_compound_state *nfsd4_get_cstate(struct svc_rqst *rqstp)
{ … }

__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
		struct nfsd_file *nf_src, u64 src_pos,
		struct nfsd_file *nf_dst, u64 dst_pos,
		u64 count, bool sync)
{ … }

ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
			     u64 dst_pos, u64 count)
{ … }

__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
			   struct file *file, loff_t offset, loff_t len,
			   int flags)
{ … }
#endif /* defined(CONFIG_NFSD_V4) */

/*
 * Check server access rights to a file system object
 */
struct accessmap { … };
static struct accessmap	nfs3_regaccess[] = …;

static struct accessmap	nfs3_diraccess[] = …;

static struct accessmap	nfs3_anyaccess[] = …;

__be32
nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
{ … }

int nfsd_open_break_lease(struct inode *inode, int access)
{ … }

/*
 * Open an existing file or directory.
 * The may_flags argument indicates the type of open (read/write/lock)
 * and additional flags.
 * N.B. After this call fhp needs an fh_put
 */
static int
__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
			int may_flags, struct file **filp)
{ … }

__be32
nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
		int may_flags, struct file **filp)
{ … }

/**
 * nfsd_open_verified - Open a regular file for the filecache
 * @rqstp: RPC request
 * @fhp: NFS filehandle of the file to open
 * @may_flags: internal permission flags
 * @filp: OUT: open "struct file *"
 *
 * Returns zero on success, or a negative errno value.
 */
int
nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
		   struct file **filp)
{ … }

/*
 * Grab and keep cached pages associated with a file in the svc_rqst
 * so that they can be passed to the network sendmsg routines
 * directly. They will be released after the sending has completed.
 *
 * Return values: Number of bytes consumed, or -EIO if there are no
 * remaining pages in rqstp->rq_pages.
 */
static int
nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
		  struct splice_desc *sd)
{ … }

static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
				    struct splice_desc *sd)
{ … }

static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len,
		size_t expected)
{ … }

static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
			       struct file *file, loff_t offset,
			       unsigned long *count, u32 *eof, ssize_t host_err)
{ … }

/**
 * nfsd_splice_read - Perform a VFS read using a splice pipe
 * @rqstp: RPC transaction context
 * @fhp: file handle of file to be read
 * @file: opened struct file of file to be read
 * @offset: starting byte offset
 * @count: IN: requested number of bytes; OUT: number of bytes read
 * @eof: OUT: set non-zero if operation reached the end of the file
 *
 * Returns nfs_ok on success, otherwise an nfserr stat value is
 * returned.
 */
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
			struct file *file, loff_t offset, unsigned long *count,
			u32 *eof)
{ … }

/**
 * nfsd_iter_read - Perform a VFS read using an iterator
 * @rqstp: RPC transaction context
 * @fhp: file handle of file to be read
 * @file: opened struct file of file to be read
 * @offset: starting byte offset
 * @count: IN: requested number of bytes; OUT: number of bytes read
 * @base: offset in first page of read buffer
 * @eof: OUT: set non-zero if operation reached the end of the file
 *
 * Some filesystems or situations cannot use nfsd_splice_read. This
 * function is the slightly less-performant fallback for those cases.
 *
 * Returns nfs_ok on success, otherwise an nfserr stat value is
 * returned.
 */
__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
		      struct file *file, loff_t offset, unsigned long *count,
		      unsigned int base, u32 *eof)
{ … }

/*
 * Gathered writes: If another process is currently writing to the file,
 * there's a high chance this is another nfsd (triggered by a bulk write
 * from a client's biod). Rather than syncing the file with each write
 * request, we sleep for 10 msec.
 *
 * I don't know if this roughly approximates C. Juszak's idea of
 * gathered writes, but it's a nice and simple solution (IMHO), and it
 * seems to work:-)
 *
 * Note: we do this only in the NFSv2 case, since v3 and higher have a
 * better tool (separate unstable writes and commits) for solving this
 * problem.
 */
static int wait_for_concurrent_writes(struct file *file)
{ … }

__be32
nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
				loff_t offset, struct kvec *vec, int vlen,
				unsigned long *cnt, int stable,
				__be32 *verf)
{ … }

/**
 * nfsd_read_splice_ok - check if spliced reading is supported
 * @rqstp: RPC transaction context
 *
 * Return values:
 *   %true: nfsd_splice_read() may be used
 *   %false: nfsd_splice_read() must not be used
 *
 * NFS READ normally uses splice to send data in-place. However the
 * data in cache can change after the reply's MIC is computed but
 * before the RPC reply is sent. To prevent the client from
 * rejecting the server-computed MIC in this somewhat rare case, do
 * not use splice with the GSS integrity and privacy services.
 */
bool nfsd_read_splice_ok(struct svc_rqst *rqstp)
{ … }

/**
 * nfsd_read - Read data from a file
 * @rqstp: RPC transaction context
 * @fhp: file handle of file to be read
 * @offset: starting byte offset
 * @count: IN: requested number of bytes; OUT: number of bytes read
 * @eof: OUT: set non-zero if operation reached the end of the file
 *
 * The caller must verify that there is enough space in @rqstp.rq_res
 * to perform this operation.
 *
 * N.B. After this call fhp needs an fh_put
 *
 * Returns nfs_ok on success, otherwise an nfserr stat value is
 * returned.
 */
__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
		 loff_t offset, unsigned long *count, u32 *eof)
{ … }

/*
 * Write data to a file.
 * The stable flag requests synchronous writes.
 * N.B. After this call fhp needs an fh_put
 */
__be32
nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
	   struct kvec *vec, int vlen, unsigned long *cnt, int stable,
	   __be32 *verf)
{ … }

/**
 * nfsd_commit - Commit pending writes to stable storage
 * @rqstp: RPC request being processed
 * @fhp: NFS filehandle
 * @nf: target file
 * @offset: raw offset from beginning of file
 * @count: raw count of bytes to sync
 * @verf: filled in with the server's current write verifier
 *
 * Note: we guarantee that data that lies within the range specified
 * by the 'offset' and 'count' parameters will be synced. The server
 * is permitted to sync data that lies outside this range at the
 * same time.
 *
 * Unfortunately we cannot lock the file to make sure we return full WCC
 * data to the client, as locking happens lower down in the filesystem.
 *
 * Return values:
 *   An nfsstat value in network byte order.
 */
__be32
nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
	    u64 offset, u32 count, __be32 *verf)
{ … }

/**
 * nfsd_create_setattr - Set a created file's attributes
 * @rqstp: RPC transaction being executed
 * @fhp: NFS filehandle of parent directory
 * @resfhp: NFS filehandle of new object
 * @attrs: requested attributes of new object
 *
 * Returns nfs_ok on success, or an nfsstat in network byte order.
 */
__be32
nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
		    struct svc_fh *resfhp, struct nfsd_attrs *attrs)
{ … }

/* HPUX client sometimes creates a file in mode 000, and sets size to 0.
 * setting size to 0 may fail for some specific file systems by the permission
 * checking which requires WRITE permission but the mode is 000.
 * we ignore the resizing(to 0) on the just new created file, since the size is
 * 0 after file created.
 *
 * call this only after vfs_create() is called.
 * */
static void
nfsd_check_ignore_resizing(struct iattr *iap)
{ … }

/* The parent directory should already be locked: */
__be32
nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
		   struct nfsd_attrs *attrs,
		   int type, dev_t rdev, struct svc_fh *resfhp)
{ … }

/*
 * Create a filesystem object (regular, directory, special).
 * Note that the parent directory is left locked.
 *
 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
 */
__be32
nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
	    char *fname, int flen, struct nfsd_attrs *attrs,
	    int type, dev_t rdev, struct svc_fh *resfhp)
{ … }

/*
 * Read a symlink. On entry, *lenp must contain the maximum path length that
 * fits into the buffer. On return, it contains the true length.
 * N.B. After this call fhp needs an fh_put
 */
__be32
nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
{ … }

/**
 * nfsd_symlink - Create a symlink and look up its inode
 * @rqstp: RPC transaction being executed
 * @fhp: NFS filehandle of parent directory
 * @fname: filename of the new symlink
 * @flen: length of @fname
 * @path: content of the new symlink (NUL-terminated)
 * @attrs: requested attributes of new object
 * @resfhp: NFS filehandle of new object
 *
 * N.B. After this call _both_ fhp and resfhp need an fh_put
 *
 * Returns nfs_ok on success, or an nfsstat in network byte order.
 */
__be32
nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
	     char *fname, int flen,
	     char *path, struct nfsd_attrs *attrs,
	     struct svc_fh *resfhp)
{ … }

/*
 * Create a hardlink
 * N.B. After this call _both_ ffhp and tfhp need an fh_put
 */
__be32
nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
				char *name, int len, struct svc_fh *tfhp)
{ … }

static void
nfsd_close_cached_files(struct dentry *dentry)
{ … }

static bool
nfsd_has_cached_files(struct dentry *dentry)
{ … }

/*
 * Rename a file
 * N.B. After this call _both_ ffhp and tfhp need an fh_put
 */
__be32
nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
			    struct svc_fh *tfhp, char *tname, int tlen)
{ … }

/*
 * Unlink a file or directory
 * N.B. After this call fhp needs an fh_put
 */
__be32
nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
				char *fname, int flen)
{ … }

/*
 * We do this buffering because we must not call back into the file
 * system's ->lookup() method from the filldir callback. That may well
 * deadlock a number of file systems.
 *
 * This is based heavily on the implementation of same in XFS.
 */
struct buffered_dirent { … };

struct readdir_data { … };

static bool nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
				 int namlen, loff_t offset, u64 ino,
				 unsigned int d_type)
{ … }

static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp,
				    nfsd_filldir_t func, struct readdir_cd *cdp,
				    loff_t *offsetp)
{ … }

/**
 * nfsd_readdir - Read entries from a directory
 * @rqstp: RPC transaction context
 * @fhp: NFS file handle of directory to be read
 * @offsetp: OUT: seek offset of final entry that was read
 * @cdp: OUT: an eof error value
 * @func: entry filler actor
 *
 * This implementation ignores the NFSv3/4 verifier cookie.
 *
 * NB: normal system calls hold file->f_pos_lock when calling
 * ->iterate_shared and ->llseek, but nfsd_readdir() does not.
 * Because the struct file acquired here is not visible to other
 * threads, it's internal state does not need mutex protection.
 *
 * Returns nfs_ok on success, otherwise an nfsstat code is
 * returned.
 */
__be32
nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, 
	     struct readdir_cd *cdp, nfsd_filldir_t func)
{ … }

/**
 * nfsd_filp_close: close a file synchronously
 * @fp: the file to close
 *
 * nfsd_filp_close() is similar in behaviour to filp_close().
 * The difference is that if this is the final close on the
 * file, the that finalisation happens immediately, rather then
 * being handed over to a work_queue, as it the case for
 * filp_close().
 * When a user-space process closes a file (even when using
 * filp_close() the finalisation happens before returning to
 * userspace, so it is effectively synchronous.  When a kernel thread
 * uses file_close(), on the other hand, the handling is completely
 * asynchronous.  This means that any cost imposed by that finalisation
 * is not imposed on the nfsd thread, and nfsd could potentually
 * close files more quickly than the work queue finalises the close,
 * which would lead to unbounded growth in the queue.
 *
 * In some contexts is it not safe to synchronously wait for
 * close finalisation (see comment for __fput_sync()), but nfsd
 * does not match those contexts.  In partcilarly it does not, at the
 * time that this function is called, hold and locks and no finalisation
 * of any file, socket, or device driver would have any cause to wait
 * for nfsd to make progress.
 */
void nfsd_filp_close(struct file *fp)
{ … }

/*
 * Get file system stats
 * N.B. After this call fhp needs an fh_put
 */
__be32
nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
{ … }

static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp)
{ … }

#ifdef CONFIG_NFSD_V4
/*
 * Helper function to translate error numbers. In the case of xattr operations,
 * some error codes need to be translated outside of the standard translations.
 *
 * ENODATA needs to be translated to nfserr_noxattr.
 * E2BIG to nfserr_xattr2big.
 *
 * Additionally, vfs_listxattr can return -ERANGE. This means that the
 * file has too many extended attributes to retrieve inside an
 * XATTR_LIST_MAX sized buffer. This is a bug in the xattr implementation:
 * filesystems will allow the adding of extended attributes until they hit
 * their own internal limit. This limit may be larger than XATTR_LIST_MAX.
 * So, at that point, the attributes are present and valid, but can't
 * be retrieved using listxattr, since the upper level xattr code enforces
 * the XATTR_LIST_MAX limit.
 *
 * This bug means that we need to deal with listxattr returning -ERANGE. The
 * best mapping is to return TOOSMALL.
 */
static __be32
nfsd_xattr_errno(int err)
{ … }

/*
 * Retrieve the specified user extended attribute. To avoid always
 * having to allocate the maximum size (since we are not getting
 * a maximum size from the RPC), do a probe + alloc. Hold a reader
 * lock on i_rwsem to prevent the extended attribute from changing
 * size while we're doing this.
 */
__be32
nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
	      void **bufp, int *lenp)
{ … }

/*
 * Retrieve the xattr names. Since we can't know how many are
 * user extended attributes, we must get all attributes here,
 * and have the XDR encode filter out the "user." ones.
 *
 * While this could always just allocate an XATTR_LIST_MAX
 * buffer, that's a waste, so do a probe + allocate. To
 * avoid any changes between the probe and allocate, wrap
 * this in inode_lock.
 */
__be32
nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char **bufp,
	       int *lenp)
{ … }

/**
 * nfsd_removexattr - Remove an extended attribute
 * @rqstp: RPC transaction being executed
 * @fhp: NFS filehandle of object with xattr to remove
 * @name: name of xattr to remove (NUL-terminate)
 *
 * Pass in a NULL pointer for delegated_inode, and let the client deal
 * with NFS4ERR_DELAY (same as with e.g. setattr and remove).
 *
 * Returns nfs_ok on success, or an nfsstat in network byte order.
 */
__be32
nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
{ … }

__be32
nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
	      void *buf, u32 len, u32 flags)
{ … }
#endif

/*
 * Check for a user's access permissions to this inode.
 */
__be32
nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
					struct dentry *dentry, int acc)
{ … }
linux/fs/nfsd/vfs.c