// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <[email protected]>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data. Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols. Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache. A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache. The client does not
 * correct unaligned requests from applications. All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files. Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/module.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <linux/uaccess.h>
#include <linux/atomic.h>

#include "internal.h"
#include "iostat.h"
#include "pnfs.h"
#include "fscache.h"
#include "nfstrace.h"

#define NFSDBG_FACILITY		…

static struct kmem_cache *nfs_direct_cachep;

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
static void nfs_direct_write_schedule_work(struct work_struct *work);

static inline void get_dreq(struct nfs_direct_req *dreq)
{
	…
}

static inline int put_dreq(struct nfs_direct_req *dreq)
{
	…
}

static void
nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
			    const struct nfs_pgio_header *hdr,
			    ssize_t dreq_len)
{
	…
}

static void
nfs_direct_count_bytes(struct nfs_direct_req *dreq,
		       const struct nfs_pgio_header *hdr)
{
	…
}

static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
					struct nfs_page *req)
{
	…
}
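/*
 * Illustrative sketch (not part of this file): how an application
 * typically requests the uncached I/O described in the header comment
 * above. The file is opened with O_DIRECT, and the buffer, offset, and
 * length are kept block-aligned, since this client does not correct
 * unaligned requests. The path and the 4096-byte alignment below are
 * made up for the example.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *
 *	int example_direct_read(void)
 *	{
 *		void *buf;
 *		ssize_t n;
 *		// Bypass the page cache; reads go straight to the server.
 *		int fd = open("/mnt/nfs/data.bin", O_RDONLY | O_DIRECT);
 *
 *		if (fd < 0)
 *			return -1;
 *		// O_DIRECT generally requires an aligned buffer and an
 *		// aligned length/offset.
 *		if (posix_memalign(&buf, 4096, 4096)) {
 *			close(fd);
 *			return -1;
 *		}
 *		n = pread(fd, buf, 4096, 0);
 *		free(buf);
 *		close(fd);
 *		return n < 0 ? -1 : 0;
 *	}
 */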
/**
 * nfs_swap_rw - NFS address space operation for swap I/O
 * @iocb: target I/O control block
 * @iter: I/O buffer
 *
 * Perform IO to the swap-file. This is much like direct IO.
 */
int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
	…
}

static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	…
}

void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
			      struct nfs_direct_req *dreq)
{
	…
}

static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	…
}

static void nfs_direct_req_free(struct kref *kref)
{
	…
}

static void nfs_direct_req_release(struct nfs_direct_req *dreq)
{
	…
}

ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset)
{
	…
}
EXPORT_SYMBOL_GPL(…);

/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
	…
}

/*
 * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	…
}

static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
{
	…
}

static void nfs_read_sync_pgio_error(struct list_head *head, int error)
{
	…
}

static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
{
	…
}

static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = …;

/*
 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
 * operation. If nfs_readdata_alloc() or get_user_pages() fails,
 * bail and stop sending more reads. Read length accounting is
 * handled automatically by nfs_direct_read_result(). Otherwise, if
 * no requests have been sent, just return an error.
 */
static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
					      struct iov_iter *iter,
					      loff_t pos)
{
	…
}
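/*
 * Illustrative sketch (not part of this file) of the chunk-and-dispatch
 * pattern the comment above describes: the request is split into
 * rsize-sized pieces, one READ is issued per piece, and scheduling stops
 * at the first failure. The helper dispatch_read() and the types below
 * are hypothetical; the real code drives a struct nfs_pageio_descriptor
 * and tracks outstanding I/O via the dreq reference count.
 *
 *	// Hypothetical: issue one READ covering [pos, pos + len).
 *	// Returns 0 on success or a negative errno.
 *	int dispatch_read(long long pos, unsigned long len);
 *
 *	long long schedule_chunked_reads(long long pos, unsigned long count,
 *					 unsigned long rsize)
 *	{
 *		long long requested = 0;
 *
 *		while (count > 0) {
 *			unsigned long len = count < rsize ? count : rsize;
 *			int err = dispatch_read(pos, len);
 *
 *			// Bail on the first failure; if nothing was sent,
 *			// return the error, otherwise report what was sent.
 *			if (err)
 *				return requested ? requested : err;
 *			requested += len;
 *			pos += len;
 *			count -= len;
 *		}
 *		return requested;
 *	}
 */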
/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers into which to read data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file. For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change. Our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
			     bool swap)
{
	…
}

static void nfs_direct_add_page_head(struct list_head *list,
				     struct nfs_page *req)
{
	…
}

static void nfs_direct_join_group(struct list_head *list,
				  struct nfs_commit_info *cinfo,
				  struct inode *inode)
{
	…
}

static void nfs_direct_write_scan_commit_list(struct inode *inode,
					      struct list_head *list,
					      struct nfs_commit_info *cinfo)
{
	…
}

static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	…
}

static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
	…
}

static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
				     struct nfs_page *req)
{
	…
}

static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = …;

static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	…
}

static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
{
	…
}

static void nfs_direct_write_schedule_work(struct work_struct *work)
{
	…
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
{
	…
}

static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
	…
}

static void nfs_write_sync_pgio_error(struct list_head *head, int error)
{
	…
}

static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
{
	…
}

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = …;

/*
 * NB: Return the value of the first error return code. Subsequent
 * errors after the first one are ignored.
 */
/*
 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
 * operation. If nfs_writedata_alloc() or get_user_pages() fails,
 * bail and stop sending more writes. Write length accounting is
 * handled automatically by nfs_direct_write_result(). Otherwise, if
 * no requests have been sent, just return an error.
 */
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
					       struct iov_iter *iter,
					       loff_t pos, int ioflags)
{
	…
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers from which to write data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size. The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache. We let the server do generic write
 * parameter checking and report problems.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
			      bool swap)
{
	…
}

/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
int __init nfs_init_directcache(void)
{
	…
}

/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
void nfs_destroy_directcache(void)
{
	…
}
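/*
 * Illustrative sketch (not part of this file) of the slab-cache
 * lifecycle that nfs_init_directcache()/nfs_destroy_directcache()
 * manage for nfs_direct_req structures. The struct name, cache name,
 * and flags below are placeholders; the real cache is the static
 * nfs_direct_cachep declared near the top of this file, and its exact
 * creation flags may differ.
 *
 *	#include <linux/errno.h>
 *	#include <linux/slab.h>
 *
 *	struct example_req {
 *		int state;
 *	};
 *
 *	static struct kmem_cache *example_cachep;
 *
 *	static int __init example_cache_init(void)
 *	{
 *		// One slab cache sized for the request structure.
 *		example_cachep = kmem_cache_create("example_cache",
 *						   sizeof(struct example_req),
 *						   0, 0, NULL);
 *		return example_cachep ? 0 : -ENOMEM;
 *	}
 *
 *	static void example_cache_exit(void)
 *	{
 *		// All objects must be freed back to the cache before
 *		// it is destroyed.
 *		kmem_cache_destroy(example_cachep);
 *	}
 *
 *	// Per-request allocation and release:
 *	//	req = kmem_cache_zalloc(example_cachep, GFP_KERNEL);
 *	//	...
 *	//	kmem_cache_free(example_cachep, req);
 */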