// linux/fs/ceph/file.c

// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/ceph/striper.h>

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/iversion.h>
#include <linux/ktime.h>
#include <linux/splice.h>

#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include "io.h"
#include "metric.h"

/*
 * Convert kernel open flags (@flags) to the little-endian on-the-wire
 * flag format sent to the MDS.
 * NOTE(review): mapping inferred from the name and __le32 return type —
 * body not visible here; confirm against the MDS wire protocol.
 */
static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags)
{}

/*
 * Ceph file operations
 *
 * Implement basic open/close functionality, and implement
 * read/write.
 *
 * We implement three modes of file I/O:
 *  - buffered uses the generic_file_aio_{read,write} helpers
 *
 *  - synchronous is used when there is multi-client read/write
 *    sharing, avoids the page cache, and synchronously waits for an
 *    ack from the OSD.
 *
 *  - direct io takes the variant of the sync path that references
 *    user pages directly.
 *
 * fsync() flushes and waits on dirty pages, but just queues metadata
 * for writeback: since the MDS can recover size and mtime there is no
 * need to wait for MDS acknowledgement.
 */

/*
 * How many pages to get in one call to iov_iter_get_pages().  This
 * determines the size of the on-stack array used as a buffer.
 *
 * NOTE(review): the value had been stripped in this snapshot, leaving the
 * macro defined to nothing, which breaks any use as an array bound.
 * Restored to 64, the conventional batch size for this pattern — confirm
 * against the original source.
 */
#define ITER_GET_BVECS_PAGES	64

/*
 * Fill the caller-provided @bvecs array with pages pinned from @iter,
 * gathering at most @maxsize bytes.  Helper for iter_get_bvecs_alloc().
 * NOTE(review): contract (returns bytes gathered or negative errno)
 * inferred from the ssize_t return and the caller's comment — confirm.
 */
static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
				struct bio_vec *bvecs)
{}

/*
 * iov_iter_get_pages() only considers one iov_iter segment, no matter
 * what maxsize or maxpages are given.  For ITER_BVEC that is a single
 * page.
 *
 * Attempt to get up to @maxsize bytes worth of pages from @iter.
 * Return the number of bytes in the created bio_vec array, or an error.
 *
 * On success, *@bvecs points to a freshly allocated array of *@num_bvecs
 * entries; the caller owns it and must release it with put_bvecs().
 * NOTE(review): ownership convention inferred from the put_bvecs()
 * pairing below — confirm.
 */
static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
				    struct bio_vec **bvecs, int *num_bvecs)
{}

/*
 * Release the pages referenced by a bio_vec array created by
 * iter_get_bvecs_alloc() and free the array itself.  @should_dirty
 * presumably marks the pages dirty first (i.e. after data was read
 * into them) — confirm against the implementation.
 */
static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
{}

/*
 * Prepare an open request.  Preallocate ceph_cap to avoid an
 * inopportune ENOMEM later.
 *
 * Returns the new MDS request, or an ERR_PTR() on failure.
 * NOTE(review): error convention assumed from kernel style — confirm.
 */
static struct ceph_mds_request *
prepare_open_request(struct super_block *sb, int flags, int create_mode)
{}

/*
 * Allocate and attach the per-open private data for @file (a directory
 * variant when @isdir).  Returns 0 on success or a negative errno.
 * NOTE(review): inferred from name/signature and the ceph_init_file()
 * comment below — confirm.
 */
static int ceph_init_file_info(struct inode *inode, struct file *file,
					int fmode, bool isdir)
{}

/*
 * initialize private struct file data.
 * if we fail, clean up by dropping fmode reference on the ceph_inode
 *
 * Returns 0 on success or a negative errno.
 */
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{}

/*
 * try renew caps after session gets killed.
 *
 * @fmode: the file mode whose capabilities should be re-requested.
 * Returns 0 on success or a negative errno.
 * NOTE(review): return convention assumed from kernel style — confirm.
 */
int ceph_renew_caps(struct inode *inode, int fmode)
{}

/*
 * If we already have the requisite capabilities, we can satisfy
 * the open request locally (no need to request new caps from the
 * MDS).  We do, however, need to inform the MDS (asynchronously)
 * if our wanted caps set expands.
 *
 * VFS ->open() entry point; returns 0 or a negative errno.
 */
int ceph_open(struct inode *inode, struct file *file)
{}

/*
 * Clone the layout from a synchronous create, if the dir now has Dc caps.
 * @dst: inode receiving the layout; @src: inode to copy it from.
 */
static void
cache_file_layout(struct inode *dst, struct inode *src)
{}

/*
 * Try to set up an async create. We need caps, a file layout, and inode number,
 * and either a lease on the dentry or complete dir info. If any of those
 * criteria are not satisfied, then return false and the caller can go
 * synchronous.
 *
 * NOTE(review): the comment says "return false" but the return type is
 * int — presumably 0 means "cannot do async create"; confirm the exact
 * convention against the callers.
 */
static int try_prep_async_create(struct inode *dir, struct dentry *dentry,
				 struct ceph_file_layout *lo, u64 *pino)
{}

/*
 * Return a delegated inode number (@ino) to the pool for @dir's session,
 * presumably after an async create using it failed — confirm.
 */
static void restore_deleg_ino(struct inode *dir, u64 ino)
{}

/*
 * Wake any tasks waiting on the completion of an async create for
 * @inode on @session.  NOTE(review): inferred from the name — confirm.
 */
static void wake_async_create_waiters(struct inode *inode,
				      struct ceph_mds_session *session)
{}

/*
 * MDS request completion callback for an async create: runs when the
 * MDS replies to the deferred create issued by ceph_atomic_open().
 * NOTE(review): role inferred from the name and signature — confirm.
 */
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
                                 struct ceph_mds_request *req)
{}

/*
 * Complete local setup for an async create: initialize @inode/@dentry/
 * @file state using the preallocated inode number and layout (@lo), and
 * queue the create request @req to the MDS.  Returns 0 or a negative
 * errno.  NOTE(review): inferred from the async-create helpers above —
 * confirm.
 */
static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
				    struct dentry *dentry,
				    struct file *file, umode_t mode,
				    struct ceph_mds_request *req,
				    struct ceph_acl_sec_ctx *as_ctx,
				    struct ceph_file_layout *lo)
{}

/*
 * Do a lookup + open with a single request.  If we get a non-existent
 * file or symlink, return 1 so the VFS can retry.
 *
 * VFS ->atomic_open() entry point; returns 0 on success, 1 for the VFS
 * retry case above, or a negative errno.
 */
int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
		     struct file *file, unsigned flags, umode_t mode)
{}

/*
 * VFS ->release() entry point: tear down the per-open private data and
 * drop the fmode reference taken at open time — presumably mirroring
 * ceph_init_file(); confirm.
 */
int ceph_release(struct inode *inode, struct file *file)
{}

/* Constants for the sync/direct I/O paths (enumerators elided in this view). */
enum {};

/*
 * Completely synchronous read and write methods.  Direct from __user
 * buffer to osd, or directly to user pages (if O_DIRECT).
 *
 * If the read spans object boundary, just do multiple reads.  (That's not
 * atomic, but good enough for now.)
 *
 * If we get a short result from the OSD, check against i_size; we need to
 * only return a short read to the caller if we hit EOF.
 *
 * @ki_pos is updated to reflect bytes consumed; @retry_op and
 * @last_objver are out-parameters for the caller — NOTE(review): exact
 * semantics not visible here, confirm against ceph_sync_read()/callers.
 */
ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
			 struct iov_iter *to, int *retry_op,
			 u64 *last_objver)
{}

/*
 * kiocb-based wrapper around __ceph_sync_read() for the synchronous
 * read path.  NOTE(review): wrapper relationship inferred from the
 * matching signatures — confirm.
 */
static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
			      int *retry_op)
{}

/* Tracking state for one in-flight O_DIRECT async I/O (fields elided in this view). */
struct ceph_aio_request {};

/* Deferred-work wrapper used to retry an aio sub-request (fields elided in this view). */
struct ceph_aio_work {};

/* Forward declaration: the retry worker is referenced before its definition below. */
static void ceph_aio_retry_work(struct work_struct *work);

/*
 * Finish an entire async direct I/O request (@aio_req) on @inode once
 * all its sub-requests are done.  NOTE(review): inferred from the name
 * and its pairing with ceph_aio_complete_req() — confirm.
 */
static void ceph_aio_complete(struct inode *inode,
			      struct ceph_aio_request *aio_req)
{}

/*
 * OSD completion callback for one sub-request of an async direct I/O;
 * presumably accounts the result and calls ceph_aio_complete() when the
 * last sub-request finishes — confirm.
 */
static void ceph_aio_complete_req(struct ceph_osd_request *req)
{}

/*
 * Workqueue function (see struct ceph_aio_work) that re-submits a failed
 * aio sub-request from process context.  NOTE(review): inferred from the
 * forward declaration and naming — confirm.
 */
static void ceph_aio_retry_work(struct work_struct *work)
{}

/*
 * O_DIRECT read/write path: transfer data between user pages and the
 * OSDs without touching the page cache (see the file-I/O modes comment
 * at the top of this file).  @snapc/@pcf apply to writes only —
 * NOTE(review): confirm against callers.
 */
static ssize_t
ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
		       struct ceph_snap_context *snapc,
		       struct ceph_cap_flush **pcf)
{}

/*
 * Synchronous write, straight from __user pointer or user pages.
 *
 * If write spans object boundary, just do multiple writes.  (For a
 * correct atomic write, we should e.g. take write locks on all
 * objects, rollback on failure, etc.)
 *
 * Returns bytes written or a negative errno.
 */
static ssize_t
ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
		struct ceph_snap_context *snapc)
{}

/*
 * Wrap generic_file_aio_read with checks for cap bits on the inode.
 * Atomically grab references, so that those bits are not released
 * back to the MDS mid-read.
 *
 * Hmm, the sync read case isn't actually async... should it be?
 *
 * VFS ->read_iter() entry point.
 */
static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
{}

/*
 * Wrap filemap_splice_read with checks for cap bits on the inode.
 * Atomically grab references, so that those bits are not released
 * back to the MDS mid-read.
 *
 * VFS ->splice_read() entry point.
 */
static ssize_t ceph_splice_read(struct file *in, loff_t *ppos,
				struct pipe_inode_info *pipe,
				size_t len, unsigned int flags)
{}

/*
 * Take cap references to avoid releasing caps to MDS mid-write.
 *
 * If we are synchronous, and write with an old snap context, the OSD
 * may return EOLDSNAPC.  In that case, retry the write.. _after_
 * dropping our cap refs and allowing the pending snap to logically
 * complete _before_ this write occurs.
 *
 * If we are near ENOSPC, write synchronously.
 *
 * VFS ->write_iter() entry point.
 */
static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
{}

/*
 * llseek.  be sure to verify file size on SEEK_END.
 *
 * VFS ->llseek() entry point; returns the new offset or a negative errno.
 */
static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{}

/*
 * Zero @size bytes of the page-cache page covering @offset in @inode.
 * Helper for ceph_zero_pagecache_range() below.
 * NOTE(review): inferred from the name and caller — confirm.
 */
static inline void ceph_zero_partial_page(
	struct inode *inode, loff_t offset, unsigned size)
{}

/*
 * Zero the page-cache contents of [@offset, @offset + @length) so cached
 * data matches a hole punched on the OSDs — presumably used by
 * ceph_fallocate(); confirm.
 */
static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
				      loff_t length)
{}

/*
 * Zero a range within a single RADOS object.  @length is in/out —
 * NOTE(review): presumably clamped to the portion handled within the
 * object, confirm against ceph_zero_objects().  Returns 0 or -errno.
 */
static int ceph_zero_partial_object(struct inode *inode,
				    loff_t offset, loff_t *length)
{}

/*
 * Zero [@offset, @offset + @length) on the OSDs, iterating object by
 * object via ceph_zero_partial_object().  Returns 0 or a negative errno.
 * NOTE(review): iteration structure inferred from the helper above — confirm.
 */
static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
{}

/*
 * VFS ->fallocate() entry point (@mode carries FALLOC_FL_* flags; see
 * <linux/falloc.h> included above).  Returns 0 or a negative errno.
 */
static long ceph_fallocate(struct file *file, int mode,
				loff_t offset, loff_t length)
{}

/*
 * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for
 * src_ci.  Two attempts are made to obtain both caps, and an error is
 * returned if this fails; zero is returned on success.
 */
static int get_rd_wr_caps(struct file *src_filp, int *src_got,
			  struct file *dst_filp,
			  loff_t dst_endoff, int *dst_got)
{}

/*
 * Drop the cap references obtained by get_rd_wr_caps() (@src_got /
 * @dst_got are the caps actually acquired there).
 */
static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
			   struct ceph_inode_info *dst_ci, int dst_got)
{}

/*
 * This function does several size-related checks, returning an error if:
 *  - source file is smaller than off+len
 *  - destination file size is not OK (inode_newsize_ok())
 *  - max bytes quota is exceeded
 */
static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
			   loff_t src_off, loff_t dst_off, size_t len)
{}

/*
 * Build an OSD request performing a server-side "copy-from" of one
 * object (@src_oid/@src_oloc at @src_snapid) to another (@dst_oid/
 * @dst_oloc).  Returns the request or an ERR_PTR() — NOTE(review):
 * error convention assumed from kernel style; confirm.
 */
static struct ceph_osd_request *
ceph_alloc_copyfrom_request(struct ceph_osd_client *osdc,
			    u64 src_snapid,
			    struct ceph_object_id *src_oid,
			    struct ceph_object_locator *src_oloc,
			    struct ceph_object_id *dst_oid,
			    struct ceph_object_locator *dst_oloc,
			    u32 truncate_seq, u64 truncate_size)
{}

/*
 * Copy up to @len bytes object-by-object from @src_ci to @dst_ci using
 * server-side copy requests; @src_off/@dst_off are advanced past the
 * bytes copied.  Returns bytes copied or a negative errno.
 * NOTE(review): in/out offset semantics inferred from the pointer
 * parameters — confirm against __ceph_copy_file_range().
 */
static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
				    struct ceph_inode_info *dst_ci, u64 *dst_off,
				    struct ceph_fs_client *fsc,
				    size_t len, unsigned int flags)
{}

/*
 * Core of the copy_file_range implementation: validates caps and sizes
 * (see get_rd_wr_caps()/is_file_size_ok() above) and delegates the bulk
 * copy to ceph_do_objects_copy().  Returns bytes copied or -errno.
 * NOTE(review): structure inferred from the surrounding helpers — confirm.
 */
static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
				      struct file *dst_file, loff_t dst_off,
				      size_t len, unsigned int flags)
{}

/*
 * VFS ->copy_file_range() entry point; thin wrapper around
 * __ceph_copy_file_range(), presumably falling back to the generic
 * splice-based copy when the remote copy is not possible — confirm.
 */
static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
				    struct file *dst_file, loff_t dst_off,
				    size_t len, unsigned int flags)
{}

/*
 * File operations table wiring the entry points above (ceph_open,
 * ceph_release, ceph_read_iter, ceph_write_iter, ceph_llseek,
 * ceph_splice_read, ceph_fallocate, ceph_copy_file_range, ...) into the
 * VFS.  Initializer elided in this snapshot.
 */
const struct file_operations ceph_file_fops =;