linux/fs/dax.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <[email protected]>
 * Author: Ross Zwisler <[email protected]>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <linux/rmap.h>
#include <asm/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (i.e. low bits) within a PMD of a page offset.  */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{}
fs_initcall(init_dax_wait_table);
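
/*
 * Illustrative reconstruction of the elided initializer above: every slot in
 * wait_table is a plain waitqueue head.  Hedged sketch, not verbatim upstream
 * code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
static int __init init_dax_wait_table(void)
{
	int i;

	/* One waitqueue per hash bucket; DAX entries hash into these buckets. */
	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
#endif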

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages.  We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking.  In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

static unsigned long dax_to_pfn(void *entry)
{}

static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{}
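
/*
 * Illustrative sketch of the two helpers above, assuming the bit layout
 * described in the comment: the pfn is stored above the four special bits of
 * an XArray value entry.  Hedged reconstruction, not verbatim upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}
#endif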

static bool dax_is_locked(void *entry)
{}

static unsigned int dax_entry_order(void *entry)
{}

static unsigned long dax_is_pmd_entry(void *entry)
{}

static bool dax_is_pte_entry(void *entry)
{}

static int dax_is_zero_entry(void *entry)
{}

static int dax_is_empty_entry(void *entry)
{}

/*
 * true if the entry that was found is of a smaller order than the entry
 * we were looking for
 */
static bool dax_is_conflict(void *entry)
{}

/*
 * DAX page cache entry locking
 */
struct exceptional_entry_key {};

struct wait_exceptional_entry_queue {};

/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode { WAKE_ALL, WAKE_NEXT };

static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
		void *entry, struct exceptional_entry_key *key)
{}

static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
{}

/*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
			   enum dax_wake_mode mode)
{}
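
/*
 * Hedged sketch of the elided wakeup path above: look up the waitqueue for
 * this entry and wake either the next waiter or all of them, depending on
 * @mode.  Not verbatim upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
static void dax_wake_entry(struct xa_state *xas, void *entry,
			   enum dax_wake_mode mode)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(xas, entry, &key);

	/* Waiters key on (xarray, index), so pass &key through to them. */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
}
#endif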

/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it.  The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did.  The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.
 */
static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
{}

/*
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages).
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{}

static void put_unlocked_entry(struct xa_state *xas, void *entry,
			       enum dax_wake_mode mode)
{}

/*
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{}

/*
 * Return: The entry stored at this location before it was locked.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{}
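
/*
 * Hedged sketch of the lock/unlock pair above: locking ORs DAX_LOCKED into
 * the stored value entry and returns the previous entry; unlocking stores the
 * unlocked value back under a fresh XArray walk and wakes the next waiter.
 * Not verbatim upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
	unsigned long v = xa_to_value(entry);

	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}

static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
	void *old;

	/* @entry is the unlocked value to store back. */
	xas_reset(xas);
	xas_lock_irq(xas);
	old = xas_store(xas, entry);
	xas_unlock_irq(xas);
	WARN_ON_ONCE(!dax_is_locked(old));	/* we must have held the lock */
	dax_wake_entry(xas, entry, WAKE_NEXT);
}
#endif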

static unsigned long dax_entry_size(void *entry)
{}

static unsigned long dax_end_pfn(void *entry)
{}
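
/*
 * Hedged sketch of the two size helpers above, which back the
 * for_each_mapped_pfn() iterator defined just below: zero and empty entries
 * map no pfns, PMD entries span PMD_SIZE, everything else is a single page.
 * Not verbatim upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
static unsigned long dax_entry_size(void *entry)
{
	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
		return 0;
	if (dax_is_pmd_entry(entry))
		return PMD_SIZE;
	return PAGE_SIZE;
}

static unsigned long dax_end_pfn(void *entry)
{
	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}
#endif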

/*
 * Iterate through all mapped pfns represented by an entry, i.e. skip
 * 'empty' and 'zero' entries.
 */
#define for_each_mapped_pfn(entry, pfn) \
	for (pfn = dax_to_pfn(entry); \
			pfn < dax_end_pfn(entry); pfn++)

static inline bool dax_page_is_shared(struct page *page)
{}

/*
 * Set page->mapping to the PAGE_MAPPING_DAX_SHARED flag and increase the
 * refcount.
 */
static inline void dax_page_share_get(struct page *page)
{}

static inline unsigned long dax_page_share_put(struct page *page)
{}

/*
 * When called from dax_insert_entry(), the shared flag indicates whether this
 * entry is shared by multiple files.  If so, set page->mapping to
 * PAGE_MAPPING_DAX_SHARED and use page->share as the refcount.
 */
static void dax_associate_entry(void *entry, struct address_space *mapping,
		struct vm_area_struct *vma, unsigned long address, bool shared)
{}

static void dax_disassociate_entry(void *entry, struct address_space *mapping,
		bool trunc)
{}

static struct page *dax_busy_page(void *entry)
{}

/**
 * dax_lock_folio - Lock the DAX entry corresponding to a folio
 * @folio: The folio whose entry we want to lock
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
 * not be locked.
 */
dax_entry_t dax_lock_folio(struct folio *folio)
{}

void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{}

/*
 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
 * @mapping: the file's mapping whose entry we want to lock
 * @index: the offset within this file
 * @page: output the dax page corresponding to this dax entry
 *
 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
 * could not be locked.
 */
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
		struct page **page)
{}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
		dax_entry_t cookie)
{}

/*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where we
 * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
 * insertion will fail if it finds any PTE entries already in the tree, and a
 * PTE insertion will cause an existing PMD entry to be unmapped and
 * downgraded to PTE entries.  This happens for both PMD zero pages as
 * well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them.  We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 *
 * On error, this function does not return an ERR_PTR.  Instead it returns
 * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
 * overlap with xarray value entries.
 */
static void *grab_mapping_entry(struct xa_state *xas,
		struct address_space *mapping, unsigned int order)
{}

/**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 *       pages from 'start' till the end of file are included.
 *
 * DAX requires ZONE_DEVICE mapped pages. These pages are never
 * 'onlined' to the page allocator so they are considered idle when
 * page->count == 1. A filesystem uses this interface to determine if
 * any page in the mapping is busy, i.e. for DMA, or other
 * get_user_pages() usages.
 *
 * It is expected that the filesystem is holding locks to block the
 * establishment of new mappings in this address_space. I.e. it expects
 * to be able to run unmap_mapping_range() and subsequently not race with
 * mapping_mapped() becoming true.
 */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
					loff_t start, loff_t end)
{}
EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);

struct page *dax_layout_busy_page(struct address_space *mapping)
{}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
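
/*
 * The unranged variant above is expected to be a thin wrapper over the ranged
 * helper, scanning the whole file per the LLONG_MAX convention documented in
 * its kernel-doc.  Hedged sketch, not verbatim upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
struct page *dax_layout_busy_page(struct address_space *mapping)
{
	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
}
#endif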

static int __dax_invalidate_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{}

static int __dax_clear_dirty_range(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{}

/*
 * Delete DAX entry at @index from @mapping.  Wait for it
 * to be unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{}

/*
 * Invalidate DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{}
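
/*
 * Both entry points above are expected to funnel into
 * __dax_invalidate_entry(): truncation must always find and remove the entry,
 * while the sync invalidation only drops clean entries.  Hedged sketch, not
 * verbatim upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_entry(mapping, index, true);

	/* The caller saw a DAX entry at @index, so it had better be there. */
	WARN_ON_ONCE(!ret);
	return ret;
}

int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_entry(mapping, index, false);
}
#endif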

static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{}

static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
{}

/*
 * MAP_SYNC on a dax mapping guarantees dirty metadata is
 * flushed on write-faults (non-cow), but not read-faults.
 */
static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
		struct vm_area_struct *vma)
{}
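
/*
 * Hedged sketch of the check above: a fault is treated as synchronous only
 * for a write fault on a VM_SYNC vma whose iomap carries dirty metadata
 * (IOMAP_F_DIRTY).  Not verbatim upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
		struct vm_area_struct *vma)
{
	return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
		(iter->iomap.flags & IOMAP_F_DIRTY);
}
#endif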

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void *entry, pfn_t pfn,
		unsigned long flags)
{}

static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
		struct address_space *mapping, void *entry)
{}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct dax_device *dax_dev, struct writeback_control *wbc)
{}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
		size_t size, void **kaddr, pfn_t *pfnp)
{}

/**
 * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
 * by copying the data before and after the range to be written.
 * @pos:	address to do copy from.
 * @length:	size of copy operation.
 * @align_size:	copy alignment granularity (either PMD_SIZE or PAGE_SIZE)
 * @srcmap:	iomap srcmap
 * @daddr:	destination address to copy to.
 *
 * This is called from two places.  Either during a DAX write fault (page
 * aligned), to copy @length bytes of data to @daddr.  Or from a normal DAX
 * write, where dax_iomap_iter() might call this to copy the unaligned head
 * and/or tail of the range; in that case the copy of the aligned middle is
 * handled by dax_iomap_iter() itself.
 * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
 * area to make sure no old data remains.
 */
static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
		const struct iomap *srcmap, void *daddr)
{}

/*
 * The user has performed a load from a hole in the file.  Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 */
static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{}
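
/*
 * Hedged sketch of the read-from-hole path above: record a DAX_ZERO_PAGE
 * entry in the mapping and map the shared zero page read-only at the faulting
 * address.  Not verbatim upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	unsigned long vaddr = vmf->address;
	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
	vm_fault_t ret;

	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);

	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
	trace_dax_load_hole(iter->inode, vmf, ret);
	return ret;
}
#endif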

#ifdef CONFIG_FS_DAX_PMD
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

static s64 dax_unshare_iter(struct iomap_iter *iter)
{}

int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(dax_file_unshare);

static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{}

static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{}

int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(dax_zero_range);

int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(dax_truncate_page);

static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
		struct iov_iter *iter)
{}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
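
/*
 * Hedged sketch of the elided body above, assuming the iomap_iter() loop
 * interface (struct iomap_iter with .processed): wrap the request in an
 * iterator and let dax_iomap_iter() handle each extent the filesystem maps.
 * Simplified (locking assertions omitted), not verbatim upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct iomap_iter iomi = {
		.inode	= iocb->ki_filp->f_mapping->host,
		.pos	= iocb->ki_pos,
		.len	= iov_iter_count(iter),
		.flags	= IOMAP_DAX,
	};
	loff_t done = 0;
	int ret;

	if (!iomi.len)
		return 0;

	if (iov_iter_rw(iter) == WRITE)
		iomi.flags |= IOMAP_WRITE;

	/* Ask the filesystem for mappings and do the copies extent by extent. */
	while ((ret = iomap_iter(&iomi, ops)) > 0)
		iomi.processed = dax_iomap_iter(&iomi, iter);

	done = iomi.pos - iocb->ki_pos;
	iocb->ki_pos = iomi.pos;
	return done ? done : ret;
}
#endif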

static vm_fault_t dax_fault_return(int error)
{}

/*
 * When handling a synchronous page fault and the inode needs an fsync, we can
 * insert the PTE/PMD into the page tables only after that fsync has happened.
 * Skip the insertion for now and return the pfn so that the caller can insert
 * it once the fsync is done.
 */
static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
{}
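
/*
 * Hedged sketch of the helper above: hand the pfn back to the caller and tell
 * it to finish the fault after fsync via VM_FAULT_NEEDDSYNC.  Not verbatim
 * upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
{
	if (WARN_ON_ONCE(!pfnp))
		return VM_FAULT_SIGBUS;
	*pfnp = pfn;
	return VM_FAULT_NEEDDSYNC;
}
#endif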

static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
		const struct iomap_iter *iter)
{}

/**
 * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
 * @vmf:	vm fault instance
 * @iter:	iomap iter
 * @pfnp:	pfn to be returned
 * @xas:	the dax mapping tree of a file
 * @entry:	an unlocked dax entry to be inserted
 * @pmd:	distinguish whether it is a pmd fault
 */
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
		const struct iomap_iter *iter, pfn_t *pfnp,
		struct xa_state *xas, void **entry, bool pmd)
{}

static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       int *iomap_errp, const struct iomap_ops *ops)
{}

#ifdef CONFIG_FS_DAX_PMD
static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
		pgoff_t max_pgoff)
{}

static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{}
#else
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @order: Order of the page to fault in
 * @pfnp: PFN to insert for synchronous faults if fsync is required
 * @iomap_errp: Storage for detailed error code in case of error
 * @ops: Iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for page fault to proceed
 * successfully.
 */
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
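
/*
 * Hedged sketch of the dispatch above: PTE-sized faults go to the PTE
 * handler, PMD-sized faults to the PMD handler, anything else falls back.
 * Not verbatim upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
	if (order == 0)
		return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
	else if (order == PMD_ORDER)
		return dax_iomap_pmd_fault(vmf, pfnp, ops);
	else
		return VM_FAULT_FALLBACK;
}
#endif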

/*
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pfn: PFN to insert
 * @order: Order of entry to insert.
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmapped DAX file.  It also marks the page cache entry as dirty.
 */
static vm_fault_t
dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{}

/**
 * dax_finish_sync_fault - finish synchronous page fault
 * @vmf: The description of the fault
 * @order: Order of entry to be inserted
 * @pfn: PFN to insert
 *
 * This function ensures that the file range touched by the page fault is
 * stored persistently on the media and handles insertion of the appropriate
 * page table entry.
 */
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
		pfn_t pfn)
{}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
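
/*
 * Hedged sketch of the elided body above: persist the byte range covered by
 * the fault with vfs_fsync_range() and only then install the writeable
 * PTE/PMD via dax_insert_pfn_mkwrite().  Not verbatim upstream code.
 */
#if 0	/* illustrative sketch, not part of the skeleton */
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
		pfn_t pfn)
{
	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
	size_t len = PAGE_SIZE << order;
	int err;

	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
	if (err)
		return VM_FAULT_SIGBUS;
	return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
#endif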

static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
		struct iomap_iter *it_dest, u64 len, bool *same)
{}

int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
		struct inode *dst, loff_t dstoff, loff_t len, bool *same,
		const struct iomap_ops *ops)
{}

int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
			      struct file *file_out, loff_t pos_out,
			      loff_t *len, unsigned int remap_flags,
			      const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);