// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <[email protected]>
 * Author: Ross Zwisler <[email protected]>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <linux/rmap.h>
#include <asm/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS …
#define DAX_WAIT_TABLE_ENTRIES …

/* The 'colour' (ie low bits) within a PMD of a page offset. */
#define PG_PMD_COLOUR …
#define PG_PMD_NR …

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{ … }
fs_initcall(init_dax_wait_table);

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages. We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking. In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT …
#define DAX_LOCKED …
#define DAX_PMD …
#define DAX_ZERO_PAGE …
#define DAX_EMPTY …

static unsigned long dax_to_pfn(void *entry)
{ … }

static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{ … }

static bool dax_is_locked(void *entry)
{ … }

static unsigned int dax_entry_order(void *entry)
{ … }

static unsigned long dax_is_pmd_entry(void *entry)
{ … }

static bool dax_is_pte_entry(void *entry)
{ … }

static int dax_is_zero_entry(void *entry)
{ … }

static int dax_is_empty_entry(void *entry)
{ … }

/*
 * true if the entry that was found is of a smaller order than the entry
 * we were looking for
 */
static bool dax_is_conflict(void *entry)
{ … }

/*
 * DAX page cache entry locking
 */
struct exceptional_entry_key { … };

struct wait_exceptional_entry_queue { … };

/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode { … };

static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
		void *entry, struct exceptional_entry_key *key)
{ … }

static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
{ … }

/*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
			   enum dax_wake_mode mode)
{ … }
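/*
 * Illustrative sketch, not part of the original file: how the value-entry
 * encoding described above is meant to be consumed.  A PMD-sized zero-page
 * entry carries the DAX_PMD and DAX_ZERO_PAGE bits alongside the pfn, and
 * the helpers above recover that information from the opaque XArray value.
 * The function name is hypothetical and the checks only document the
 * expected invariants.
 */
static __maybe_unused void dax_entry_encoding_example(pfn_t pfn)
{
	void *entry = dax_make_entry(pfn, DAX_PMD | DAX_ZERO_PAGE);

	WARN_ON(!dax_is_pmd_entry(entry));	/* PMD bit set => PMD-sized entry */
	WARN_ON(!dax_is_zero_entry(entry));	/* backed by the zero page */
	WARN_ON(dax_is_locked(entry));		/* DAX_LOCKED is only set under xa_lock */
	WARN_ON(dax_to_pfn(entry) != pfn_t_to_pfn(pfn));
}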
/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it. The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did. The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.
 */
static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
{ … }

/*
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages)
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{ … }

static void put_unlocked_entry(struct xa_state *xas, void *entry,
			       enum dax_wake_mode mode)
{ … }

/*
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{ … }

/*
 * Return: The entry stored at this location before it was locked.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{ … }

static unsigned long dax_entry_size(void *entry)
{ … }

static unsigned long dax_end_pfn(void *entry)
{ … }

/*
 * Iterate through all mapped pfns represented by an entry, i.e. skip
 * 'empty' and 'zero' entries.
 */
#define for_each_mapped_pfn(entry, pfn) …

static inline bool dax_page_is_shared(struct page *page)
{ … }

/*
 * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
 * refcount.
 */
static inline void dax_page_share_get(struct page *page)
{ … }

static inline unsigned long dax_page_share_put(struct page *page)
{ … }

/*
 * When called from dax_insert_entry(), the shared flag indicates whether
 * this entry is shared by multiple files. If so, set the page->mapping
 * PAGE_MAPPING_DAX_SHARED flag and use page->share as the refcount.
 */
static void dax_associate_entry(void *entry, struct address_space *mapping,
		struct vm_area_struct *vma, unsigned long address, bool shared)
{ … }

static void dax_disassociate_entry(void *entry, struct address_space *mapping,
		bool trunc)
{ … }

static struct page *dax_busy_page(void *entry)
{ … }

/**
 * dax_lock_folio - Lock the DAX entry corresponding to a folio
 * @folio: The folio whose entry we want to lock
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
 * not be locked.
 */
dax_entry_t dax_lock_folio(struct folio *folio)
{ … }

void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{ … }

/*
 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
 * @mapping: the file's mapping whose entry we want to lock
 * @index: the offset within this file
 * @page: output the dax page corresponding to this dax entry
 *
 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
 * could not be locked.
 */
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
		struct page **page)
{ … }

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
		dax_entry_t cookie)
{ … }
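/*
 * Illustrative sketch, not part of the original file: the lock-cookie
 * pattern documented for dax_lock_folio() above.  A caller that needs the
 * folio's DAX entry to stay stable takes the cookie, does its work, and
 * passes the same cookie back to dax_unlock_folio().  The function name and
 * the elided "work" are hypothetical.
 */
static __maybe_unused bool dax_lock_folio_example(struct folio *folio)
{
	dax_entry_t cookie = dax_lock_folio(folio);

	if (!cookie)
		return false;	/* the entry could not be locked */

	/* ... inspect folio/mapping state while the entry is held locked ... */

	dax_unlock_folio(folio, cookie);
	return true;
}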
/*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where we
 * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
 * insertion will fail if it finds any PTE entries already in the tree, and a
 * PTE insertion will cause an existing PMD entry to be unmapped and
 * downgraded to PTE entries. This happens for both PMD zero pages as
 * well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them. We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 *
 * On error, this function does not return an ERR_PTR. Instead it returns
 * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
 * overlap with xarray value entries.
 */
static void *grab_mapping_entry(struct xa_state *xas,
		struct address_space *mapping, unsigned int order)
{ … }

/**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 *       pages from 'start' till the end of file are included.
 *
 * DAX requires ZONE_DEVICE mapped pages. These pages are never
 * 'onlined' to the page allocator so they are considered idle when
 * page->count == 1. A filesystem uses this interface to determine if
 * any page in the mapping is busy, i.e. for DMA, or other
 * get_user_pages() usages.
 *
 * It is expected that the filesystem is holding locks to block the
 * establishment of new mappings in this address_space. I.e. it expects
 * to be able to run unmap_mapping_range() and subsequently not race
 * mapping_mapped() becoming true.
 */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
					loff_t start, loff_t end)
{ … }
EXPORT_SYMBOL_GPL(…);

struct page *dax_layout_busy_page(struct address_space *mapping)
{ … }
EXPORT_SYMBOL_GPL(…);

static int __dax_invalidate_entry(struct address_space *mapping,
				  pgoff_t index, bool trunc)
{ … }

static int __dax_clear_dirty_range(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{ … }

/*
 * Delete DAX entry at @index from @mapping.  Wait for it
 * to be unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{ … }

/*
 * Invalidate DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{ … }

static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{ … }

static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
{ … }

/*
 * MAP_SYNC on a dax mapping guarantees dirty metadata is
 * flushed on write-faults (non-cow), but not read-faults.
 */
static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
		struct vm_area_struct *vma)
{ … }
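/*
 * Illustrative sketch, not part of the original file: the calling convention
 * described for dax_layout_busy_page() above.  Before freeing blocks, a
 * filesystem (holding locks that block new faults) unmaps the range and then
 * checks whether any page is still pinned, e.g. by get_user_pages(), backing
 * off instead of freeing storage under an active reference.  The function
 * name is hypothetical and real callers typically wait rather than fail.
 */
static __maybe_unused int dax_break_layouts_example(struct address_space *mapping)
{
	struct page *page;

	/* New mappings are assumed to be blocked by filesystem locks here. */
	unmap_mapping_range(mapping, 0, 0, 1);

	page = dax_layout_busy_page(mapping);
	if (page)
		return -EBUSY;

	return 0;
}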
/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs. If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void *entry, pfn_t pfn,
		unsigned long flags)
{ … }

static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
		struct address_space *mapping, void *entry)
{ … }

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct dax_device *dax_dev, struct writeback_control *wbc)
{ … }
EXPORT_SYMBOL_GPL(…);

static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
		size_t size, void **kaddr, pfn_t *pfnp)
{ … }

/**
 * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
 * by copying the data before and after the range to be written.
 * @pos: address to do copy from.
 * @length: size of copy operation.
 * @align_size: alignment size of the copy (either PMD_SIZE or PAGE_SIZE)
 * @srcmap: iomap srcmap
 * @daddr: destination address to copy to.
 *
 * This can be called from two places. Either during DAX write fault (page
 * aligned), to copy @length bytes of data to @daddr. Or, while doing a normal
 * DAX write operation, dax_iomap_iter() might call this to do the copy of
 * either the start or the end unaligned address. In the latter case the rest
 * of the copy of aligned ranges is taken care of by dax_iomap_iter() itself.
 * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
 * area to make sure no old data remains.
 */
static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
		const struct iomap *srcmap, void *daddr)
{ … }

/*
 * The user has performed a load from a hole in the file. Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files. Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 */
static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{ … }

#ifdef CONFIG_FS_DAX_PMD
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{ … }
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

static s64 dax_unshare_iter(struct iomap_iter *iter)
{ … }

int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{ … }
EXPORT_SYMBOL_GPL(…);

static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{ … }

static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{ … }

int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
{ … }
EXPORT_SYMBOL_GPL(…);

int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{ … }
EXPORT_SYMBOL_GPL(…);

static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
		struct iov_iter *iter)
{ … }
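/*
 * Illustrative sketch, not part of the original file: how a filesystem's
 * ->writepages implementation is expected to feed a DAX mapping into
 * dax_writeback_mapping_range(), which flushes dirty entries to the
 * persistent domain as described above.  The function name is hypothetical
 * and the dax_device is simply passed in; real callers look it up from
 * their superblock or block device.
 */
static __maybe_unused int dax_writepages_example(struct address_space *mapping,
		struct writeback_control *wbc, struct dax_device *dax_dev)
{
	if (!IS_DAX(mapping->host))
		return -EINVAL;

	return dax_writeback_mapping_range(mapping, dax_dev, wbc);
}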
/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @iter: The addresses to do I/O from or to
 * @ops: iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory. The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{ … }
EXPORT_SYMBOL_GPL(…);

static vm_fault_t dax_fault_return(int error)
{ … }

/*
 * When handling a synchronous page fault and the inode needs fsync, we can
 * insert the PTE/PMD into page tables only after that fsync happened. Skip
 * insertion for now and return the pfn so that the caller can insert it
 * after the fsync is done.
 */
static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
{ … }

static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
		const struct iomap_iter *iter)
{ … }

/**
 * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
 * @vmf: vm fault instance
 * @iter: iomap iter
 * @pfnp: pfn to be returned
 * @xas: the dax mapping tree of a file
 * @entry: an unlocked dax entry to be inserted
 * @pmd: distinguish whether it is a pmd fault
 */
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
		const struct iomap_iter *iter, pfn_t *pfnp,
		struct xa_state *xas, void **entry, bool pmd)
{ … }

static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
		int *iomap_errp, const struct iomap_ops *ops)
{ … }

#ifdef CONFIG_FS_DAX_PMD
static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
		pgoff_t max_pgoff)
{ … }

static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
		const struct iomap_ops *ops)
{ … }
#else
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
		const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @order: Order of the page to fault in
 * @pfnp: PFN to insert for synchronous faults if fsync is required
 * @iomap_errp: Storage for detailed error code in case of error
 * @ops: Iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for the page fault to proceed
 * successfully.
 */
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
		pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{ … }
EXPORT_SYMBOL_GPL(…);

/*
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pfn: PFN to insert
 * @order: Order of entry to insert.
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmapped DAX file. It also marks the page cache entry as dirty.
 */
static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn,
		unsigned int order)
{ … }
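/*
 * Illustrative sketch, not part of the original file: the synchronous-fault
 * handshake described above, as seen from a filesystem fault handler.  When
 * dax_iomap_fault() reports VM_FAULT_NEEDDSYNC it has skipped the page table
 * insertion and returned the pfn; the caller finishes the fault with
 * dax_finish_sync_fault(), which persists the range and installs the entry.
 * The function name and the iomap_ops argument are hypothetical, and the
 * filesystem locking real callers take around this is omitted.
 */
static __maybe_unused vm_fault_t dax_sync_fault_example(struct vm_fault *vmf,
		const struct iomap_ops *ops)
{
	vm_fault_t ret;
	pfn_t pfn;

	ret = dax_iomap_fault(vmf, 0, &pfn, NULL, ops);	/* order 0: PTE fault */
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, 0, pfn);

	return ret;
}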
/**
 * dax_finish_sync_fault - finish synchronous page fault
 * @vmf: The description of the fault
 * @order: Order of entry to be inserted
 * @pfn: PFN to insert
 *
 * This function ensures that the file range touched by the page fault is
 * stored persistently on the media and handles inserting the appropriate
 * page table entry.
 */
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
		pfn_t pfn)
{ … }
EXPORT_SYMBOL_GPL(…);

static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
		struct iomap_iter *it_dest, u64 len, bool *same)
{ … }

int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
		struct inode *dst, loff_t dstoff, loff_t len, bool *same,
		const struct iomap_ops *ops)
{ … }

int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
			      struct file *file_out, loff_t pos_out,
			      loff_t *len, unsigned int remap_flags,
			      const struct iomap_ops *ops)
{ … }
EXPORT_SYMBOL_GPL(…);
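/*
 * Illustrative sketch, not part of the original file: how a filesystem's
 * reflink/dedupe preparation step might dispatch to
 * dax_remap_file_range_prep() for DAX inodes and fall back to the generic
 * page-cache variant otherwise.  The function name is hypothetical and the
 * locking and validation real filesystems perform around this is omitted.
 */
static __maybe_unused int dax_remap_prep_example(struct file *file_in,
		loff_t pos_in, struct file *file_out, loff_t pos_out,
		loff_t *len, unsigned int remap_flags,
		const struct iomap_ops *ops)
{
	if (IS_DAX(file_inode(file_in)) && IS_DAX(file_inode(file_out)))
		return dax_remap_file_range_prep(file_in, pos_in, file_out,
						 pos_out, len, remap_flags,
						 ops);

	return generic_remap_file_range_prep(file_in, pos_in, file_out,
					     pos_out, len, remap_flags);
}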