// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{ … }

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{ … }

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{ … }

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{ … }

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{ … }

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{ … }

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{ … }

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{ … }

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{ … }

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{ … }

static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{ … }

static inline void process_mm_walk_lock(struct mm_struct *mm,
					enum page_walk_lock walk_lock)
{ … }

static inline void process_vma_walk_lock(struct vm_area_struct *vma,
					 enum page_walk_lock walk_lock)
{ … }
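
/*
 * Illustrative sketch only, not part of this file: a caller-provided
 * ->test_walk() callback following the convention walk_page_test() expects:
 * return 0 to walk the current vma, 1 to skip it, or a negative error to
 * abort the walk. The function name and the VM_PFNMAP/VM_IO policy are
 * hypothetical and shown purely as an example.
 */
static int __maybe_unused example_test_walk(unsigned long start,
					    unsigned long end,
					    struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	/* Example policy: skip special mappings we don't want to visit. */
	if (vma->vm_flags & (VM_PFNMAP | VM_IO))
		return 1;

	return 0;
}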

/**
 * walk_page_range - walk page table with caller-specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : succeeded in handling the current entry; if you haven't reached the
 *         end address yet, continue to walk.
 *  - >0 : succeeded in handling the current entry, and return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with an error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass some
 * caller-specific data to the callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		    unsigned long end, const struct mm_walk_ops *ops,
		    void *private)
{ … }

/**
 * walk_page_range_novma - walk a range of page tables not backed by a vma
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @pgd: pgd to walk if different from mm->pgd
 * @private: private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked, this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 *
 * Note: Be careful when walking the kernel page tables; the caller may need to
 * take other effective measures (the mmap lock may be insufficient) to prevent
 * the intermediate kernel page tables belonging to the specified address range
 * from being freed (e.g. by memory hot-remove).
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd, void *private)
{ … }

int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
			unsigned long end, const struct mm_walk_ops *ops,
			void *private)
{ … }

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		  void *private)
{ … }
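
/*
 * Illustrative sketch only, not part of this file: typical caller-side usage
 * of walk_page_range() with a pte_entry() callback, here counting present
 * PTEs in a range. The names (example_count_pte, example_count_ops,
 * example_count_present) are hypothetical; the ops structure, callback
 * signature and locking follow the documentation above, but this is an
 * example, not a definitive recipe.
 */
static int __maybe_unused example_count_pte(pte_t *pte, unsigned long addr,
					    unsigned long next,
					    struct mm_walk *walk)
{
	unsigned long *nr_present = walk->private;

	/* Count only entries that are actually present. */
	if (pte_present(ptep_get(pte)))
		(*nr_present)++;

	return 0;
}

static const struct mm_walk_ops example_count_ops __maybe_unused = {
	.pte_entry	= example_count_pte,
	.walk_lock	= PGWALK_RDLOCK,
};

static long __maybe_unused example_count_present(struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end)
{
	unsigned long nr_present = 0;
	int ret;

	/* walk_page_range() requires the mmap lock, as documented above. */
	mmap_read_lock(mm);
	ret = walk_page_range(mm, start, end, &example_count_ops, &nr_present);
	mmap_read_unlock(mm);

	if (ret)
		return ret;
	return nr_present;
}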

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller-defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{ … }

/**
 * folio_walk_start - walk the page tables to a folio
 * @fw: filled with information on success.
 * @vma: the VMA.
 * @addr: the virtual address to use for the page table walk.
 * @flags: flags modifying which folios to walk to.
 *
 * Walk the page tables using @addr in a given @vma to a mapped folio and
 * return the folio, making sure that the page table entry referenced by
 * @addr cannot change until folio_walk_end() has been called.
 *
 * By default, this function returns only folios that are not special (e.g.,
 * not the zeropage) and never returns folios that are supposed to be ignored
 * by the VM as documented by vm_normal_page(). If requested, zeropages will be
 * returned as well.
 *
 * By default, this function only considers present page table entries.
 * If requested, it will also consider migration entries.
 *
 * If this function returns NULL it might either indicate "there is nothing" or
 * "there is nothing suitable".
 *
 * On success, @fw is filled and the function returns the folio while the PTL
 * is still held and folio_walk_end() must be called to clean up,
 * releasing any held locks. The returned folio must *not* be used after the
 * call to folio_walk_end(), unless a short-term folio reference is taken
 * before that call.
 *
 * @fw->page will correspond to the page that is effectively referenced by
 * @addr. However, for migration entries and shared zeropages @fw->page is
 * set to NULL. Note that large folios might be mapped by multiple page table
 * entries, and this function will always only look up a single entry as
 * specified by @addr, which might or might not cover more than a single page
 * of the returned folio.
 *
 * This function must *not* be used as a naive replacement for
 * get_user_pages() / pin_user_pages(), especially not to perform DMA or
 * to carelessly modify page content. This function may *only* be used to grab
 * short-term folio references, never to grab long-term folio references.
 *
 * Using the page table entry pointers in @fw for reading or modifying the
 * entry should be avoided where possible: however, there might be valid
 * use cases.
 *
 * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of
 * care. For example, PMD page table sharing might require prior unsharing.
 * Also, logical hugetlb entries might span multiple physical page table
 * entries, which *must* be modified in a single operation (set_huge_pte_at(),
 * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
 * not correspond to the first physical entry of a logical hugetlb entry.
 *
 * The mmap lock must be held in read mode.
 *
 * Return: folio pointer on success, otherwise NULL.
 */
struct folio *folio_walk_start(struct folio_walk *fw,
			       struct vm_area_struct *vma, unsigned long addr,
			       folio_walk_flags_t flags)
{ … }
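
/*
 * Illustrative sketch only, not part of this file: taking a short-term
 * reference on the folio mapped at @addr with folio_walk_start() /
 * folio_walk_end(), as described in the documentation above. The function
 * name is hypothetical, and the caller is assumed to hold the mmap lock in
 * read mode; the error handling shown is an example only.
 */
static struct folio * __maybe_unused
example_get_mapped_folio(struct vm_area_struct *vma, unsigned long addr)
{
	struct folio_walk fw;
	struct folio *folio;

	/* The mmap lock must be held in read mode, as documented above. */
	mmap_assert_locked(vma->vm_mm);

	/* Default flags: present, non-special entries only. */
	folio = folio_walk_start(&fw, vma, addr, 0);
	if (!folio)
		return NULL;

	/* Grab a short-term reference before releasing the PTL. */
	folio_get(folio);
	folio_walk_end(&fw, vma);

	return folio;
}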