// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{ … }

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{ … }

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{ … }

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{ … }

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{ … }

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{ … }

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{ … }

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{ … }

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{ … }

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{ … }

static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{ … }

static inline void process_mm_walk_lock(struct mm_struct *mm,
					enum page_walk_lock walk_lock)
{ … }

static inline void process_vma_walk_lock(struct vm_area_struct *vma,
					 enum page_walk_lock walk_lock)
{ … }
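
/*
 * Illustrative sketch only, not part of this file: a caller-provided
 * ->test_walk() callback following the convention walk_page_test() expects:
 * return 0 to walk the current vma, 1 to skip it, or a negative error to
 * abort the walk. The function name and the VM_PFNMAP/VM_IO policy are
 * hypothetical and shown purely as an example.
 */
static int __maybe_unused example_test_walk(unsigned long start,
					    unsigned long end,
					    struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	/* Example policy: skip special mappings we don't want to visit. */
	if (vma->vm_flags & (VM_PFNMAP | VM_IO))
		return 1;

	return 0;
}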

/**
 * walk_page_range - walk page table with caller-specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : succeeded in handling the current entry; if you haven't reached the
 *         end address yet, continue to walk.
 *  - >0 : succeeded in handling the current entry, and return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with an error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass some
 * caller-specific data to the callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		    unsigned long end, const struct mm_walk_ops *ops,
		    void *private)
{ … }

/**
 * walk_page_range_novma - walk a range of page tables not backed by a vma
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @pgd: pgd to walk if different from mm->pgd
 * @private: private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked, this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 *
 * Note: Be careful when walking the kernel page tables; the caller may need to
 * take other effective measures (the mmap lock may be insufficient) to prevent
 * the intermediate kernel page tables belonging to the specified address range
 * from being freed (e.g. by memory hot-remove).
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd, void *private)
{ … }

int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
			unsigned long end, const struct mm_walk_ops *ops,
			void *private)
{ … }

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		  void *private)
{ … }
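
/*
 * Illustrative sketch only, not part of this file: typical caller-side usage
 * of walk_page_range() with a pte_entry() callback, here counting present
 * PTEs in a range. The names (example_count_pte, example_count_ops,
 * example_count_present) are hypothetical; the ops structure, callback
 * signature and locking follow the documentation above, but this is an
 * example, not a definitive recipe.
 */
static int __maybe_unused example_count_pte(pte_t *pte, unsigned long addr,
					    unsigned long next,
					    struct mm_walk *walk)
{
	unsigned long *nr_present = walk->private;

	/* Count only entries that are actually present. */
	if (pte_present(ptep_get(pte)))
		(*nr_present)++;

	return 0;
}

static const struct mm_walk_ops example_count_ops __maybe_unused = {
	.pte_entry	= example_count_pte,
	.walk_lock	= PGWALK_RDLOCK,
};

static long __maybe_unused example_count_present(struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end)
{
	unsigned long nr_present = 0;
	int ret;

	/* walk_page_range() requires the mmap lock, as documented above. */
	mmap_read_lock(mm);
	ret = walk_page_range(mm, start, end, &example_count_ops, &nr_present);
	mmap_read_unlock(mm);

	if (ret)
		return ret;
	return nr_present;
}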

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller-defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{ … }

/**
 * folio_walk_start - walk the page tables to a folio
 * @fw: filled with information on success.
 * @vma: the VMA.
 * @addr: the virtual address to use for the page table walk.
 * @flags: flags modifying which folios to walk to.
 *
 * Walk the page tables using @addr in a given @vma to a mapped folio and
 * return the folio, making sure that the page table entry referenced by
 * @addr cannot change until folio_walk_end() has been called.
 *
 * By default, this function returns only folios that are not special (e.g.,
 * not the zeropage) and never returns folios that are supposed to be ignored
 * by the VM as documented by vm_normal_page(). If requested, zeropages will be
 * returned as well.
 *
 * By default, this function only considers present page table entries.
 * If requested, it will also consider migration entries.
 *
 * If this function returns NULL it might either indicate "there is nothing" or
 * "there is nothing suitable".
 *
 * On success, @fw is filled and the function returns the folio while the PTL
 * is still held and folio_walk_end() must be called to clean up,
 * releasing any held locks. The returned folio must *not* be used after the
 * call to folio_walk_end(), unless a short-term folio reference is taken
 * before that call.
 *
 * @fw->page will correspond to the page that is effectively referenced by
 * @addr. However, for migration entries and shared zeropages @fw->page is
 * set to NULL. Note that large folios might be mapped by multiple page table
 * entries, and this function will always only look up a single entry as
 * specified by @addr, which might or might not cover more than a single page
 * of the returned folio.
 *
 * This function must *not* be used as a naive replacement for
 * get_user_pages() / pin_user_pages(), especially not to perform DMA or
 * to carelessly modify page content. This function may *only* be used to grab
 * short-term folio references, never to grab long-term folio references.
 *
 * Using the page table entry pointers in @fw for reading or modifying the
 * entry should be avoided where possible: however, there might be valid
 * use cases.
 *
 * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of
 * care. For example, PMD page table sharing might require prior unsharing.
 * Also, logical hugetlb entries might span multiple physical page table
 * entries, which *must* be modified in a single operation (set_huge_pte_at(),
 * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
 * not correspond to the first physical entry of a logical hugetlb entry.
 *
 * The mmap lock must be held in read mode.
 *
 * Return: folio pointer on success, otherwise NULL.
 */
struct folio *folio_walk_start(struct folio_walk *fw,
			       struct vm_area_struct *vma, unsigned long addr,
			       folio_walk_flags_t flags)
{ … }
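
/*
 * Illustrative sketch only, not part of this file: taking a short-term
 * reference on the folio mapped at @addr with folio_walk_start() /
 * folio_walk_end(), as described in the documentation above. The function
 * name is hypothetical, and the caller is assumed to hold the mmap lock in
 * read mode; the error handling shown is an example only.
 */
static struct folio * __maybe_unused
example_get_mapped_folio(struct vm_area_struct *vma, unsigned long addr)
{
	struct folio_walk fw;
	struct folio *folio;

	/* The mmap lock must be held in read mode, as documented above. */
	mmap_assert_locked(vma->vm_mm);

	/* Default flags: present, non-special entries only. */
	folio = folio_walk_start(&fw, vma, addr, 0);
	if (!folio)
		return NULL;

	/* Grab a short-term reference before releasing the PTL. */
	folio_get(folio);
	folio_walk_end(&fw, vma);

	return folio;
}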