// SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/memfd.h> #include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/secretmem.h> #include <linux/sched/signal.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> #include <linux/migrate.h> #include <linux/mm_inline.h> #include <linux/pagevec.h> #include <linux/sched/mm.h> #include <linux/shmem_fs.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include "internal.h" struct follow_page_context { … }; static inline void sanity_check_pinned_pages(struct page **pages, unsigned long npages) { … } /* * Return the folio with ref appropriately incremented, * or NULL if that failed. */ static inline struct folio *try_get_folio(struct page *page, int refs) { … } static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) { … } /** * try_grab_folio() - add a folio's refcount by a flag-dependent amount * @folio: pointer to folio to be grabbed * @refs: the value to (effectively) add to the folio's refcount * @flags: gup flags: these are the FOLL_* flag values * * This might not do anything at all, depending on the flags argument. * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same * time. * * Return: 0 for success, or if no action was required (if neither FOLL_PIN * nor FOLL_GET was set, nothing is done). A negative error code for failure: * * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not * be grabbed. * * It is called when we have a stable reference for the folio, typically in * GUP slow path. */ int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags) { … } /** * unpin_user_page() - release a dma-pinned page * @page: pointer to page to be released * * Pages that were pinned via pin_user_pages*() must be released via either * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so * that such pages can be separately tracked and uniquely handled. In * particular, interactions with RDMA and filesystems need special handling. */ void unpin_user_page(struct page *page) { … } EXPORT_SYMBOL(…); /** * unpin_folio() - release a dma-pinned folio * @folio: pointer to folio to be released * * Folios that were pinned via memfd_pin_folios() or other similar routines * must be released either using unpin_folio() or unpin_folios(). */ void unpin_folio(struct folio *folio) { … } EXPORT_SYMBOL_GPL(…); /** * folio_add_pin - Try to get an additional pin on a pinned folio * @folio: The folio to be pinned * * Get an additional pin on a folio we already have a pin on. Makes no change * if the folio is a zero_page. */ void folio_add_pin(struct folio *folio) { … } static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { … } static inline struct folio *gup_folio_next(struct page **list, unsigned long npages, unsigned long i, unsigned int *ntails) { … } /** * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. * @npages: number of pages in the @pages array. * @make_dirty: whether to mark the pages dirty * * "gup-pinned page" refers to a page that has had one of the get_user_pages() * variants called on that page. * * For each page in the @pages array, make that page (or its head page, if a * compound page) dirty, if @make_dirty is true, and if the page was previously * listed as clean. In any case, releases all pages using unpin_user_page(), * possibly via unpin_user_pages(), for the non-dirty case. * * Please see the unpin_user_page() documentation for details. * * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: * set_page_dirty_lock(), unpin_user_page(). * */ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { … } EXPORT_SYMBOL(…); /** * unpin_user_page_range_dirty_lock() - release and optionally dirty * gup-pinned page range * * @page: the starting page of a range maybe marked dirty, and definitely released. * @npages: number of consecutive pages to release. * @make_dirty: whether to mark the pages dirty * * "gup-pinned page range" refers to a range of pages that has had one of the * pin_user_pages() variants called on that page. * * For the page ranges defined by [page .. page+npages], make that range (or * its head pages, if a compound page) dirty, if @make_dirty is true, and if the * page range was previously listed as clean. * * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: * set_page_dirty_lock(), unpin_user_page(). * */ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty) { … } EXPORT_SYMBOL(…); static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages) { … } /** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * * For each page in the @pages array, release the page using unpin_user_page(). * * Please see the unpin_user_page() documentation for details. */ void unpin_user_pages(struct page **pages, unsigned long npages) { … } EXPORT_SYMBOL(…); /** * unpin_user_folio() - release pages of a folio * @folio: pointer to folio to be released * @npages: number of pages of same folio * * Release npages of the folio */ void unpin_user_folio(struct folio *folio, unsigned long npages) { … } EXPORT_SYMBOL(…); /** * unpin_folios() - release an array of gup-pinned folios. * @folios: array of folios to be marked dirty and released. * @nfolios: number of folios in the @folios array. * * For each folio in the @folios array, release the folio using gup_put_folio. * * Please see the unpin_folio() documentation for details. */ void unpin_folios(struct folio **folios, unsigned long nfolios) { … } EXPORT_SYMBOL_GPL(…); /* * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) { … } #ifdef CONFIG_MMU #ifdef CONFIG_HAVE_GUP_FAST static int record_subpages(struct page *page, unsigned long sz, unsigned long addr, unsigned long end, struct page **pages) { … } /** * try_grab_folio_fast() - Attempt to get or pin a folio in fast path. * @page: pointer to page to be grabbed * @refs: the value to (effectively) add to the folio's refcount * @flags: gup flags: these are the FOLL_* flag values. * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the * same time. (That's true throughout the get_user_pages*() and * pin_user_pages*() APIs.) Cases: * * FOLL_GET: folio's refcount will be incremented by @refs. * * FOLL_PIN on large folios: folio's refcount will be incremented by * @refs, and its pincount will be incremented by @refs. * * FOLL_PIN on single-page folios: folio's refcount will be incremented by * @refs * GUP_PIN_COUNTING_BIAS. * * Return: The folio containing @page (with refcount appropriately * incremented) for success, or NULL upon failure. If neither FOLL_GET * nor FOLL_PIN was set, that's considered failure, and furthermore, * a likely bug in the caller, so a warning is also emitted. * * It uses add ref unless zero to elevate the folio refcount and must be called * in fast path only. */ static struct folio *try_grab_folio_fast(struct page *page, int refs, unsigned int flags) { … } #endif /* CONFIG_HAVE_GUP_FAST */ static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags, unsigned long address) { … } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, struct follow_page_context *ctx) { … } /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, struct vm_area_struct *vma, unsigned int flags) { … } static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, struct follow_page_context *ctx) { … } #else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, struct follow_page_context *ctx) { return NULL; } static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, struct follow_page_context *ctx) { return NULL; } #endif /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { … } /* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */ static inline bool can_follow_write_pte(pte_t pte, struct page *page, struct vm_area_struct *vma, unsigned int flags) { … } static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, struct dev_pagemap **pgmap) { … } static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, struct follow_page_context *ctx) { … } static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, unsigned int flags, struct follow_page_context *ctx) { … } static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, unsigned int flags, struct follow_page_context *ctx) { … } /** * follow_page_mask - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a * pointer to output page_mask * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches * the device's dev_pagemap metadata to avoid repeating expensive lookups. * * When getting an anonymous page and the caller has to trigger unsharing * of a shared anonymous page first, -EMLINK is returned. The caller should * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only * relevant with FOLL_PIN and !FOLL_WRITE. * * On output, the @ctx->page_mask is set according to the size of the page. * * Return: the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { … } static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) { … } /* * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set * to 0 and -EBUSY returned. */ static int faultin_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags, bool unshare, int *locked) { … } /* * Writing to file-backed mappings which require folio dirty tracking using GUP * is a fundamentally broken operation, as kernel write access to GUP mappings * do not adhere to the semantics expected by a file system. * * Consider the following scenario:- * * 1. A folio is written to via GUP which write-faults the memory, notifying * the file system and dirtying the folio. * 2. Later, writeback is triggered, resulting in the folio being cleaned and * the PTE being marked read-only. * 3. The GUP caller writes to the folio, as it is mapped read/write via the * direct mapping. * 4. The GUP caller, now done with the page, unpins it and sets it dirty * (though it does not have to). * * This results in both data being written to a folio without writenotify, and * the folio being dirtied unexpectedly (if the caller decides to do so). */ static bool writable_file_mapping_allowed(struct vm_area_struct *vma, unsigned long gup_flags) { … } static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { … } /* * This is "vma_lookup()", but with a warning if we would have * historically expanded the stack in the GUP code. */ static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm, unsigned long addr) { … } /** * __get_user_pages() - pin user pages in memory * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @locked: whether we're still with the mmap_lock held * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * -- 0 return value is possible when the fault would need to be retried. * * The caller is responsible for releasing returned @pages, via put_page(). * * Must be called with mmap_lock held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * __get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If * the page is written to, set_page_dirty (or set_page_dirty_lock, as * appropriate) must be called after the page is finished with, and * before put_page is called. * * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may * be released. If this happens *@locked will be set to 0 on return. * * A caller using such a combination of @gup_flags must therefore hold the * mmap_lock for reading only, and recognize when it's been released. Otherwise, * it must be held for either reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { … } static bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { … } /** * fixup_user_fault() - manually resolve a user page fault * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller * does not allow retry. If NULL, the caller must guarantee * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() * section), this returns -EFAULT, and we want to resolve the user fault before * trying again. * * Typically this is meant to be used by the futex code. * * The main difference with get_user_pages() is that this function will * unconditionally call handle_mm_fault() which will in turn perform all the * necessary SW fixup of the dirty and young bits in the PTE, while * get_user_pages() only guarantees to update these in the struct page. * * This is important for some architectures where those bits also gate the * access permission to the page because they are maintained in software. On * such architectures, gup() will not be enough to make a subsequent access * succeed. * * This function will not return with an unlocked mmap_lock. So it has not the * same semantics wrt the @mm->mmap_lock as does filemap_fault(). */ int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { … } EXPORT_SYMBOL_GPL(…); /* * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is * specified, it'll also respond to generic signals. The caller of GUP * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption. */ static bool gup_signal_pending(unsigned int flags) { … } /* * Locking: (*locked == 1) means that the mmap_lock has already been acquired by * the caller. This function may drop the mmap_lock. If it does so, then it will * set (*locked = 0). * * (*locked == 0) means that the caller expects this function to acquire and * drop the mmap_lock. Therefore, the value of *locked will still be zero when * the function returns, even though it may have changed temporarily during * function execution. * * Please note that this function, unlike __get_user_pages(), will not return 0 * for nr_pages > 0, unless FOLL_NOWAIT is used. */ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int flags) { … } /** * populate_vma_page_range() - populate a range of pages in the vma. * @vma: target vma * @start: start address * @end: end address * @locked: whether the mmap_lock is still held * * This takes care of mlocking the pages too if VM_LOCKED is set. * * Return either number of pages pinned in the vma, or a negative error * code on error. * * vma->vm_mm->mmap_lock must be held. * * If @locked is NULL, it may be held for read or write and will * be unperturbed. * * If @locked is non-NULL, it must held for read only and may be * released. If it's released, *@locked will be set to 0. */ long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked) { … } /* * faultin_page_range() - populate (prefault) page tables inside the * given range readable/writable * * This takes care of mlocking the pages, too, if VM_LOCKED is set. * * @mm: the mm to populate page tables in * @start: start address * @end: end address * @write: whether to prefault readable or writable * @locked: whether the mmap_lock is still held * * Returns either number of processed pages in the MM, or a negative error * code on error (see __get_user_pages()). Note that this function reports * errors related to VMAs, such as incompatible mappings, as expected by * MADV_POPULATE_(READ|WRITE). * * The range must be page-aligned. * * mm->mmap_lock must be held. If it's released, *@locked will be set to 0. */ long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked) { … } /* * __mm_populate - populate and/or mlock pages within a range of address space. * * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap * flags. VMAs must be already marked with the desired vm_flags, and * mmap_lock must not be held. */ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { … } #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int foll_flags) { struct vm_area_struct *vma; bool must_unlock = false; unsigned long vm_flags; long i; if (!nr_pages) return 0; /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; *locked = 1; } /* calculate required read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. */ vm_flags = (foll_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (foll_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) break; /* protect what we can, including chardevs */ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) break; if (pages) { pages[i] = virt_to_page((void *)start); if (pages[i]) get_page(pages[i]); } start = (start + PAGE_SIZE) & PAGE_MASK; } if (must_unlock && *locked) { mmap_read_unlock(mm); *locked = 0; } return i ? : -EFAULT; } #endif /* !CONFIG_MMU */ /** * fault_in_writeable - fault in userspace address range for writing * @uaddr: start of address range * @size: size of address range * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_writeable(char __user *uaddr, size_t size) { … } EXPORT_SYMBOL(…); /** * fault_in_subpage_writeable - fault in an address range for writing * @uaddr: start of address range * @size: size of address range * * Fault in a user address range for writing while checking for permissions at * sub-page granularity (e.g. arm64 MTE). This function should be used when * the caller cannot guarantee forward progress of a copy_to_user() loop. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_subpage_writeable(char __user *uaddr, size_t size) { … } EXPORT_SYMBOL(…); /* * fault_in_safe_writeable - fault in an address range for writing * @uaddr: start of address range * @size: length of address range * * Faults in an address range for writing. This is primarily useful when we * already know that some or all of the pages in the address range aren't in * memory. * * Unlike fault_in_writeable(), this function is non-destructive. * * Note that we don't pin or otherwise hold the pages referenced that we fault * in. There's no guarantee that they'll stay in memory for any duration of * time. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { … } EXPORT_SYMBOL(…); /** * fault_in_readable - fault in userspace address range for reading * @uaddr: start of user address range * @size: size of user address range * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_readable(const char __user *uaddr, size_t size) { … } EXPORT_SYMBOL(…); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). * * Returns NULL on any kind of failure - a hole must then be inserted into * the corefile, to preserve alignment with its headers; and also returns * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - * allowing a hole to be left in the corefile to save disk space. * * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr) { … } #endif /* CONFIG_ELF_CORE */ #ifdef CONFIG_MIGRATION /* * Returns the number of collected folios. Return value is always >= 0. */ static unsigned long collect_longterm_unpinnable_folios( struct list_head *movable_folio_list, unsigned long nr_folios, struct folio **folios) { … } /* * Unpins all folios and migrates device coherent folios and movable_folio_list. * Returns -EAGAIN if all folios were successfully migrated or -errno for * failure (or partial success). */ static int migrate_longterm_unpinnable_folios( struct list_head *movable_folio_list, unsigned long nr_folios, struct folio **folios) { … } /* * Check whether all folios are *allowed* to be pinned indefinitely (long term). * Rather confusingly, all folios in the range are required to be pinned via * FOLL_PIN, before calling this routine. * * Return values: * * 0: if everything is OK and all folios in the range are allowed to be pinned, * then this routine leaves all folios pinned and returns zero for success. * * -EAGAIN: if any folios in the range are not allowed to be pinned, then this * routine will migrate those folios away, unpin all the folios in the range. If * migration of the entire set of folios succeeds, then -EAGAIN is returned. The * caller should re-pin the entire range with FOLL_PIN and then call this * routine again. * * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this * indicates a migration failure. The caller should give up, and propagate the * error back up the call stack. The caller does not need to unpin any folios in * that case, because this routine will do the unpinning. */ static long check_and_migrate_movable_folios(unsigned long nr_folios, struct folio **folios) { … } /* * Return values and behavior are the same as those for * check_and_migrate_movable_folios(). */ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { … } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { return 0; } static long check_and_migrate_movable_folios(unsigned long nr_folios, struct folio **folios) { return 0; } #endif /* CONFIG_MIGRATION */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which * allows us to process the FOLL_LONGTERM flag. */ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int gup_flags) { … } /* * Check that the given flags are valid for the exported gup/pup interface, and * update them with the required flags that the caller must have set. */ static bool is_valid_gup_args(struct page **pages, int *locked, unsigned int *gup_flags_p, unsigned int to_set) { … } #ifdef CONFIG_MMU /** * get_user_pages_remote() - pin user pages in memory * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * * The caller is responsible for releasing returned @pages, via put_page(). * * Must be called with mmap_lock held for read or write. * * get_user_pages_remote walks a process's page tables and takes a reference * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * * get_user_pages_remote is typically used for fewer-copy IO operations, * to get a handle on the memory by some means other than accesses * via the user virtual addresses. The pages may be submitted for * DMA to devices or accessed via their kernel linear mapping (via the * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. * * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { … } EXPORT_SYMBOL(…); #else /* CONFIG_MMU */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { return 0; } #endif /* !CONFIG_MMU */ /** * get_user_pages() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * * This is the same as get_user_pages_remote(), just with a less-flexible * calling convention where we assume that the mm being operated on belongs to * the current task, and doesn't allow passing of a locked parameter. We also * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { … } EXPORT_SYMBOL(…); /* * get_user_pages_unlocked() is suitable to replace the form: * * mmap_read_lock(mm); * get_user_pages(mm, ..., pages, NULL); * mmap_read_unlock(mm); * * with: * * get_user_pages_unlocked(mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so * get_user_pages_fast should be used instead if specific gup_flags * (e.g. FOLL_FORCE) are not required. */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { … } EXPORT_SYMBOL(…); /* * GUP-fast * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be * protected from page table pages being freed from under it, and should * block any THP splits. * * One way to achieve this is to have the walker disable interrupts, and * rely on IPIs from the TLB flushing code blocking before the page table * pages are freed. This is unsuitable for architectures that do not need * to broadcast an IPI when invalidating TLBs. * * Another way to achieve this is to batch up page table containing pages * belonging to more than one mm_user, then rcu_sched a callback to free those * pages. Disabling interrupts will allow the gup_fast() walker to both block * the rcu_sched callback, and an IPI that we broadcast for splitting THPs * (which is a relatively rare event). The code below adopts this strategy. * * Before activating this code, please be aware that the following assumptions * are currently made: * * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. * * *) access_ok is sufficient to validate userspace address ranges. * * The last two assumptions can be relaxed by the addition of helper functions. * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ #ifdef CONFIG_HAVE_GUP_FAST /* * Used in the GUP-fast path to determine whether GUP is permitted to work on * a specific folio. * * This call assumes the caller has pinned the folio, that the lowest page table * level still points to this folio, and that interrupts have been disabled. * * GUP-fast must reject all secretmem folios. * * Writing to pinned file-backed dirty tracked folios is inherently problematic * (see comment describing the writable_file_mapping_allowed() function). We * therefore try to avoid the most egregious case of a long-term mapping doing * so. * * This function cannot be as thorough as that one as the VMA is not available * in the fast path, so instead we whitelist known good cases and if in doubt, * fall back to the slow path. */ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) { … } static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) { … } #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * GUP-fast relies on pte change detection to avoid concurrent pgtable * operations. * * To pin the page, GUP-fast needs to do below in order: * (1) pin the page (by prefetching pte), then (2) check pte not changed. * * For the rest of pgtable operations where pgtable updates can be racy * with GUP-fast, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * * For THP collapse, it's a bit more complicated because GUP-fast may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). */ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { … } #else /* * If we can't determine whether or not a pte is special, then fail immediately * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_fast_pmd_leaf even if we can't operate on ptes. */ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { … } static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { … } static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { … } #else static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } #endif static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { … } static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { … } static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { … } static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { … } static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { … } static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { … } static void gup_fast_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { … } #else static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { } #endif /* CONFIG_HAVE_GUP_FAST */ #ifndef gup_fast_permitted /* * Check if it's allowed to use get_user_pages_fast_only() for the range, or * we need to fall back to the slow version: */ static bool gup_fast_permitted(unsigned long start, unsigned long end) { return true; } #endif static unsigned long gup_fast(unsigned long start, unsigned long end, unsigned int gup_flags, struct page **pages) { … } static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { … } /** * get_user_pages_fast_only() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * * If the architecture does not support this function, simply return with no * pages pinned. * * Careful, careful! COW breaking can go either way, so a non-write * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { … } EXPORT_SYMBOL_GPL(…); /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Attempt to pin user pages in memory without taking mm->mmap_lock. * If not successful, it will fall back to taking the lock and * calling get_user_pages(). * * Returns number of pages pinned. This may be fewer than the number requested. * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns * -errno. */ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { … } EXPORT_SYMBOL_GPL(…); /** * pin_user_pages_fast() - pin user pages in memory without taking locks * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See * get_user_pages_fast() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page() will not remove pins from it. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { … } EXPORT_SYMBOL_GPL(…); /** * pin_user_pages_remote() - pin pages of a remote process * * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See * get_user_pages_remote() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { … } EXPORT_SYMBOL(…); /** * pin_user_pages() - pin user pages in memory for use by other devices * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { … } EXPORT_SYMBOL(…); /* * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets * FOLL_PIN and rejects FOLL_GET. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { … } EXPORT_SYMBOL(…); /** * memfd_pin_folios() - pin folios associated with a memfd * @memfd: the memfd whose folios are to be pinned * @start: the first memfd offset * @end: the last memfd offset (inclusive) * @folios: array that receives pointers to the folios pinned * @max_folios: maximum number of entries in @folios * @offset: the offset into the first folio * * Attempt to pin folios associated with a memfd in the contiguous range * [start, end]. Given that a memfd is either backed by shmem or hugetlb, * the folios can either be found in the page cache or need to be allocated * if necessary. Once the folios are located, they are all pinned via * FOLL_PIN and @offset is populatedwith the offset into the first folio. * And, eventually, these pinned folios must be released either using * unpin_folios() or unpin_folio(). * * It must be noted that the folios may be pinned for an indefinite amount * of time. And, in most cases, the duration of time they may stay pinned * would be controlled by the userspace. This behavior is effectively the * same as using FOLL_LONGTERM with other GUP APIs. * * Returns number of folios pinned, which could be less than @max_folios * as it depends on the folio sizes that cover the range [start, end]. * If no folios were pinned, it returns -errno. */ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, struct folio **folios, unsigned int max_folios, pgoff_t *offset) { … } EXPORT_SYMBOL_GPL(…);