linux/mm/huge_memory.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt)

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =;
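
/*
 * Illustrative sketch only (the actual initializer is elided above): the
 * default flag mask is typically assembled from the THP Kconfig choices,
 * roughly along these lines.
 */
#if 0
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1 << TRANSPARENT_HUGEPAGE_FLAG) |
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) |
#endif
	(1 << TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) |
	(1 << TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG) |
	(1 << TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
#endif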

static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
					  struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
					 struct shrink_control *sc);

static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly =;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 unsigned long vm_flags,
					 unsigned long tva_flags,
					 unsigned long orders)
{}

static bool get_huge_zero_page(void)
{}

static void put_huge_zero_page(void)
{}

struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{}

void mm_put_huge_zero_folio(struct mm_struct *mm)
{}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{}
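
/*
 * Illustrative sketch of the huge zero folio lifetime scheme (the real
 * bodies are elided above): a successful allocation installs the folio with
 * an extra reference biased into huge_zero_refcount, so the folio is only
 * freed by the shrinker once every mm has dropped its reference.
 */
#if 0
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* Only the biased reference is left: the folio is freeable. */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);

		WRITE_ONCE(huge_zero_pfn, ~0UL);
		folio_put(zero_folio);
		return HPAGE_PMD_NR;
	}
	return 0;
}
#endif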

static struct shrinker *huge_zero_page_shrinker;

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{}

static struct kobj_attribute enabled_attr =;
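
/*
 * Illustrative sketch (the actual initializer is elided above): sysfs
 * attributes of this kind are normally wired up with __ATTR(), pairing the
 * show/store handlers defined above.
 */
#if 0
static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);
#endif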

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{}
static struct kobj_attribute defrag_attr =;

static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{}
static struct kobj_attribute use_zero_page_attr =;

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{}
static struct kobj_attribute hpage_pmd_size_attr =;

static struct attribute *hugepage_attr[] =;

static const struct attribute_group hugepage_attr_group =;

static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
static DEFINE_SPINLOCK(huge_anon_orders_lock);
static LIST_HEAD(thpsize_list);

static ssize_t thpsize_enabled_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{}

static ssize_t thpsize_enabled_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{}

static struct kobj_attribute thpsize_enabled_attr =;

static struct attribute *thpsize_attrs[] =;

static const struct attribute_group thpsize_attr_group =;

static const struct kobj_type thpsize_ktype =;

DEFINE_PER_CPU(struct mthp_stat, mthp_stats) =;

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{}

#define DEFINE_MTHP_STAT_ATTR(_name, _index)

DEFINE_MTHP_STAT_ATTR();
DEFINE_MTHP_STAT_ATTR();
DEFINE_MTHP_STAT_ATTR();
DEFINE_MTHP_STAT_ATTR();
DEFINE_MTHP_STAT_ATTR();
DEFINE_MTHP_STAT_ATTR();
DEFINE_MTHP_STAT_ATTR();
DEFINE_MTHP_STAT_ATTR();
DEFINE_MTHP_STAT_ATTR();
DEFINE_MTHP_STAT_ATTR();
DEFINE_MTHP_STAT_ATTR();
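
/*
 * Illustrative sketch of what DEFINE_MTHP_STAT_ATTR() plausibly expands to
 * (the macro body and the per-stat argument lists are elided above): a
 * read-only sysfs attribute whose show handler sums the per-CPU counter for
 * the kobject's order. to_thpsize() is assumed to map the kobject back to
 * its struct thpsize.
 */
#if 0
#define DEFINE_MTHP_STAT_ATTR(_name, _index)				\
static ssize_t _name##_show(struct kobject *kobj,			\
			struct kobj_attribute *attr, char *buf)		\
{									\
	int order = to_thpsize(kobj)->order;				\
									\
	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));	\
}									\
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#endif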

static struct attribute *stats_attrs[] =;

static struct attribute_group stats_attr_group =;

static struct thpsize *thpsize_create(int order, struct kobject *parent)
{}

static void thpsize_release(struct kobject *kobj)
{}

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init thp_shrinker_init(void)
{}
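
/*
 * Illustrative sketch (body elided above), assuming the shrinker_alloc()/
 * shrinker_register() API: both shrinkers declared at the top of this file
 * are allocated, given their count/scan callbacks, and registered here.
 */
#if 0
static int __init thp_shrinker_init(void)
{
	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
	if (!huge_zero_page_shrinker)
		return -ENOMEM;

	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE |
						 SHRINKER_NONSLAB,
						 "thp-deferred_split");
	if (!deferred_split_shrinker) {
		shrinker_free(huge_zero_page_shrinker);
		return -ENOMEM;
	}

	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
	shrinker_register(huge_zero_page_shrinker);

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);

	return 0;
}
#endif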

static void __init thp_shrinker_exit(void)
{}

static int __init hugepage_init(void)
{}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{}
__setup("transparent_hugepage=", setup_transparent_hugepage);

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{}

#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	return &pgdat->deferred_split_queue;
}
#endif

static inline bool is_transparent_hugepage(const struct folio *folio)
{}

static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size,
		vm_flags_t vm_flags)
{}

unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags,
		vm_flags_t vm_flags)
{}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{}
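
/*
 * Illustrative sketch of how the modes listed above typically translate to
 * GFP flags (the real body is elided): "always" performs synchronous
 * compaction, "defer" only kicks kswapd/kcompactd, and the madvise variants
 * key off VM_HUGEPAGE. The exact flag-to-mode mapping shown here is an
 * assumption.
 */
#if 0
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

	/* always: stall for reclaim/compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* defer: wake kswapd/kcompactd, fail quickly otherwise */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
		     &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/*
	 * defer+madvise and madvise follow the same pattern, keyed on
	 * vma_madvised; "never" falls through to the light mask.
	 */
	return GFP_TRANSHUGE_LIGHT;
}
#endif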

/* Caller must hold page table lock. */
static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct folio *zero_folio)
{}

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{}

static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)
{}

/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
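
/*
 * Illustrative usage sketch, not part of this file: a hypothetical driver's
 * ->huge_fault() handler mapping PMD-sized device memory. my_dev_phys() is
 * an assumed helper returning the backing physical address, and the
 * ->huge_fault(vmf, order) signature is assumed from recent kernels.
 */
#if 0
static vm_fault_t my_drv_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	phys_addr_t phys;

	if (order != PMD_ORDER)
		return VM_FAULT_FALLBACK;

	phys = my_dev_phys(vmf->vma, vmf->address & PMD_MASK);
	return vmf_insert_pfn_pmd(vmf, phys_to_pfn_t(phys, PFN_DEV | PFN_MAP),
				  vmf->flags & FAULT_FLAG_WRITE);
}
#endif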

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{}

static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, pfn_t pfn, bool write)
{}

/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
	       pmd_t *pmd, bool write)
{}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
	       pud_t *pud, bool write)
{}

int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
		  struct vm_area_struct *vma)
{}

void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void huge_pmd_set_accessed(struct vm_fault *vmf)
{}

vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{}

static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
					   unsigned long addr, pmd_t pmd)
{}

/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{}

/*
 * Return true if we do MADV_FREE successfully on the entire pmd page.
 * Otherwise, return false.
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long addr, unsigned long next)
{}

static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{}

int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{}

#ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{}
#endif

static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{}

bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{}

/*
 * Returns
 *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
 *      or if prot_numa but THP migration is not supported
 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
 */
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
		    unsigned long cp_flags)
{}
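
/*
 * Illustrative caller sketch only (hypothetical helper, assumed semantics):
 * how the return values documented above are meant to be consumed.
 */
#if 0
static bool my_try_change_huge_pmd(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
		pgprot_t newprot, unsigned long cp_flags, unsigned long *pages)
{
	int nr_ptes = change_huge_pmd(tlb, vma, pmd, addr, newprot, cp_flags);

	if (!nr_ptes)
		return false;		/* not a thp; caller walks the ptes */
	if (nr_ptes == HPAGE_PMD_NR)
		*pages += HPAGE_PMD_NR;	/* changed; TLB flush required */
	return true;			/* huge pmd fully handled */
}
#endif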

#ifdef CONFIG_USERFAULTFD
/*
 * The PT lock for src_pmd and the locks protecting dst_vma/src_vma (taken
 * for reading) are held by the caller; this function must release the
 * page_table_lock before returning.
 * Just move the page from src_pmd to dst_pmd if possible.
 * Return zero on success, -EAGAIN if the operation needs to be repeated by
 * the caller, or another error code on failure.
 */
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
			struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
			unsigned long dst_addr, unsigned long src_addr)
{}
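
/*
 * Userspace-side illustration only (not kernel code): move_pages_huge_pmd()
 * sits behind the UFFDIO_MOVE ioctl. A caller moves a range within one mm
 * roughly like this; field and flag names are from the uffd ABI, while
 * uffd_fd, dst_addr, src_addr and len are assumed to exist in the caller.
 */
#if 0
	struct uffdio_move move = {
		.dst = (__u64)(unsigned long)dst_addr,
		.src = (__u64)(unsigned long)src_addr,
		.len = len,
		.mode = 0,	/* or UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES */
	};

	if (ioctl(uffd_fd, UFFDIO_MOVE, &move) < 0 || move.move != len)
		;	/* handle errors / partial moves; -EAGAIN means retry */
#endif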
#endif /* CONFIG_USERFAULTFD */

/*
 * Returns the page table lock pointer if a given pmd maps a thp, NULL otherwise.
 *
 * Note that on success this routine returns without unlocking the page table
 * lock, so callers must unlock it themselves.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{}
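
/*
 * Illustrative caller pattern (assumed): most users go through the
 * pmd_trans_huge_lock() wrapper and must drop the returned lock themselves,
 * as the comment above requires.
 */
#if 0
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;	/* not a huge pmd: fall back to pte level */

	/* ... operate on the pmd-mapped thp ... */

	spin_unlock(ptl);
#endif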

/*
 * Returns the page table lock pointer if a given pud maps a thp, NULL otherwise.
 *
 * Note that on success this routine returns without unlocking the page table
 * lock, so callers must unlock it themselves.
 */
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pud_t *pud, unsigned long addr)
{}

static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
		unsigned long haddr)
{}

void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
		unsigned long address)
{}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
		unsigned long haddr, pmd_t *pmd)
{}

static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long haddr, bool freeze)
{}

void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmd, bool freeze, struct folio *folio)
{}

void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long address, bool freeze, struct folio *folio)
{}

void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
		bool freeze, struct folio *folio)
{}

static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
{}

void vma_adjust_trans_huge(struct vm_area_struct *vma,
			     unsigned long start,
			     unsigned long end,
			     long adjust_next)
{}

static void unmap_folio(struct folio *folio)
{}

static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
					    unsigned long addr, pmd_t *pmdp,
					    struct folio *folio)
{}

bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
			   pmd_t *pmdp, struct folio *folio)
{}

static void remap_page(struct folio *folio, unsigned long nr)
{}

static void lru_add_page_tail(struct page *head, struct page *tail,
		struct lruvec *lruvec, struct list_head *list)
{}

static void __split_huge_page_tail(struct folio *folio, int tail,
		struct lruvec *lruvec, struct list_head *list,
		unsigned int new_order)
{}

static void __split_huge_page(struct page *page, struct list_head *list,
		pgoff_t end, unsigned int new_order)
{}

/* Racy check whether the huge page can be split */
bool can_split_folio(struct folio *folio, int *pextra_pins)
{}

/*
 * This function splits a large folio into smaller folios of order @new_order.
 * @page can point to any page of the large folio to split. The split operation
 * does not change the position of @page.
 *
 * Prerequisites:
 *
 * 1) The caller must hold a reference on the @page's owning folio, also known
 *    as the large folio.
 *
 * 2) The large folio must be locked.
 *
 * 3) The folio must not be pinned. Any unexpected folio references, including
 *    GUP pins, will result in the folio not getting split; instead, the caller
 *    will receive an -EAGAIN.
 *
 * 4) @new_order > 1, usually. Splitting anonymous folios to order 1 is not
 *    supported, because folio->_deferred_list, which is used by partially
 *    mapped folios, is stored in subpage 2, but an order-1 folio only has
 *    subpages 0 and 1. File-backed order-1 folios are supported, since they
 *    do not use _deferred_list.
 *
 * After splitting, the caller's folio reference will be transferred to @page,
 * resulting in a raised refcount of @page after this call. The other pages may
 * be freed if they are not mapped.
 *
 * If @list is null, tail pages will be added to the LRU list; otherwise, they
 * are added to @list.
 *
 * The resulting folios of order @new_order inherit the mapping, flags, and so
 * on from the huge page.
 *
 * Returns 0 if the huge page was split successfully.
 *
 * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
 * the folio was concurrently removed from the page cache.
 *
 * Returns -EBUSY when trying to split the huge zeropage, if the folio is
 * under writeback, if fs-specific folio metadata cannot currently be
 * released, or if some unexpected race happened (e.g., anon VMA disappeared,
 * truncation).
 *
 * Returns -EINVAL when trying to split to an order that is incompatible
 * with the folio. Splitting to order 0 is compatible with all folios.
 */
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
				     unsigned int new_order)
{}
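
/*
 * For reference, the common order-0 entry point is a thin wrapper (declared
 * in huge_mm.h); shown here as a sketch of how this function is typically
 * invoked.
 */
#if 0
static inline int split_huge_page(struct page *page)
{
	return split_huge_page_to_list_to_order(page, NULL, 0);
}
#endif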

void __folio_undo_large_rmappable(struct folio *folio)
{}

void deferred_split_folio(struct folio *folio)
{}

static unsigned long deferred_split_count(struct shrinker *shrink,
		struct shrink_control *sc)
{}

static unsigned long deferred_split_scan(struct shrinker *shrink,
		struct shrink_control *sc)
{}

#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{}

static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{}

static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
				unsigned long vaddr_end, unsigned int new_order)
{}

static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
				pgoff_t off_end, unsigned int new_order)
{}

#define MAX_INPUT_BUF_SZ

static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppops)
{}
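
/*
 * Debugfs usage note (assumed from the handlers above and the admin-guide):
 * writes to <debugfs>/split_huge_pages accept either
 *   "1"                                         - split all thps system-wide
 *   "<pid>,<vaddr_start>,<vaddr_end>[,<order>]" - split a process range
 *   "<path>,<off_start>,<off_end>[,<order>]"    - split a file range
 * with the optional trailing argument selecting the new folio order.
 */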

static const struct file_operations split_huge_pages_fops =;

static int __init split_huge_pages_debugfs(void)
{}
late_initcall(split_huge_pages_debugfs);
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
		struct page *page)
{}

void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{}
#endif