migrate.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0
/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <[email protected]>
 * Hirokazu Takahashi <[email protected]>
 * Dave Hansen <[email protected]>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/compaction.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>
#include <linux/memory.h>
#include <linux/random.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>

#include <asm/tlbflush.h>

#include <trace/events/migrate.h>

#include "internal.h"

bool isolate_movable_page(struct page *page, isolate_mode_t mode)
{ … }

static void putback_movable_folio(struct folio *folio)
{ … }

/*
 * Put previously isolated pages back onto the appropriate lists
 * from where they were once taken off for compaction/migration.
 *
 * This function shall be used whenever the isolated pageset has been
 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
 * and isolate_hugetlb().
 */
void putback_movable_pages(struct list_head *l)
{ … }

/*
 * Restore a potential migration pte to a working pte entry
 */
static bool remove_migration_pte(struct folio *folio,
		struct vm_area_struct *vma, unsigned long addr, void *old)
{ … }

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
{ … }

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
			  unsigned long address)
{ … }

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The vma read lock must be held upon entry. Holding that lock prevents either
 * the pte or the ptl from being freed.
 *
 * This function will release the vma lock before returning.
 */
void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{ … }
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{ … }
#endif

static int folio_expected_refs(struct address_space *mapping,
		struct folio *folio)
{ … }

/*
 * Replace the folio in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous folios without a mapping
 * 2 for folios with a mapping
 * 3 for folios with a mapping and PagePrivate/PagePrivate2 set.
 */
static int __folio_migrate_mapping(struct address_space *mapping,
		struct folio *newfolio, struct folio *folio, int expected_count)
{ … }

int folio_migrate_mapping(struct address_space *mapping,
		struct folio *newfolio, struct folio *folio, int extra_count)
{ … }
EXPORT_SYMBOL(…);

/*
 * The expected number of remaining references is the same as that
 * of folio_migrate_mapping().
 */
int migrate_huge_page_move_mapping(struct address_space *mapping,
				   struct folio *dst, struct folio *src)
{ … }

/*
 * Copy the flags and some other ancillary information
 */
void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
{ … }
EXPORT_SYMBOL(…);

/************************************************************
 *                    Migration functions
 ***********************************************************/

static int __migrate_folio(struct address_space *mapping, struct folio *dst,
			   struct folio *src, void *src_private,
			   enum migrate_mode mode)
{ … }

/**
 * migrate_folio() - Simple folio migration.
 * @mapping: The address_space containing the folio.
 * @dst: The folio to migrate the data to.
 * @src: The folio containing the current data.
 * @mode: How to migrate the page.
 *
 * Common logic to directly migrate a single LRU folio suitable for
 * folios that do not use PagePrivate/PagePrivate2.
 *
 * Folios are locked upon entry and exit.
 */
int migrate_folio(struct address_space *mapping, struct folio *dst,
		  struct folio *src, enum migrate_mode mode)
{ … }
EXPORT_SYMBOL(…);

#ifdef CONFIG_BUFFER_HEAD
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
							enum migrate_mode mode)
{ … }

static int __buffer_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode,
		bool check_refs)
{ … }

/**
 * buffer_migrate_folio() - Migration function for folios with buffers.
 * @mapping: The address space containing @src.
 * @dst: The folio to migrate to.
 * @src: The folio to migrate from.
 * @mode: How to migrate the folio.
 *
 * This function can only be used if the underlying filesystem guarantees
 * that no other references to @src exist. For example attached buffer
 * heads are accessed only under the folio lock.  If your filesystem cannot
 * provide this guarantee, buffer_migrate_folio_norefs() may be more
 * appropriate.
 *
 * Return: 0 on success or a negative errno on failure.
 */
int buffer_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{ … }
EXPORT_SYMBOL(…);

/**
 * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
 * @mapping: The address space containing @src.
 * @dst: The folio to migrate to.
 * @src: The folio to migrate from.
 * @mode: How to migrate the folio.
 *
 * Like buffer_migrate_folio() except that this variant is more careful
 * and checks that there are also no buffer head references. This function
 * is the right one for mappings where buffer heads are directly looked
 * up and referenced (such as block device mappings).
 *
 * Return: 0 on success or a negative errno on failure.
 */
int buffer_migrate_folio_norefs(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{ … }
EXPORT_SYMBOL_GPL(…);
#endif /* CONFIG_BUFFER_HEAD */

int filemap_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{ … }
EXPORT_SYMBOL_GPL(…);

/*
 * Writeback a folio to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct folio *folio)
{ … }

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{ … }

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_folio(struct folio *dst, struct folio *src,
				enum migrate_mode mode)
{ … }

/*
 * To record some information during migration, we use unused private
 * field of struct folio of the newly allocated destination folio.
 * This is safe because nobody is using it except us.
 */
enum { … };

static void __migrate_folio_record(struct folio *dst,
				   int old_page_state,
				   struct anon_vma *anon_vma)
{ … }

static void __migrate_folio_extract(struct folio *dst,
				   int *old_page_state,
				   struct anon_vma **anon_vmap)
{ … }

/* Restore the source folio to the original state upon failure */
static void migrate_folio_undo_src(struct folio *src,
				   int page_was_mapped,
				   struct anon_vma *anon_vma,
				   bool locked,
				   struct list_head *ret)
{ … }

/* Restore the destination folio to the original state upon failure */
static void migrate_folio_undo_dst(struct folio *dst, bool locked,
		free_folio_t put_new_folio, unsigned long private)
{ … }

/* Cleanup src folio upon migration success */
static void migrate_folio_done(struct folio *src,
			       enum migrate_reason reason)
{ … }

/* Obtain the lock on page, remove all ptes. */
static int migrate_folio_unmap(new_folio_t get_new_folio,
		free_folio_t put_new_folio, unsigned long private,
		struct folio *src, struct folio **dstp, enum migrate_mode mode,
		enum migrate_reason reason, struct list_head *ret)
{ … }

/* Migrate the folio to the newly allocated folio in dst. */
static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
			      struct folio *src, struct folio *dst,
			      enum migrate_mode mode, enum migrate_reason reason,
			      struct list_head *ret)
{ … }

/*
 * Counterpart of unmap_and_move_page() for hugepage migration.
 *
 * This function doesn't wait the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepage.
 * Note that currently hugepage I/O occurs only in direct I/O
 * where no lock is held and PG_writeback is irrelevant,
 * and writeback status of all subpages are counted in the reference
 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 * under direct I/O, the reference of the head page is 512 and a bit more.)
 * This means that when we try to migrate hugepage whose subpages are
 * doing direct I/O, some references remain after try_to_unmap() and
 * hugepage migration fails without data corruption.
 *
 * There is also no race when direct I/O is issued on the page under migration,
 * because then pte is replaced with migration swap entry and direct I/O code
 * will wait in the page fault for migration to complete.
 */
static int unmap_and_move_huge_page(new_folio_t get_new_folio,
		free_folio_t put_new_folio, unsigned long private,
		struct folio *src, int force, enum migrate_mode mode,
		int reason, struct list_head *ret)
{ … }

static inline int try_split_folio(struct folio *folio, struct list_head *split_folios)
{ … }

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_MAX_BATCHED_MIGRATION …
#else
#define NR_MAX_BATCHED_MIGRATION …
#endif
#define NR_MAX_MIGRATE_PAGES_RETRY …
#define NR_MAX_MIGRATE_ASYNC_RETRY …
#define NR_MAX_MIGRATE_SYNC_RETRY …

struct migrate_pages_stats { … };

/*
 * Returns the number of hugetlb folios that were not migrated, or an error code
 * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable
 * any more because the list has become empty or no retryable hugetlb folios
 * exist any more. It is caller's responsibility to call putback_movable_pages()
 * only if ret != 0.
 */
static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
			    free_folio_t put_new_folio, unsigned long private,
			    enum migrate_mode mode, int reason,
			    struct migrate_pages_stats *stats,
			    struct list_head *ret_folios)
{ … }

/*
 * migrate_pages_batch() first unmaps folios in the from list as many as
 * possible, then move the unmapped folios.
 *
 * We only batch migration if mode == MIGRATE_ASYNC to avoid to wait a
 * lock or bit when we have locked more than one folio.  Which may cause
 * deadlock (e.g., for loop device).  So, if mode != MIGRATE_ASYNC, the
 * length of the from list must be <= 1.
 */
static int migrate_pages_batch(struct list_head *from,
		new_folio_t get_new_folio, free_folio_t put_new_folio,
		unsigned long private, enum migrate_mode mode, int reason,
		struct list_head *ret_folios, struct list_head *split_folios,
		struct migrate_pages_stats *stats, int nr_pass)
{ … }

static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
		free_folio_t put_new_folio, unsigned long private,
		enum migrate_mode mode, int reason,
		struct list_head *ret_folios, struct list_head *split_folios,
		struct migrate_pages_stats *stats)
{ … }

/*
 * migrate_pages - migrate the folios specified in a list, to the free folios
 *		   supplied as the target for the page migration
 *
 * @from:		The list of folios to be migrated.
 * @get_new_folio:	The function used to allocate free folios to be used
 *			as the target of the folio migration.
 * @put_new_folio:	The function used to free target folios if migration
 *			fails, or NULL if no special handling is necessary.
 * @private:		Private data to be passed on to get_new_folio()
 * @mode:		The migration mode that specifies the constraints for
 *			folio migration, if any.
 * @reason:		The reason for folio migration.
 * @ret_succeeded:	Set to the number of folios migrated successfully if
 *			the caller passes a non-NULL pointer.
 *
 * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios
 * are movable any more because the list has become empty or no retryable folios
 * exist any more. It is caller's responsibility to call putback_movable_pages()
 * only if ret != 0.
 *
 * Returns the number of {normal folio, large folio, hugetlb} that were not
 * migrated, or an error code. The number of large folio splits will be
 * considered as the number of non-migrated large folio, no matter how many
 * split folios of the large folio are migrated successfully.
 */
int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
		free_folio_t put_new_folio, unsigned long private,
		enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
{ … }

struct folio *alloc_migration_target(struct folio *src, unsigned long private)
{ … }

#ifdef CONFIG_NUMA

static int store_status(int __user *status, int start, int value, int nr)
{ … }

static int do_move_pages_to_node(struct list_head *pagelist, int node)
{ … }

/*
 * Resolves the given address to a struct page, isolates it from the LRU and
 * puts it to the given pagelist.
 * Returns:
 *     errno - if the page cannot be found/isolated
 *     0 - when it doesn't have to be migrated because it is already on the
 *         target node
 *     1 - when it has been queued
 */
static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
		int node, struct list_head *pagelist, bool migrate_all)
{ … }

static int move_pages_and_store_status(int node,
		struct list_head *pagelist, int __user *status,
		int start, int i, unsigned long nr_pages)
{ … }

/*
 * Migrate an array of page address onto an array of nodes and fill
 * the corresponding array of status.
 */
static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{ … }

/*
 * Determine the nodes of an array of pages and store it in an array of status.
 */
static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				const void __user **pages, int *status)
{ … }

static int get_compat_pages_array(const void __user *chunk_pages[],
				  const void __user * __user *pages,
				  unsigned long chunk_nr)
{ … }

/*
 * Determine the nodes of a user array of pages and store it in
 * a user array of status.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{ … }

static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
{ … }

/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
			     const void __user * __user *pages,
			     const int __user *nodes,
			     int __user *status, int flags)
{ … }

SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
		const void __user * __user *, pages,
		const int __user *, nodes,
		int __user *, status, int, flags)
{ … }

#ifdef CONFIG_NUMA_BALANCING
/*
 * Returns true if this is a safe migration target node for misplaced NUMA
 * pages. Currently it only checks the watermarks which is crude.
 */
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
				   unsigned long nr_migrate_pages)
{ … }

static struct folio *alloc_misplaced_dst_folio(struct folio *src,
					   unsigned long data)
{ … }

/*
 * Prepare for calling migrate_misplaced_folio() by isolating the folio if
 * permitted. Must be called with the PTL still held.
 */
int migrate_misplaced_folio_prepare(struct folio *folio,
		struct vm_area_struct *vma, int node)
{ … }

/*
 * Attempt to migrate a misplaced folio to the specified destination
 * node. Caller is expected to have isolated the folio by calling
 * migrate_misplaced_folio_prepare(), which will result in an
 * elevated reference count on the folio. This function will un-isolate the
 * folio, dereferencing the folio before returning.
 */
int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
			    int node)
{ … }
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_NUMA */