// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/compaction.h>
#include <linux/rmap.h>
#include <linux/module.h>

#include <asm/tlbflush.h>

#include "internal.h"
#include "shuffle.h"

enum {
	…
};

static int memmap_mode __read_mostly = …;

static inline unsigned long memory_block_memmap_size(void)
{
	…
}

static inline unsigned long memory_block_memmap_on_memory_pages(void)
{
	…
}

#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
/*
 * memory_hotplug.memmap_on_memory parameter
 */
static int set_memmap_mode(const char *val, const struct kernel_param *kp)
{
	…
}

static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
{
	…
}

static const struct kernel_param_ops memmap_mode_ops = …;
module_param_cb(…);
MODULE_PARM_DESC(…) …;

static inline bool mhp_memmap_on_memory(void)
{
	…
}
#else
static inline bool mhp_memmap_on_memory(void)
{
	return false;
}
#endif

enum {
	…
};

static const char * const online_policy_to_str[] = …;

static int set_online_policy(const char *val, const struct kernel_param *kp)
{
	…
}

static int get_online_policy(char *buffer, const struct kernel_param *kp)
{
	…
}

/*
 * memory_hotplug.online_policy: configure online behavior when onlining without
 * specifying a zone (MMOP_ONLINE)
 *
 * "contig-zones": keep zone contiguous
 * "auto-movable": online memory to ZONE_MOVABLE if the configuration
 *		   (auto_movable_ratio, auto_movable_numa_aware) allows for it
 */
static int online_policy __read_mostly = …;
static const struct kernel_param_ops online_policy_ops = …;
module_param_cb(…);
MODULE_PARM_DESC(…) …;

/*
 * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio
 *
 * The ratio represents an upper limit and the kernel might decide to not
 * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory
 * doesn't allow for more MOVABLE memory.
 */
static unsigned int auto_movable_ratio __read_mostly = …;
module_param(auto_movable_ratio, uint, 0644);
MODULE_PARM_DESC(…) …;

/*
 * memory_hotplug.auto_movable_numa_aware: consider numa node stats
 */
#ifdef CONFIG_NUMA
static bool auto_movable_numa_aware __read_mostly = …;
module_param(auto_movable_numa_aware, bool, 0644);
MODULE_PARM_DESC(…) …;
#endif /* CONFIG_NUMA */

/*
 * online_page_callback contains a pointer to the current page onlining
 * function. Initially it is generic_online_page(). If required, it can be
 * changed by calling set_online_page_callback() to register a callback and
 * restore_online_page_callback() to restore the generic callback.
 */
static online_page_callback_t online_page_callback = …;
static DEFINE_MUTEX(online_page_callback_lock);

DEFINE_STATIC_PERCPU_RWSEM(…);
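/*
 * A hedged usage sketch (not part of this file): a driver, such as a balloon
 * driver, can divert page onlining through its own callback and hand pages it
 * does not intercept to generic_online_page(). The example_* names are
 * hypothetical.
 */
static void example_online_page(struct page *page, unsigned int order)
{
	/* Inspect or defer pages here; pass everything else to the core. */
	generic_online_page(page, order);
}

static int example_install_callback(void)
{
	/* Fails if a non-generic callback is already registered. */
	return set_online_page_callback(&example_online_page);
}

static void example_remove_callback(void)
{
	/* Restore generic_online_page() when done. */
	restore_online_page_callback(&example_online_page);
}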
void get_online_mems(void)
{
	…
}

void put_online_mems(void)
{
	…
}

bool movable_node_enabled = …;

#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
int mhp_default_online_type = MMOP_OFFLINE;
#else
int mhp_default_online_type = …;
#endif

static int __init setup_memhp_default_state(char *str)
{
	…
}
__setup(…);

void mem_hotplug_begin(void)
{
	…
}

void mem_hotplug_done(void)
{
	…
}

u64 max_mem_size = …;

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size,
						 const char *resource_name)
{
	…
}

static void release_memory_resource(struct resource *res)
{
	…
}

static int check_pfn_span(unsigned long pfn, unsigned long nr_pages)
{
	…
}

/*
 * Return page for the valid pfn only if the page is online. All pfn
 * walkers which rely on the fully initialized page->flags and others
 * should use this rather than pfn_valid && pfn_to_page.
 * (A hedged walker sketch follows __remove_pages() below.)
 */
struct page *pfn_to_online_page(unsigned long pfn)
{
	…
}
EXPORT_SYMBOL_GPL(…);

int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
		      struct mhp_params *params)
{
	…
}

/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
					       unsigned long start_pfn,
					       unsigned long end_pfn)
{
	…
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
					      unsigned long start_pfn,
					      unsigned long end_pfn)
{
	…
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	…
}

static void update_pgdat_span(struct pglist_data *pgdat)
{
	…
}

void __ref remove_pfn_range_from_zone(struct zone *zone,
				      unsigned long start_pfn,
				      unsigned long nr_pages)
{
	…
}

/**
 * __remove_pages() - remove sections of pages
 * @pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 * @altmap: alternative device page map or %NULL if default memmap is used
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
		    struct vmem_altmap *altmap)
{
	…
}
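/*
 * Hedged sketch (not part of this file) of a pfn walker that follows the
 * pfn_to_online_page() rule documented above instead of open-coding
 * pfn_valid()/pfn_to_page(). example_count_online_pages() is a hypothetical
 * helper.
 */
static unsigned long example_count_online_pages(unsigned long start_pfn,
						unsigned long end_pfn)
{
	unsigned long pfn, nr_online = 0;

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		struct page *page = pfn_to_online_page(pfn);

		/* NULL means the pfn is not backed by an online memmap. */
		if (!page)
			continue;
		nr_online++;
	}

	return nr_online;
}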
int set_online_page_callback(online_page_callback_t callback)
{
	…
}
EXPORT_SYMBOL_GPL(…);

int restore_online_page_callback(online_page_callback_t callback)
{
	…
}
EXPORT_SYMBOL_GPL(…);

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
void __ref generic_online_page(struct page *page, unsigned int order)
{
	…
}
EXPORT_SYMBOL_GPL(…);

static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
{
	…
}

/* check which state of node_states will be changed when onlining memory */
static void node_states_check_changes_online(unsigned long nr_pages,
					     struct zone *zone,
					     struct memory_notify *arg)
{
	…
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	…
}

static void __meminit resize_zone_range(struct zone *zone,
					unsigned long start_pfn,
					unsigned long nr_pages)
{
	…
}

static void __meminit resize_pgdat_range(struct pglist_data *pgdat,
					 unsigned long start_pfn,
					 unsigned long nr_pages)
{
	…
}

#ifdef CONFIG_ZONE_DEVICE
static void section_taint_zone_device(unsigned long pfn)
{
	…
}
#else
static inline void section_taint_zone_device(unsigned long pfn)
{
}
#endif

/*
 * Associate the pfn range with the given zone, initializing the memmaps
 * and resizing the pgdat/zone data to span the added pages. After this
 * call, all affected pages are PageOffline().
 *
 * All aligned pageblocks are initialized to the specified migratetype
 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
 * zone stats (e.g., nr_isolate_pageblock) are touched.
 */
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
				  unsigned long nr_pages,
				  struct vmem_altmap *altmap, int migratetype)
{
	…
}

struct auto_movable_stats {
	…
};

static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
					    struct zone *zone)
{
	…
}

struct auto_movable_group_stats {
	…
};

static int auto_movable_stats_account_group(struct memory_group *group,
					    void *arg)
{
	…
}

static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
					    unsigned long nr_pages)
{
	…
}

/*
 * Returns a default kernel memory zone for the given pfn range.
 * If no kernel zone covers this pfn range it will automatically go
 * to ZONE_NORMAL.
 */
static struct zone *default_kernel_zone_for_pfn(int nid,
						unsigned long start_pfn,
						unsigned long nr_pages)
{
	…
}

/*
 * Determine to which zone to online memory dynamically based on user
 * configuration and system stats. We care about the following ratio:
 *
 *   MOVABLE : KERNEL
 *
 * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in
 * one of the kernel zones. CMA pages inside one of the kernel zones really
 * behave like ZONE_MOVABLE, so we treat them accordingly.
 *
 * We don't allow for hotplugged memory in a KERNEL zone to increase the
 * amount of MOVABLE memory we can have, so we end up with:
 *
 *   MOVABLE : KERNEL_EARLY
 *
 * Whereby KERNEL_EARLY is memory in one of the kernel zones, available since
 * boot. We base our calculation on KERNEL_EARLY internally, because:
 *
 * a) Hotplugged memory in one of the kernel zones can sometimes still get
 *    hotunplugged, especially when hot(un)plugging individual memory blocks.
 *    There is no coordination across memory devices, therefore "automatic"
 *    hotunplugging, as implemented in hypervisors, could result in zone
 *    imbalances.
 * b) Early/boot memory in one of the kernel zones can usually not get
 *    hotunplugged again (e.g., no firmware interface to unplug, fragmented
 *    with unmovable allocations). While there are corner cases where it might
 *    still work, it is barely relevant in practice.
 *
 * Exceptions are dynamic memory groups, which allow for more MOVABLE
 * memory within the same memory group -- because in that case, there is
 * coordination within the single memory device managed by a single driver.
 *
 * We rely on "present pages" instead of "managed pages", as the latter is
 * highly unreliable and dynamic in virtualized environments, and does not
 * consider boot time allocations. For example, memory ballooning adjusts the
 * managed pages when inflating/deflating the balloon, and balloon compaction
 * can even migrate inflated pages between zones.
 *
 * Using "present pages" is better but some things to keep in mind are:
 *
 * a) Some memblock allocations, such as for the crashkernel area, are
 *    effectively unused by the kernel, yet they account to "present pages".
 *    Fortunately, these allocations are comparatively small in relevant setups
 *    (e.g., fraction of system memory).
 * b) Some hotplugged memory blocks in virtualized environments, especially
 *    hotplugged by virtio-mem, look like they are completely present, however,
 *    only parts of the memory block are actually currently usable.
 *    "present pages" is an upper limit that can get reached at runtime. As
 *    we base our calculations on KERNEL_EARLY, this is not an issue.
 */
static struct zone *auto_movable_zone_for_pfn(int nid,
					      struct memory_group *group,
					      unsigned long pfn,
					      unsigned long nr_pages)
{
	…
}
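/*
 * Hedged illustration (not the kernel's implementation) of the
 * MOVABLE : KERNEL_EARLY ratio check described above, assuming the ratio is
 * expressed in percent: with a ratio of 300, 4 GiB of early KERNEL memory
 * would permit up to 12 GiB of ZONE_MOVABLE memory.
 */
static bool example_ratio_allows_movable(unsigned long movable_pages,
					 unsigned long kernel_early_pages,
					 unsigned int ratio_in_percent)
{
	/* e.g., ratio_in_percent == 300 allows MOVABLE:KERNEL_EARLY of 3:1 */
	return movable_pages <= (kernel_early_pages * ratio_in_percent) / 100;
}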
static inline struct zone *default_zone_for_pfn(int nid,
						unsigned long start_pfn,
						unsigned long nr_pages)
{
	…
}

struct zone *zone_for_pfn_range(int online_type, int nid,
				struct memory_group *group,
				unsigned long start_pfn,
				unsigned long nr_pages)
{
	…
}

/*
 * This function should only be called by memory_block_{online,offline},
 * and {online,offline}_pages.
 */
void adjust_present_page_count(struct page *page, struct memory_group *group,
			       long nr_pages)
{
	…
}

int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
			      struct zone *zone, bool mhp_off_inaccessible)
{
	…
}

void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
{
	…
}

/*
 * Must be called with mem_hotplug_lock in write mode.
 */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
		       struct zone *zone, struct memory_group *group)
{
	…
}
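/*
 * Hedged sketch of the locking protocol noted above: callers (such as the
 * memory block online path) must hold mem_hotplug_lock in write mode around
 * online_pages(). example_online_block() is a hypothetical wrapper.
 */
static int example_online_block(unsigned long pfn, unsigned long nr_pages,
				struct zone *zone, struct memory_group *group)
{
	int ret;

	mem_hotplug_begin();	/* takes mem_hotplug_lock for writing */
	ret = online_pages(pfn, nr_pages, zone, group);
	mem_hotplug_done();

	return ret;
}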
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_init_pgdat(int nid)
{
	…
}

/*
 * __try_online_node - online a node if offlined
 * @nid: the node ID
 * @set_node_online: Whether we want to online the node
 *
 * Called by cpu_up() to online a node without onlined memory.
 *
 * Returns:
 * 1 -> a new node has been allocated
 * 0 -> the node is already online
 * -ENOMEM -> the node could not be allocated
 */
static int __try_online_node(int nid, bool set_node_online)
{
	…
}

/*
 * Users of this function always want to online/register the node.
 */
int try_online_node(int nid)
{
	…
}

static int check_hotplug_memory_range(u64 start, u64 size)
{
	…
}

static int online_memory_block(struct memory_block *mem, void *arg)
{
	…
}

#ifndef arch_supports_memmap_on_memory
static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
{
	…
}
#endif

bool mhp_supports_memmap_on_memory(void)
{
	…
}
EXPORT_SYMBOL_GPL(…);

static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
{
	…
}

static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
					    u64 start, u64 size, mhp_t mhp_flags)
{
	…
}

/*
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations (triggered e.g. by sysfs).
 *
 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
 */
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
	…
}

/* requires device_hotplug_lock, see add_memory_resource() */
int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
	…
}

int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
	…
}
EXPORT_SYMBOL_GPL(…);

/*
 * Add special, driver-managed memory to the system as system RAM. Such
 * memory is not exposed via the raw firmware-provided memmap as system
 * RAM, instead, it is detected and added by a driver - during cold boot,
 * after a reboot, and after kexec.
 *
 * Reasons why this memory should not be used for the initial memmap of a
 * kexec kernel or for placing kexec images:
 * - The booting kernel is in charge of determining how this memory will be
 *   used (e.g., use persistent memory as system RAM)
 * - Coordination with a hypervisor is required before this memory
 *   can be used (e.g., inaccessible parts).
 *
 * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
 * memory map") are created. Also, the created memory resource is flagged
 * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
 * this memory as well (esp., not place kexec images onto it).
 *
 * The resource_name (visible via /proc/iomem) has to have the format
 * "System RAM ($DRIVER)".
 */
int add_memory_driver_managed(int nid, u64 start, u64 size,
			      const char *resource_name, mhp_t mhp_flags)
{
	…
}
EXPORT_SYMBOL_GPL(…);
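/*
 * Hedged sketch (not part of this file) of a driver adding detected memory
 * via add_memory_driver_managed() per the rules above; "example_driver" and
 * the wrapper are hypothetical, and MHP_NONE requests no special behavior.
 */
static int example_add_driver_memory(int nid, u64 start, u64 size)
{
	/* The resource name must have the form "System RAM ($DRIVER)". */
	return add_memory_driver_managed(nid, start, size,
					 "System RAM (example_driver)",
					 MHP_NONE);
}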
/*
 * Platforms should define arch_get_mappable_range() that provides
 * maximum possible addressable physical memory range for which the
 * linear mapping could be created. The platform returned address
 * range must adhere to the following semantics.
 *
 * - range.start <= range.end
 * - Range includes both end points [range.start..range.end]
 *
 * There is also a fallback definition provided here, allowing the
 * entire possible physical address range in case any platform does
 * not define arch_get_mappable_range().
 */
struct range __weak arch_get_mappable_range(void)
{
	…
}

struct range mhp_get_pluggable_range(bool need_mapping)
{
	…
}
EXPORT_SYMBOL_GPL(…);

bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
{
	…
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
 * non-lru movable pages and hugepages). Will skip over most unmovable
 * pages (esp., pages that can be skipped when offlining), but bail out on
 * definitely unmovable pages.
 *
 * Returns:
 *	0 in case a movable page is found and movable_pfn was updated.
 *	-ENOENT in case no movable page was found.
 *	-EBUSY in case a definitely unmovable page was found.
 */
static int scan_movable_pages(unsigned long start, unsigned long end,
			      unsigned long *movable_pfn)
{
	…
}

static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	…
}

static int __init cmdline_parse_movable_node(char *p)
{
	…
}
early_param(…);

/* check which state of node_states will be changed when offlining memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
					      struct zone *zone,
					      struct memory_notify *arg)
{
	…
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	…
}

static int count_system_ram_pages_cb(unsigned long start_pfn,
				     unsigned long nr_pages, void *data)
{
	…
}

/*
 * Must be called with mem_hotplug_lock in write mode.
 */
int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
			struct zone *zone, struct memory_group *group)
{
	…
}

static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	…
}

static int count_memory_range_altmaps_cb(struct memory_block *mem, void *arg)
{
	…
}

static int check_cpu_on_node(int nid)
{
	…
}

static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
{
	…
}

/**
 * try_offline_node
 * @nid: the node ID
 *
 * Offline a node if all memory sections and cpus of the node are removed.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call.
 */
void try_offline_node(int nid)
{
	…
}
EXPORT_SYMBOL(…);

static int memory_blocks_have_altmaps(u64 start, u64 size)
{
	…
}

static int __ref try_remove_memory(u64 start, u64 size)
{
	…
}

/**
 * __remove_memory - Remove memory if every memory block is offline
 * @start: physical address of the region to remove
 * @size: size of the region to remove
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call, as required by
 * try_offline_node().
 */
void __remove_memory(u64 start, u64 size)
{
	…
}

/*
 * Remove memory if every memory block is offline, otherwise return -EBUSY
 * if some memory is not offline.
 */
int remove_memory(u64 start, u64 size)
{
	…
}
EXPORT_SYMBOL_GPL(…);

static int try_offline_memory_block(struct memory_block *mem, void *arg)
{
	…
}

static int try_reonline_memory_block(struct memory_block *mem, void *arg)
{
	…
}

/*
 * Try to offline and remove memory. Might take a long time to finish in case
 * memory is still in use. Primarily useful for memory devices that logically
 * unplugged all memory (so it's no longer in use) and want to offline + remove
 * that memory.
 */
int offline_and_remove_memory(u64 start, u64 size)
{
	…
}
EXPORT_SYMBOL_GPL(…);
#endif /* CONFIG_MEMORY_HOTREMOVE */
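#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Hedged sketch of how a memory device driver might unplug one of its ranges
 * via offline_and_remove_memory(), as described above;
 * example_unplug_range() is a hypothetical wrapper.
 */
static int example_unplug_range(u64 start, u64 size)
{
	/* May block for a long time and fail with -EBUSY if still in use. */
	return offline_and_remove_memory(start, size);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */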