// SPDX-License-Identifier: GPL-2.0-only /* * mm/mmap.c * * Written by obz. * * Address space accounting code <[email protected]> */ #define pr_fmt(fmt) … #include <linux/kernel.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/capability.h> #include <linux/init.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/profile.h> #include <linux/export.h> #include <linux/mount.h> #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> #include <linux/mmdebug.h> #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> #include <linux/uprobes.h> #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> #include <linux/userfaultfd_k.h> #include <linux/moduleparam.h> #include <linux/pkeys.h> #include <linux/oom.h> #include <linux/sched/mm.h> #include <linux/ksm.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlb.h> #include <asm/mmu_context.h> #define CREATE_TRACE_POINTS #include <trace/events/mmap.h> #include "internal.h" #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) … #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = …; int mmap_rnd_bits_max __ro_after_init = …; int mmap_rnd_bits __read_mostly = …; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS const int mmap_rnd_compat_bits_min = …; const int mmap_rnd_compat_bits_max = …; int mmap_rnd_compat_bits __read_mostly = …; #endif static bool ignore_rlimit_data; core_param(…); static void unmap_region(struct mm_struct *mm, struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, unsigned long start, unsigned long end, unsigned long tree_end, bool mm_wr_locked); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { … } /* Update vma->vm_page_prot to reflect vma->vm_flags. */ void vma_set_page_prot(struct vm_area_struct *vma) { … } /* * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct address_space *mapping) { … } /* * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) { … } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) { … } static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) { … } void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma) { … } void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) { … } /* * Close a vm structure and free it. */ static void remove_vma(struct vm_area_struct *vma, bool unreachable) { … } static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, unsigned long min) { … } /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. * @addr: The address to check * @len: The size of increase. * * Return: 0 on success. */ static int check_brk_limits(unsigned long addr, unsigned long len) { … } static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { … } #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) static void validate_mm(struct mm_struct *mm) { … } #else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ #define validate_mm … #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. * * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the * vma must be removed from the anon_vma's interval trees using * anon_vma_interval_tree_pre_update_vma(). * * After the update, the vma will be reinserted using * anon_vma_interval_tree_post_update_vma(). * * The entire update must be protected by exclusive mmap_lock and by * the root anon_vma's mutex. */ static inline void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) { … } static inline void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) { … } static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { … } static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { … } static void vma_link_file(struct vm_area_struct *vma) { … } static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { … } /* * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked * @next: The next vma if it is to be adjusted * @remove: The first vma to be removed * @remove2: The second vma to be removed */ static inline void init_multi_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma, struct vm_area_struct *next, struct vm_area_struct *remove, struct vm_area_struct *remove2) { … } /* * init_vma_prep() - Initializer wrapper for vma_prepare struct * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked */ static inline void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) { … } /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct */ static inline void vma_prepare(struct vma_prepare *vp) { … } /* * vma_complete- Helper function for handling the unlocking after altering VMAs, * or for inserting a VMA. * * @vp: The vma_prepare struct * @vmi: The vma iterator * @mm: The mm_struct */ static inline void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, struct mm_struct *mm) { … } /* * dup_anon_vma() - Helper function to duplicate anon_vma * @dst: The destination VMA * @src: The source VMA * @dup: Pointer to the destination VMA when successful. * * Returns: 0 on success. */ static inline int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src, struct vm_area_struct **dup) { … } /* * vma_expand - Expand an existing VMA * * @vmi: The vma iterator * @vma: The vma to expand * @start: The start of the vma * @end: The exclusive end of the vma * @pgoff: The page offset of vma * @next: The current of next vma. * * Expand @vma to @start and @end. Can expand off the start and end. Will * expand over @next if it's different from @vma and @end == @next->vm_end. * Checking if the @vma can expand and merge with @next needs to be handled by * the caller. * * Returns: 0 on success */ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next) { … } /* * vma_shrink() - Reduce an existing VMAs memory area * @vmi: The vma iterator * @vma: The VMA to modify * @start: The new start * @end: The new end * * Returns: 0 on success, -ENOMEM otherwise */ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { … } /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those if the caller indicates * the current vma may be removed as part of the merge. */ static inline bool is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name, bool may_remove_vma) { … } static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, struct anon_vma *anon_vma2, struct vm_area_struct *vma) { … } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We don't check here for the merged mmap wrapping around the end of pagecache * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. * * We assume the vma may be removed as part of the merge. */ static bool can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { … } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We assume that vma is not removed as part of the merge. */ static bool can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { … } /* * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), * figure out whether that can be merged with its predecessor or its * successor. Or both (it neatly fills a hole). * * In most cases - when called for mmap, brk or mremap - [addr,end) is * certain not to be mapped by the time vma_merge is called; but when * called for mprotect, it is certain to be already mapped (either at * an offset within prev, or at the start of next), and the flags of * this area are about to be changed to vm_flags - and the no-change * case has already been eliminated. * * The following mprotect cases have to be considered, where **** is * the area passed down from mprotect_fixup, never extending beyond one * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts * at the same address as **** and is of the same or larger span, and * NNNN the next vma after ****: * * **** **** **** * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC * cannot merge might become might become * PPNNNNNNNNNN PPPPPPPPPPCC * mmap, brk or case 4 below case 5 below * mremap move: * **** **** * PPPP NNNN PPPPCCCCNNNN * might become might become * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 * * It is important for case 8 that the vma CCCC overlapping the * region **** is never going to extended over NNNN. Instead NNNN must * be extended in region **** and CCCC must be removed. This way in * all cases where vma_merge succeeds, the moment vma_merge drops the * rmap_locks, the properties of the merged vma will be already * correct for the whole merged range. Some of those properties like * vm_page_prot/vm_flags may be accessed by rmap_walks and they must * be correct for the whole merged range immediately after the * rmap_locks are released. Otherwise if NNNN would be removed and * CCCC would be extended over the NNNN range, remove_migration_ptes * or other rmap walkers (if working on addresses beyond the "end" * parameter) may establish ptes with the wrong permissions of CCCC * instead of the right permissions of NNNN. * * In the code below: * PPPP is represented by *prev * CCCC is represented by *curr or not represented at all (NULL) * NNNN is represented by *next or not represented at all (NULL) * **** is not represented - it will be merged and the vma containing the * area is returned, or the function will return NULL */ static struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *src, unsigned long addr, unsigned long end, unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { … } /* * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ * in things that mprotect may change. * * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that * we can merge the two vma's. For example, we refuse to merge a vma if * there is a vm_ops->close() function, because that indicates that the * driver is doing some kind of reference counting. But that doesn't * really matter for the anon_vma sharing case. */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { … } /* * Do some basic sanity checking to see if we can re-use the anon_vma * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be * the same as 'old', the other will be the new one that is trying * to share the anon_vma. * * NOTE! This runs with mmap_lock held for reading, so it is possible that * the anon_vma of 'old' is concurrently in the process of being set up * by another page fault trying to merge _that_. But that's ok: if it * is being set up, that automatically means that it will be a singleton * acceptable for merging, so we can do all of this optimistically. But * we do that READ_ONCE() to make sure that we never re-load the pointer. * * IOW: that the "list_is_singular()" test on the anon_vma_chain only * matters for the 'stable anon_vma' case (ie the thing we want to avoid * is to return an anon_vma that is "complex" due to having gone through * a fork). * * We also make sure that the two vma's are compatible (adjacent, * and with the same memory policies). That's all stable, even with just * a read lock on the mmap_lock. */ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) { … } /* * find_mergeable_anon_vma is used by anon_vma_prepare, to check * neighbouring vmas for a suitable anon_vma, before it goes off * to allocate a new anon_vma. It checks because a repetitive * sequence of mprotects and faults may otherwise lead to distinct * anon_vmas being allocated, preventing vma merge in subsequent * mprotect. */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { … } /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr */ static inline unsigned long round_hint_to_min(unsigned long hint) { … } bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes) { … } static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) { … } static inline bool file_mmap_ok(struct file *file, struct inode *inode, unsigned long pgoff, unsigned long len) { … } /* * The caller must write-lock current->mm->mmap_lock. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) { … } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { … } SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { … } #ifdef __ARCH_WANT_SYS_OLD_MMAP struct mmap_arg_struct { unsigned long addr; unsigned long len; unsigned long prot; unsigned long flags; unsigned long fd; unsigned long offset; }; SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) { struct mmap_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; if (offset_in_page(a.offset)) return -EINVAL; return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) { … } static bool vma_is_shared_writable(struct vm_area_struct *vma) { … } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { … } /* * Does this VMA require the underlying folios to have their dirty state * tracked? */ bool vma_needs_dirty_tracking(struct vm_area_struct *vma) { … } /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). */ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { … } /* * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ static inline bool accountable_mapping(struct file *file, vm_flags_t vm_flags) { … } /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used * for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { … } /** * unmapped_area_topdown() - Find an area between the low_limit and the * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { … } /* * Search for an unmapped address range. * * We are looking for a range that: * - does not intersect with any VMA; * - is contained within the [low_limit, high_limit) interval; * - is at least the desired size. * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) */ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) { … } /* Get an address range which is currently unmapped. * For shmat() with addr=0. * * Ugly calling convention alert: * Return value with the low bits set means error value, * ie * if (ret & ~PAGE_MASK) * error = ret; * * This function "knows" that -ENOMEM has the bits set. */ unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { … } #ifndef HAVE_ARCH_UNMAPPED_AREA unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return generic_get_unmapped_area(filp, addr, len, pgoff, flags); } #endif /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): */ unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { … } #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); } #endif #ifndef HAVE_ARCH_UNMAPPED_AREA_VMFLAGS unsigned long arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return arch_get_unmapped_area(filp, addr, len, pgoff, flags); } unsigned long arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); } #endif unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { … } unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { … } unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { … } EXPORT_SYMBOL(…); /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. * @start_addr: The inclusive start user address. * @end_addr: The exclusive end user address. * * Returns: The first VMA within the provided range, %NULL otherwise. Assumes * start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr) { … } EXPORT_SYMBOL(…); /** * find_vma() - Find the VMA for a given address, or the next VMA. * @mm: The mm_struct to check * @addr: The address * * Returns: The VMA associated with addr, or the next VMA. * May return %NULL in the case of no VMA at addr or above. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { … } EXPORT_SYMBOL(…); /** * find_vma_prev() - Find the VMA for a given address, or the next vma and * set %pprev to the previous VMA, if any. * @mm: The mm_struct to check * @addr: The address * @pprev: The pointer to set to the previous VMA * * Note that RCU lock is missing here since the external mmap_lock() is used * instead. * * Returns: The VMA associated with @addr, or the next vma. * May return %NULL in the case of no vma at addr or above. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { … } /* * Verify that the stack growth is acceptable and * update accounting. This is shared with both the * grow-up and grow-down cases. */ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { … } #if defined(CONFIG_STACK_GROWSUP) /* * PA-RISC uses this for its stack. * vma is the last one with address > vma->vm_end. Have to extend vma. */ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; unsigned long gap_addr; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; /* Guard against exceeding limits of the address space. */ address &= PAGE_MASK; if (address >= (TASK_SIZE & PAGE_MASK)) return -ENOMEM; address += PAGE_SIZE; /* Enforce stack_guard_gap */ gap_addr = address + stack_guard_gap; /* Guard against overflow */ if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } if (next) vma_iter_prev_range_limit(&vmi, address); vma_iter_config(&vmi, vma->vm_start, address); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_lock in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { unsigned long size, grow; size = address - vma->vm_start; grow = (address - vma->vm_end) >> PAGE_SHIFT; error = -ENOMEM; if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { /* * We only hold a shared mmap_lock lock here, so * we need to protect against concurrent vma * expansions. anon_vma_lock_write() doesn't * help here, as we don't guarantee that all * growable vmas in a mm share the same root * anon vma. So, we reuse mm->page_table_lock * to guard against concurrent vma expansions. */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ vma_iter_store(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP */ /* * vma is the first one with address < vma->vm_start. Have to extend vma. * mmap_lock held for writing. */ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { … } /* enforced gap between the expanding stack and other mappings. */ unsigned long stack_guard_gap = …; static int __init cmdline_parse_stack_guard_gap(char *p) { … } __setup(…); #ifdef CONFIG_STACK_GROWSUP int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_upwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; addr &= PAGE_MASK; vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; if (!prev) return NULL; if (expand_stack_locked(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { … } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { … } #endif #if defined(CONFIG_STACK_GROWSUP) #define vma_expand_up … #define vma_expand_down … #else #define vma_expand_up(vma,addr) … #define vma_expand_down(vma, addr) … #endif /* * expand_stack(): legacy interface for page faulting. Don't use unless * you have to. * * This is called with the mm locked for reading, drops the lock, takes * the lock for writing, tries to look up a vma again, expands it if * necessary, and downgrades the lock to reading again. * * If no vma is found or it can't be expanded, it returns NULL and has * dropped the lock. */ struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) { … } /* * Ok - we have the memory areas we should free on a maple tree so release them, * and do the vma updates. * * Called with the mm semaphore held. */ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) { … } /* * Get rid of page table information in the indicated region. * * Called with the mm semaphore held. */ static void unmap_region(struct mm_struct *mm, struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, unsigned long start, unsigned long end, unsigned long tree_end, bool mm_wr_locked) { … } /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the end VMA. */ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { … } /* * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { … } /* * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd * context and anonymous VMA name within the range [start, end). * * As a result, we might be able to merge the newly modified VMA range with an * adjacent VMA with identical properties. * * If no merge is possible and the range does not span the entirety of the VMA, * we then need to split the VMA to accommodate the change. * * The function returns either the merged VMA, the original VMA if a split was * required instead, or an error if the split failed. */ struct vm_area_struct *vma_modify(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long vm_flags, struct mempolicy *policy, struct vm_userfaultfd_ctx uffd_ctx, struct anon_vma_name *anon_name) { … } /* * Attempt to merge a newly mapped VMA with those adjacent to it. The caller * must ensure that [start, end) does not overlap any existing VMA. */ static struct vm_area_struct *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { … } /* * Expand vma by delta bytes, potentially merging with an immediately adjacent * VMA with identical properties. */ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta) { … } /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator * @vma: The starting vm_area_struct * @mm: The mm_struct * @start: The aligned start address to munmap. * @end: The aligned end address to munmap. * @uf: The userfaultfd list_head * @unlock: Set to true to drop the mmap_lock. unlocking only happens on * success. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ static int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { … } /* * do_vmi_munmap() - munmap a given range. * @vmi: The vma iterator * @mm: The mm_struct * @start: The start address to munmap * @len: The length of the range to munmap * @uf: The userfaultfd list_head * @unlock: set to true if the user wants to drop the mmap_lock on success * * This function takes a @mas that is either pointing to the previous VMA or set * to MA_START and sets it up to remove the mapping(s). The @len will be * aligned and any arch_unmap work will be preformed. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock) { … } /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. * @mm: The mm_struct * @start: The start address to munmap * @len: The length to be munmapped. * @uf: The userfaultfd list_head * * Return: 0 on success, error otherwise. */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { … } unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { … } static int __vm_munmap(unsigned long start, size_t len, bool unlock) { … } int vm_munmap(unsigned long start, size_t len) { … } EXPORT_SYMBOL(…); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { … } /* * Emulation of deprecated remap_file_pages() syscall. */ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long, prot, unsigned long, pgoff, unsigned long, flags) { … } /* * do_vma_munmap() - Unmap a full or partial vma. * @vmi: The vma iterator pointing at the vma * @vma: The first vma to be munmapped * @start: the start of the address to unmap * @end: The end of the address to unmap * @uf: The userfaultfd list_head * @unlock: Drop the lock on success * * unmaps a VMA mapping when the vma iterator is already in position. * Does not handle alignment. * * Return: 0 on success drops the lock of so directed, error on failure and will * still hold the lock. */ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { … } /* * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, * @flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. */ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { … } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { … } EXPORT_SYMBOL(…); /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { … } /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_rwsem is taken here. */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { … } /* * Copy the vma structure to a new location in the same mm, * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks) { … } /* * Return true if the calling process may expand its vm space by the passed * number of pages */ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) { … } void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { … } static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct vm_area_struct *vma) { … } static const char *special_mapping_name(struct vm_area_struct *vma) { … } static int special_mapping_mremap(struct vm_area_struct *new_vma) { … } static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr) { … } static const struct vm_operations_struct special_mapping_vmops = …; static const struct vm_operations_struct legacy_special_mapping_vmops = …; static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { … } static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, void *priv, const struct vm_operations_struct *ops) { … } bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm) { … } /* * Called with mm->mmap_lock held for writing. * Insert a new vma covering the given region, with the given flags. * Its pages are supplied by the given array of struct page *. * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. * The region past the last page supplied will always produce SIGBUS. * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. */ struct vm_area_struct *_install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, const struct vm_special_mapping *spec) { … } int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { … } static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { … } static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) { … } /* * This operation locks against the VM for all pte/vma/mm related * operations that could ever happen on a certain mm. This includes * vmtruncate, try_to_unmap, and all page faults. * * The caller must take the mmap_lock in write mode before calling * mm_take_all_locks(). The caller isn't allowed to release the * mmap_lock until mm_drop_all_locks() returns. * * mmap_lock in write mode is required in order to block all operations * that could modify pagetables and free pages without need of * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. * * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * * We take locks in following order, accordingly to comment at beginning * of mm/rmap.c: * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for * hugetlb mapping); * - all vmas marked locked * - all i_mmap_rwsem locks; * - all anon_vma->rwseml * * We can take all locks within these types randomly because the VM code * doesn't nest them and we protected from parallel mm_take_all_locks() by * mm_all_locks_mutex. * * mm_take_all_locks() and mm_drop_all_locks are expensive operations * that may have to take thousand of locks. * * mm_take_all_locks() can fail if it's interrupted by signals. */ int mm_take_all_locks(struct mm_struct *mm) { … } static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { … } static void vm_unlock_mapping(struct address_space *mapping) { … } /* * The mmap_lock cannot be released by the caller until * mm_drop_all_locks() returns. */ void mm_drop_all_locks(struct mm_struct *mm) { … } /* * initialise the percpu counter for VM */ void __init mmap_init(void) { … } /* * Initialise sysctl_user_reserve_kbytes. * * This is intended to prevent a user from starting a single memory hogging * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER * mode. * * The default value is min(3% of free memory, 128MB) * 128MB is enough to recover with sshd/login, bash, and top/kill. */ static int init_user_reserve(void) { … } subsys_initcall(init_user_reserve); /* * Initialise sysctl_admin_reserve_kbytes. * * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin * to log in and kill a memory hogging process. * * Systems with more than 256MB will reserve 8MB, enough to recover * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will * only reserve 3% of free pages by default. */ static int init_admin_reserve(void) { … } subsys_initcall(init_admin_reserve); /* * Reinititalise user and admin reserves if memory is added or removed. * * The default user reserve max is 128MB, and the default max for the * admin reserve is 8MB. These are usually, but not always, enough to * enable recovery from a memory hogging process using login/sshd, a shell, * and tools like top. It may make sense to increase or even disable the * reserve depending on the existence of swap or variations in the recovery * tools. So, the admin may have changed them. * * If memory is added and the reserves have been eliminated or increased above * the default max, then we'll trust the admin. * * If memory is removed and there isn't enough free memory, then we * need to reset the reserves. * * Otherwise keep the reserve set by the admin. */ static int reserve_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { … } static int __meminit init_reserve_notifier(void) { … } subsys_initcall(init_reserve_notifier);