// SPDX-License-Identifier: GPL-2.0-only
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/sched/smt.h>
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/cacheflush.h>
#include <asm/apic.h>
#include <asm/perf_event.h>

#include "mm_internal.h"

#ifdef CONFIG_PARAVIRT
#define STATIC_NOPV
#else
#define STATIC_NOPV		…
#define __flush_tlb_local	…
#define __flush_tlb_global	…
#define __flush_tlb_one_user	…
#define __flush_tlb_multi	…
#endif

/*
 *	TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <[email protected]>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

/*
 * Bits to mangle the TIF_SPEC_* state into the mm pointer which is
 * stored in cpu_tlbstate.last_user_mm_spec.
 */
#define LAST_USER_MM_IBPB	…
#define LAST_USER_MM_L1D_FLUSH	…
#define LAST_USER_MM_SPEC_MASK	…

/* Bits to set when tlbstate and flush are (re)initialized */
#define LAST_USER_MM_INIT	…

/*
 * The x86 feature is called PCID (Process Context IDentifier). It is similar
 * to what is traditionally called ASID on the RISC processors.
 *
 * We don't use the traditional ASID implementation, where each process/mm gets
 * its own ASID and flush/restart when we run out of ASID space.
 *
 * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
 * that came by on this CPU, allowing cheaper switch_mm between processes on
 * this CPU.
 *
 * We end up with different spaces for different things. To avoid confusion we
 * use different names for each of them:
 *
 * ASID  - [0, TLB_NR_DYN_ASIDS-1]
 *         the canonical identifier for an mm
 *
 * kPCID - [1, TLB_NR_DYN_ASIDS]
 *         the value we write into the PCID part of CR3; corresponds to the
 *         ASID+1, because PCID 0 is special.
 *
 * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
 *         for KPTI each mm has two address spaces and thus needs two
 *         PCID values, but we can still do with a single ASID denomination
 *         for each mm. Corresponds to kPCID + 2048.
 */

/* There are 12 bits of space for ASIDs in CR3 */
#define CR3_HW_ASID_BITS	…

/*
 * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
 * user/kernel switches
 */
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
#define PTI_CONSUMED_PCID_BITS	…
#else
#define PTI_CONSUMED_PCID_BITS	…
#endif

#define CR3_AVAIL_PCID_BITS	…

/*
 * ASIDs are zero-based: 0->MAX_ASID_AVAILABLE are valid. -1 below to account
 * for them being zero-based. Another -1 is because PCID 0 is reserved for
 * use by non-PCID-aware users.
 */
#define MAX_ASID_AVAILABLE	…

/*
 * Given @asid, compute kPCID
 */
static inline u16 kern_pcid(u16 asid)
{ … }

/*
 * Given @asid, compute uPCID
 */
static inline u16 user_pcid(u16 asid)
{ … }

static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
{ … }

static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
					      unsigned long lam)
{ … }
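/*
 * Illustrative sketch of the naming scheme documented above: kPCID is
 * simply ASID + 1 (because PCID 0 is special), and under KPTI uPCID is
 * kPCID + 2048.  The macro and helpers below are hypothetical stand-ins
 * used only to show that arithmetic; they are not this file's actual
 * (elided) kern_pcid()/user_pcid() definitions.
 */
#if 0	/* example only, never compiled */
#define EXAMPLE_PTI_USER_PCID_BIT	11	/* 2048 == 1 << 11 */

static inline u16 example_kern_pcid(u16 asid)
{
	/* PCID 0 is reserved, so the kernel PCID for an ASID is ASID + 1. */
	return asid + 1;
}

static inline u16 example_user_pcid(u16 asid)
{
	/* The user half of a KPTI mm sits 2048 PCIDs above the kernel half. */
	return example_kern_pcid(asid) | (1 << EXAMPLE_PTI_USER_PCID_BIT);
}
#endif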
/*
 * We get here when we do something requiring a TLB invalidation
 * but could not go invalidate all of the contexts.  We do the
 * necessary invalidation by clearing out the 'ctx_id' which
 * forces a TLB flush when the context is loaded.
 */
static void clear_asid_other(void)
{ … }

atomic64_t last_mm_ctx_id = …;

static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
			    u16 *new_asid, bool *need_flush)
{ … }

/*
 * Given an ASID, flush the corresponding user ASID.  We can delay this
 * until the next time we switch to it.
 *
 * See SWITCH_TO_USER_CR3.
 */
static inline void invalidate_user_asid(u16 asid)
{ … }

static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
			    bool need_flush)
{ … }

void leave_mm(void)
{ … }
EXPORT_SYMBOL_GPL(…);

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{ … }

/*
 * Invoked from return to user/guest by a task that opted-in to L1D
 * flushing but ended up running on an SMT enabled core due to wrong
 * affinity settings or CPU hotplug.  This is part of the paranoid L1D flush
 * contract which this task requested.
 */
static void l1d_flush_force_sigbus(struct callback_head *ch)
{ … }

static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
			       struct task_struct *next)
{ … }

static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{ … }

static void cond_mitigation(struct task_struct *next)
{ … }

#ifdef CONFIG_PERF_EVENTS
static inline void cr4_update_pce_mm(struct mm_struct *mm)
{ … }

void cr4_update_pce(void *ignored)
{ … }

#else
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif

/*
 * This optimizes when not actually switching mm's.  Some architectures use the
 * 'unused' argument for this optimization, but x86 must use
 * 'cpu_tlbstate.loaded_mm' instead because it does not always keep
 * 'current->active_mm' up to date.
 */
void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
			struct task_struct *tsk)
{ … }

/*
 * Please ignore the name of this function.  It should be called
 * switch_to_kernel_thread().
 *
 * enter_lazy_tlb() is a hint from the scheduler that we are entering a
 * kernel thread or other context without an mm.  Acceptable implementations
 * include doing nothing whatsoever, switching to init_mm, or various clever
 * lazy tricks to try to minimize TLB flushes.
 *
 * The scheduler reserves the right to call enter_lazy_tlb() several times
 * in a row.  It will notify us that we're going back to a real mm by
 * calling switch_mm_irqs_off().
 */
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{ … }

/*
 * Call this when reinitializing a CPU.  It fixes the following potential
 * problems:
 *
 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
 *   because the CPU was taken down and came back up with CR3's PCID
 *   bits clear.  CPU hotplug can do this).
 *
 * - The TLB contains junk in slots corresponding to inactive ASIDs.
 *
 * - The CPU went so far out to lunch that it may have missed a TLB
 *   flush.
 */
void initialize_tlbstate_and_flush(void)
{ … }
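/*
 * Illustrative sketch of the per-CPU ASID cache managed by the switch_mm
 * path above (choose_new_asid() and switch_mm_irqs_off()).  This is a
 * simplified model, not the elided kernel implementation: the types, field
 * names and EXAMPLE_NR_DYN_ASIDS below are hypothetical stand-ins.  It only
 * restates the policy described in the header comment: keep a small per-CPU
 * array of (ctx_id, tlb_gen) slots, reuse a slot that already holds the
 * incoming mm (flushing if its cached generation is stale), otherwise evict
 * a slot round-robin and flush.
 */
#if 0	/* example only, never compiled */
#define EXAMPLE_NR_DYN_ASIDS	6

struct example_asid_slot {
	u64 ctx_id;	/* which mm this slot last held */
	u64 tlb_gen;	/* newest flush generation seen for that mm */
};

struct example_tlb_state {
	struct example_asid_slot ctxs[EXAMPLE_NR_DYN_ASIDS];
	u16 next_asid;	/* round-robin eviction cursor */
};

static void example_choose_asid(struct example_tlb_state *ts,
				u64 next_ctx_id, u64 next_tlb_gen,
				u16 *new_asid, bool *need_flush)
{
	u16 asid;

	for (asid = 0; asid < EXAMPLE_NR_DYN_ASIDS; asid++) {
		if (ts->ctxs[asid].ctx_id != next_ctx_id)
			continue;
		/* Cache hit: flush only if this slot missed newer flushes. */
		*new_asid = asid;
		*need_flush = ts->ctxs[asid].tlb_gen < next_tlb_gen;
		return;
	}

	/* Cache miss: evict the next slot and flush its stale TLB entries. */
	*new_asid = ts->next_asid;
	ts->next_asid = (ts->next_asid + 1) % EXAMPLE_NR_DYN_ASIDS;
	*need_flush = true;
}
#endif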
/*
 * flush_tlb_func()'s memory ordering requirement is that any
 * TLB fills that happen after we flush the TLB are ordered after we
 * read active_mm's tlb_gen.  We don't need any explicit barriers
 * because all x86 flush operations are serializing and the
 * atomic64_read operation won't be reordered by the compiler.
 */
static void flush_tlb_func(void *info)
{ … }

static bool tlb_is_not_lazy(int cpu, void *data)
{ … }

DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
EXPORT_PER_CPU_SYMBOL(…);

STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
					const struct flush_tlb_info *info)
{ … }

void flush_tlb_multi(const struct cpumask *cpumask,
		     const struct flush_tlb_info *info)
{ … }

/*
 * See Documentation/arch/x86/tlb.rst for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
unsigned long tlb_single_page_flush_ceiling __read_mostly = …;

static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);

#ifdef CONFIG_DEBUG_VM
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
#endif

static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
			unsigned long start, unsigned long end,
			unsigned int stride_shift, bool freed_tables,
			u64 new_tlb_gen)
{ … }

static void put_flush_tlb_info(void)
{ … }

void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, unsigned int stride_shift,
			bool freed_tables)
{ … }

static void do_flush_tlb_all(void *info)
{ … }

void flush_tlb_all(void)
{ … }

static void do_kernel_range_flush(void *info)
{ … }

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{ … }

/*
 * This can be used from process context to figure out what the value of
 * CR3 is without needing to do a (slow) __read_cr3().
 *
 * It's intended to be used for code like KVM that sneakily changes CR3
 * and needs to restore it.  It needs to be used very carefully.
 */
unsigned long __get_current_cr3_fast(void)
{ … }
EXPORT_SYMBOL_GPL(…);

/*
 * Flush one page in the kernel mapping
 */
void flush_tlb_one_kernel(unsigned long addr)
{ … }

/*
 * Flush one page in the user mapping
 */
STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
{ … }

void flush_tlb_one_user(unsigned long addr)
{ … }

/*
 * Flush everything
 */
STATIC_NOPV void native_flush_tlb_global(void)
{ … }

/*
 * Flush the entire current user mapping
 */
STATIC_NOPV void native_flush_tlb_local(void)
{ … }

void flush_tlb_local(void)
{ … }

/*
 * Flush everything
 */
void __flush_tlb_all(void)
{ … }
EXPORT_SYMBOL_GPL(…);

void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{ … }

/*
 * Blindly accessing user memory from NMI context can be dangerous
 * if we're in the middle of switching the current user task or
 * switching the loaded mm.  It can also be dangerous if we
 * interrupted some kernel code that was temporarily using a
 * different mm.
 */
bool nmi_uaccess_okay(void)
{ … }

static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{ … }

static ssize_t tlbflush_write_file(struct file *file,
				   const char __user *user_buf,
				   size_t count, loff_t *ppos)
{ … }

static const struct file_operations fops_tlbflush = …;

static int __init create_tlb_single_page_flush_ceiling(void)
{ … }
late_initcall(create_tlb_single_page_flush_ceiling);
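/*
 * Usage note / illustrative sketch: as documented in
 * Documentation/arch/x86/tlb.rst, tlb_single_page_flush_ceiling is the
 * threshold the ranged-flush path uses to decide between flushing
 * stride-sized pages one at a time and falling back to a full TLB flush.
 * The helper below is a hypothetical stand-in that only restates that
 * decision; it is not the elided kernel code.  Once
 * create_tlb_single_page_flush_ceiling() has run, the ceiling is normally
 * tunable via debugfs (typically
 * /sys/kernel/debug/x86/tlb_single_page_flush_ceiling, assuming debugfs is
 * mounted at /sys/kernel/debug); writing 0 effectively forces full flushes.
 */
#if 0	/* example only, never compiled */
static bool example_should_flush_full(unsigned long start, unsigned long end,
				      unsigned int stride_shift)
{
	/* Number of stride-sized pages covered by [start, end). */
	unsigned long nr_pages = (end - start) >> stride_shift;

	/*
	 * Beyond the ceiling (33 pages by default, at roughly 100 ns per
	 * single-page flush), per-page invalidations cost more than simply
	 * flushing everything and letting the TLB refill.
	 */
	return nr_pages > tlb_single_page_flush_ceiling;
}
#endif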