// SPDX-License-Identifier: GPL-2.0-only
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/sched/smt.h>
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/cacheflush.h>
#include <asm/apic.h>
#include <asm/perf_event.h>

#include "mm_internal.h"

#ifdef CONFIG_PARAVIRT
#define STATIC_NOPV
#else
#define STATIC_NOPV		…
#define __flush_tlb_local	…
#define __flush_tlb_global	…
#define __flush_tlb_one_user	…
#define __flush_tlb_multi	…
#endif

/*
 *	TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <[email protected]>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

/*
 * Bits to mangle the TIF_SPEC_* state into the mm pointer which is
 * stored in cpu_tlbstate.last_user_mm_spec.
 */
#define LAST_USER_MM_IBPB	…
#define LAST_USER_MM_L1D_FLUSH	…
#define LAST_USER_MM_SPEC_MASK	…

/* Bits to set when tlbstate and flush are (re)initialized */
#define LAST_USER_MM_INIT	…

/*
 * The x86 feature is called PCID (Process Context IDentifier). It is similar
 * to what is traditionally called ASID on the RISC processors.
 *
 * We don't use the traditional ASID implementation, where each process/mm gets
 * its own ASID and flush/restart when we run out of ASID space.
 *
 * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
 * that came by on this CPU, allowing cheaper switch_mm between processes on
 * this CPU.
 *
 * We end up with different spaces for different things. To avoid confusion we
 * use different names for each of them:
 *
 * ASID  - [0, TLB_NR_DYN_ASIDS-1]
 *         the canonical identifier for an mm
 *
 * kPCID - [1, TLB_NR_DYN_ASIDS]
 *         the value we write into the PCID part of CR3; corresponds to the
 *         ASID+1, because PCID 0 is special.
 *
 * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
 *         for KPTI each mm has two address spaces and thus needs two
 *         PCID values, but we can still do with a single ASID denomination
 *         for each mm. Corresponds to kPCID + 2048.
 */

/* There are 12 bits of space for ASIDs in CR3 */
#define CR3_HW_ASID_BITS	…

/*
 * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
 * user/kernel switches
 */
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
#define PTI_CONSUMED_PCID_BITS	…
#else
#define PTI_CONSUMED_PCID_BITS	…
#endif

#define CR3_AVAIL_PCID_BITS	…

/*
 * ASIDs are zero-based: 0->MAX_ASID_AVAILABLE are valid. -1 below to account
 * for them being zero-based. Another -1 is because PCID 0 is reserved for
 * use by non-PCID-aware users.
 */
#define MAX_ASID_AVAILABLE	…

/*
 * Given @asid, compute kPCID
 */
static inline u16 kern_pcid(u16 asid)
{ … }

/*
 * Given @asid, compute uPCID
 */
static inline u16 user_pcid(u16 asid)
{ … }

static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
{ … }

static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
					      unsigned long lam)
{ … }
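/*
 * Illustrative sketch of the naming scheme documented above: kPCID is
 * simply ASID + 1 (because PCID 0 is special), and under KPTI uPCID is
 * kPCID + 2048.  The macro and helpers below are hypothetical stand-ins
 * used only to show that arithmetic; they are not this file's actual
 * (elided) kern_pcid()/user_pcid() definitions.
 */
#if 0	/* example only, never compiled */
#define EXAMPLE_PTI_USER_PCID_BIT	11	/* 2048 == 1 << 11 */

static inline u16 example_kern_pcid(u16 asid)
{
	/* PCID 0 is reserved, so the kernel PCID for an ASID is ASID + 1. */
	return asid + 1;
}

static inline u16 example_user_pcid(u16 asid)
{
	/* The user half of a KPTI mm sits 2048 PCIDs above the kernel half. */
	return example_kern_pcid(asid) | (1 << EXAMPLE_PTI_USER_PCID_BIT);
}
#endif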
/*
 * We get here when we do something requiring a TLB invalidation
 * but could not go invalidate all of the contexts.  We do the
 * necessary invalidation by clearing out the 'ctx_id' which
 * forces a TLB flush when the context is loaded.
 */
static void clear_asid_other(void)
{ … }

atomic64_t last_mm_ctx_id = …;

static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
			    u16 *new_asid, bool *need_flush)
{ … }

/*
 * Given an ASID, flush the corresponding user ASID.  We can delay this
 * until the next time we switch to it.
 *
 * See SWITCH_TO_USER_CR3.
 */
static inline void invalidate_user_asid(u16 asid)
{ … }

static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
			    bool need_flush)
{ … }

void leave_mm(void)
{ … }
EXPORT_SYMBOL_GPL(…);

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{ … }

/*
 * Invoked from return to user/guest by a task that opted-in to L1D
 * flushing but ended up running on an SMT enabled core due to wrong
 * affinity settings or CPU hotplug.  This is part of the paranoid L1D flush
 * contract which this task requested.
 */
static void l1d_flush_force_sigbus(struct callback_head *ch)
{ … }

static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
			       struct task_struct *next)
{ … }

static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{ … }

static void cond_mitigation(struct task_struct *next)
{ … }

#ifdef CONFIG_PERF_EVENTS
static inline void cr4_update_pce_mm(struct mm_struct *mm)
{ … }

void cr4_update_pce(void *ignored)
{ … }

#else
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif

/*
 * This optimizes when not actually switching mm's.  Some architectures use the
 * 'unused' argument for this optimization, but x86 must use
 * 'cpu_tlbstate.loaded_mm' instead because it does not always keep
 * 'current->active_mm' up to date.
 */
void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
			struct task_struct *tsk)
{ … }

/*
 * Please ignore the name of this function.  It should be called
 * switch_to_kernel_thread().
 *
 * enter_lazy_tlb() is a hint from the scheduler that we are entering a
 * kernel thread or other context without an mm.  Acceptable implementations
 * include doing nothing whatsoever, switching to init_mm, or various clever
 * lazy tricks to try to minimize TLB flushes.
 *
 * The scheduler reserves the right to call enter_lazy_tlb() several times
 * in a row.  It will notify us that we're going back to a real mm by
 * calling switch_mm_irqs_off().
 */
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{ … }

/*
 * Call this when reinitializing a CPU.  It fixes the following potential
 * problems:
 *
 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
 *   because the CPU was taken down and came back up with CR3's PCID
 *   bits clear.  CPU hotplug can do this).
 *
 * - The TLB contains junk in slots corresponding to inactive ASIDs.
 *
 * - The CPU went so far out to lunch that it may have missed a TLB
 *   flush.
 */
void initialize_tlbstate_and_flush(void)
{ … }
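/*
 * Illustrative sketch of the per-CPU ASID cache managed by the switch_mm
 * path above (choose_new_asid() and switch_mm_irqs_off()).  This is a
 * simplified model, not the elided kernel implementation: the types, field
 * names and EXAMPLE_NR_DYN_ASIDS below are hypothetical stand-ins.  It only
 * restates the policy described in the header comment: keep a small per-CPU
 * array of (ctx_id, tlb_gen) slots, reuse a slot that already holds the
 * incoming mm (flushing if its cached generation is stale), otherwise evict
 * a slot round-robin and flush.
 */
#if 0	/* example only, never compiled */
#define EXAMPLE_NR_DYN_ASIDS	6

struct example_asid_slot {
	u64 ctx_id;	/* which mm this slot last held */
	u64 tlb_gen;	/* newest flush generation seen for that mm */
};

struct example_tlb_state {
	struct example_asid_slot ctxs[EXAMPLE_NR_DYN_ASIDS];
	u16 next_asid;	/* round-robin eviction cursor */
};

static void example_choose_asid(struct example_tlb_state *ts,
				u64 next_ctx_id, u64 next_tlb_gen,
				u16 *new_asid, bool *need_flush)
{
	u16 asid;

	for (asid = 0; asid < EXAMPLE_NR_DYN_ASIDS; asid++) {
		if (ts->ctxs[asid].ctx_id != next_ctx_id)
			continue;
		/* Cache hit: flush only if this slot missed newer flushes. */
		*new_asid = asid;
		*need_flush = ts->ctxs[asid].tlb_gen < next_tlb_gen;
		return;
	}

	/* Cache miss: evict the next slot and flush its stale TLB entries. */
	*new_asid = ts->next_asid;
	ts->next_asid = (ts->next_asid + 1) % EXAMPLE_NR_DYN_ASIDS;
	*need_flush = true;
}
#endif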
/*
 * flush_tlb_func()'s memory ordering requirement is that any
 * TLB fills that happen after we flush the TLB are ordered after we
 * read active_mm's tlb_gen.  We don't need any explicit barriers
 * because all x86 flush operations are serializing and the
 * atomic64_read operation won't be reordered by the compiler.
 */
static void flush_tlb_func(void *info)
{ … }

static bool tlb_is_not_lazy(int cpu, void *data)
{ … }

DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
EXPORT_PER_CPU_SYMBOL(…);

STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
					const struct flush_tlb_info *info)
{ … }

void flush_tlb_multi(const struct cpumask *cpumask,
		     const struct flush_tlb_info *info)
{ … }

/*
 * See Documentation/arch/x86/tlb.rst for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
unsigned long tlb_single_page_flush_ceiling __read_mostly = …;

static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);

#ifdef CONFIG_DEBUG_VM
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
#endif

static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
			unsigned long start, unsigned long end,
			unsigned int stride_shift, bool freed_tables,
			u64 new_tlb_gen)
{ … }

static void put_flush_tlb_info(void)
{ … }

void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, unsigned int stride_shift,
			bool freed_tables)
{ … }

static void do_flush_tlb_all(void *info)
{ … }

void flush_tlb_all(void)
{ … }

static void do_kernel_range_flush(void *info)
{ … }

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{ … }

/*
 * This can be used from process context to figure out what the value of
 * CR3 is without needing to do a (slow) __read_cr3().
 *
 * It's intended to be used for code like KVM that sneakily changes CR3
 * and needs to restore it.  It needs to be used very carefully.
 */
unsigned long __get_current_cr3_fast(void)
{ … }
EXPORT_SYMBOL_GPL(…);

/*
 * Flush one page in the kernel mapping
 */
void flush_tlb_one_kernel(unsigned long addr)
{ … }

/*
 * Flush one page in the user mapping
 */
STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
{ … }

void flush_tlb_one_user(unsigned long addr)
{ … }

/*
 * Flush everything
 */
STATIC_NOPV void native_flush_tlb_global(void)
{ … }

/*
 * Flush the entire current user mapping
 */
STATIC_NOPV void native_flush_tlb_local(void)
{ … }

void flush_tlb_local(void)
{ … }

/*
 * Flush everything
 */
void __flush_tlb_all(void)
{ … }
EXPORT_SYMBOL_GPL(…);

void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{ … }

/*
 * Blindly accessing user memory from NMI context can be dangerous
 * if we're in the middle of switching the current user task or
 * switching the loaded mm.  It can also be dangerous if we
 * interrupted some kernel code that was temporarily using a
 * different mm.
 */
bool nmi_uaccess_okay(void)
{ … }

static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{ … }

static ssize_t tlbflush_write_file(struct file *file,
				   const char __user *user_buf,
				   size_t count, loff_t *ppos)
{ … }

static const struct file_operations fops_tlbflush = …;

static int __init create_tlb_single_page_flush_ceiling(void)
{ … }
late_initcall(create_tlb_single_page_flush_ceiling);
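/*
 * Usage note / illustrative sketch: as documented in
 * Documentation/arch/x86/tlb.rst, tlb_single_page_flush_ceiling is the
 * threshold the ranged-flush path uses to decide between flushing
 * stride-sized pages one at a time and falling back to a full TLB flush.
 * The helper below is a hypothetical stand-in that only restates that
 * decision; it is not the elided kernel code.  Once
 * create_tlb_single_page_flush_ceiling() has run, the ceiling is normally
 * tunable via debugfs (typically
 * /sys/kernel/debug/x86/tlb_single_page_flush_ceiling, assuming debugfs is
 * mounted at /sys/kernel/debug); writing 0 effectively forces full flushes.
 */
#if 0	/* example only, never compiled */
static bool example_should_flush_full(unsigned long start, unsigned long end,
				      unsigned int stride_shift)
{
	/* Number of stride-sized pages covered by [start, end). */
	unsigned long nr_pages = (end - start) >> stride_shift;

	/*
	 * Beyond the ceiling (33 pages by default, at roughly 100 ns per
	 * single-page flush), per-page invalidations cost more than simply
	 * flushing everything and letting the TLB refill.
	 */
	return nr_pages > tlb_single_page_flush_ceiling;
}
#endif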