linux/arch/x86/xen/mmu_pv.c

// SPDX-License-Identifier: GPL-2.0

/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <[email protected]>, XenSource Inc, 2007
 */
#include <linux/sched/mm.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
#include <linux/vmalloc.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/seq_file.h>
#include <linux/crash_dump.h>
#include <linux/pgtable.h>
#ifdef CONFIG_KEXEC_CORE
#include <linux/kexec.h>
#endif

#include <trace/events/xen.h>

#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
#include <asm/e820/api.h>
#include <asm/linkage.h>
#include <asm/page.h>
#include <asm/init.h>
#include <asm/memtype.h>
#include <asm/smp.h>
#include <asm/tlb.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/hvm/hvm_op.h>
#include <xen/interface/version.h>
#include <xen/interface/memory.h>
#include <xen/hvc-console.h>
#include <xen/swiotlb-xen.h>

#include "xen-ops.h"

/*
 * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order
 * to avoid warnings with "-Wmissing-prototypes".
 */
pteval_t xen_pte_val(pte_t pte);
pgdval_t xen_pgd_val(pgd_t pgd);
pmdval_t xen_pmd_val(pmd_t pmd);
pudval_t xen_pud_val(pud_t pud);
p4dval_t xen_p4d_val(p4d_t p4d);
pte_t xen_make_pte(pteval_t pte);
pgd_t xen_make_pgd(pgdval_t pgd);
pmd_t xen_make_pmd(pmdval_t pmd);
pud_t xen_make_pud(pudval_t pud);
p4d_t xen_make_p4d(p4dval_t p4d);
pte_t xen_make_pte_init(pteval_t pte);

#ifdef CONFIG_X86_VSYSCALL_EMULATION
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif

/*
 * Protects atomic reservation decrease/increase against concurrent increases.
 * Also protects non-atomic updates of current_pages and balloon lists.
 */
static DEFINE_SPINLOCK(xen_reservation_lock);

/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3.  This may not be the current effective cr3, because
 * its update may still be lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early).  If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
static DEFINE_PER_CPU(unsigned long, xen_current_cr3);	/* actual vcpu cr3 */
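
/*
 * Illustrative sketch, not part of the original file: per the note above, a
 * CPU inspecting another vcpu's pagetable base should read that CPU's
 * xen_current_cr3 rather than xen_cr3.  The name example_read_remote_cr3 and
 * the bare per_cpu() access are assumptions made for this sketch only.
 */
static inline unsigned long example_read_remote_cr3(int cpu)
{
	return per_cpu(xen_current_cr3, cpu);
}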

static phys_addr_t xen_pt_base, xen_pt_size __initdata;

static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready);

/*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
 */
#define USER_LIMIT

void make_lowmem_page_readonly(void *vaddr)
{}

void make_lowmem_page_readwrite(void *vaddr)
{}


/*
 * During early boot all page table pages are pinned, but we do not have struct
 * pages, so return true until struct pages are ready.
 */
static bool xen_page_pinned(void *ptr)
{}
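
/*
 * Sketch of the behaviour described above; an assumption about the shape,
 * not the original body.  It keys off the xen_struct_pages_ready static
 * branch declared earlier and the page's pinned flag once struct pages
 * exist.  The name example_page_pinned is invented for this sketch.
 */
static bool __maybe_unused example_page_pinned(void *ptr)
{
	if (static_branch_likely(&xen_struct_pages_ready))
		return PagePinned(virt_to_page(ptr));

	return true;	/* early boot: every page table page is pinned */
}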

static void xen_extend_mmu_update(const struct mmu_update *update)
{}

static void xen_extend_mmuext_op(const struct mmuext_op *op)
{}

static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{}

static void xen_set_pmd(pmd_t *ptr, pmd_t val)
{}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void __init set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{}
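
/*
 * Sketch, not the original body: one plausible way to do the association
 * described above is to build a machine-frame PTE with mfn_pte() and install
 * it through the generic set_pte_vaddr() helper.  example_set_pte_mfn is a
 * made-up name for this illustration.
 */
static void __init __maybe_unused example_set_pte_mfn(unsigned long vaddr,
						      unsigned long mfn,
						      pgprot_t flags)
{
	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}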

static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{}

static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
{}

static void xen_set_pte(pte_t *ptep, pte_t pteval)
{}

static pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma,
					unsigned long addr, pte_t *ptep)
{}

static void xen_ptep_modify_prot_commit(struct vm_area_struct *vma,
					unsigned long addr,
					pte_t *ptep, pte_t pte)
{}

/* Assume pteval_t is equivalent to all the other *val_t types. */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{}

static pteval_t pte_pfn_to_mfn(pteval_t val)
{}
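
/*
 * Illustrative sketch, not the original body: the pfn -> mfn direction
 * presumably looks roughly like this.  PTE_PFN_MASK, PTE_FLAGS_MASK,
 * pfn_to_mfn() and INVALID_P2M_ENTRY are existing x86/Xen definitions; the
 * name example_pte_pfn_to_mfn is invented here.  A pfn with no backing mfn
 * yields an empty, non-present pte, which is how ballooned-out pages are
 * handled (see the comment above xen_set_pte_init() below).
 */
static pteval_t __maybe_unused example_pte_pfn_to_mfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		unsigned long mfn = pfn_to_mfn(pfn);

		if (mfn == INVALID_P2M_ENTRY)
			val = 0;	/* no mfn: leave the pte empty */
		else
			val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
	}

	return val;
}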

__visible pteval_t xen_pte_val(pte_t pte)
{}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);

__visible pgdval_t xen_pgd_val(pgd_t pgd)
{}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);

__visible pte_t xen_make_pte(pteval_t pte)
{}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);

__visible pgd_t xen_make_pgd(pgdval_t pgd)
{}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);

__visible pmdval_t xen_pmd_val(pmd_t pmd)
{}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);

static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{}

static void xen_set_pud(pud_t *ptr, pud_t val)
{}

__visible pmd_t xen_make_pmd(pmdval_t pmd)
{}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);

__visible pudval_t xen_pud_val(pud_t pud)
{}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);

__visible pud_t xen_make_pud(pudval_t pud)
{}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);

static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{}

static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{}

/*
 * Raw hypercall-based set_p4d, intended for use early in boot, before
 * there's a page structure.  This implies:
 *  1. The only existing pagetable is the kernel's
 *  2. It is always pinned
 *  3. It has no user pagetable attached to it
 */
static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{}
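
/*
 * Sketch of the raw-hypercall approach described above; an assumption about
 * the shape, not the original body (the real helpers batch the update via
 * xen_extend_mmu_update() instead of issuing it synchronously).  The name
 * example_set_p4d_hyper is invented for this illustration; mmu_update,
 * MMU_NORMAL_PT_UPDATE, virt_to_machine() and HYPERVISOR_mmu_update() are
 * existing Xen interfaces.
 */
static void __init __maybe_unused example_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{
	struct mmu_update u = {
		/* machine address of the entry; low bits select the op (0) */
		.ptr = virt_to_machine(ptr).maddr | MMU_NORMAL_PT_UPDATE,
		.val = native_p4d_val(val),
	};

	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
		BUG();
}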

static void xen_set_p4d(p4d_t *ptr, p4d_t val)
{}

#if CONFIG_PGTABLE_LEVELS >= 5
__visible p4dval_t xen_p4d_val(p4d_t p4d)
{}
PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val);

__visible p4d_t xen_make_p4d(p4dval_t p4d)
{}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d);
#endif  /* CONFIG_PGTABLE_LEVELS >= 5 */

static void xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
			 void (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			 bool last, unsigned long limit)
{}

static void xen_pud_walk(struct mm_struct *mm, pud_t *pud,
			 void (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			 bool last, unsigned long limit)
{}

static void xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
			 void (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			 bool last, unsigned long limit)
{}

/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit.  In the normal case this
 * will be STACK_TOP_MAX, but at boot we need to pin up to
 * FIXADDR_TOP.
 *
 * We must skip the Xen hole in the middle of the address space, just after
 * the big x86-64 virtual hole.
 */
static void __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
			   void (*func)(struct mm_struct *mm, struct page *,
					enum pt_level),
			   unsigned long limit)
{}
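
/*
 * Sketch of how one of the per-level helpers above might look; an assumption
 * about the shape, not the original body.  The idea, per the comment above,
 * is to visit every present entry and hand the page table page it references
 * to the callback; the "last" leg only walks up to the entry covering limit.
 * example_pmd_walk is a made-up name for this sketch.
 */
static void __maybe_unused example_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
			void (*func)(struct mm_struct *mm, struct page *,
				     enum pt_level),
			bool last, unsigned long limit)
{
	int i, nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;

	for (i = 0; i < nr; i++) {
		if (!pmd_none(pmd[i]))
			(*func)(mm, pmd_page(pmd[i]), PT_PTE);
	}
}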

static void xen_pgd_walk(struct mm_struct *mm,
			 void (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			 unsigned long limit)
{}

/* If we're using split pte locks, then take the page's lock and
   return a pointer to it.  Otherwise return NULL. */
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{}

static void xen_pte_unlock(void *v)
{}

static void xen_do_pin(unsigned level, unsigned long pfn)
{}

static void xen_pin_page(struct mm_struct *mm, struct page *page,
			 enum pt_level level)
{}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{}
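
/*
 * Sketch of the pinning sequence described above; an assumption about the
 * shape, not the original body (the real code batches everything through
 * multicalls and also handles a user pgd).  Conceptually: walk the tree so
 * xen_pin_page() makes every page table page read-only, then ask Xen to pin
 * the top-level frame.  example_pgd_pin is a made-up name for this sketch.
 */
static void __maybe_unused example_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
	struct mmuext_op op = {
		.cmd = MMUEXT_PIN_L4_TABLE,
		.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))),
	};

	__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT);

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
		BUG();
}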

static void xen_pgd_pin(struct mm_struct *mm)
{}

/*
 * On save, we need to pin all pagetables to make sure they get their
 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 * them (unpinned pgds are not currently in use, probably because the
 * process is under construction or destruction).
 *
 * Expected to be called in stop_machine() ("equivalent to taking
 * every spinlock in the system"), so the locking doesn't really
 * matter all that much.
 */
void xen_mm_pin_all(void)
{}

static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
				   enum pt_level level)
{}

/*
 * The init_mm pagetable is really pinned as soon as it's created, but
 * that's before we have page structures to store the bits.  So do all
 * the book-keeping now once struct pages for allocated pages are
 * initialized. This happens only after memblock_free_all() is called.
 */
static void __init xen_after_bootmem(void)
{}

static void xen_unpin_page(struct mm_struct *mm, struct page *page,
			   enum pt_level level)
{}

/* Release a pagetable's pages back as normal RW */
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{}

static void xen_pgd_unpin(struct mm_struct *mm)
{}

/*
 * On resume, undo any pinning done at save, so that the rest of the
 * kernel doesn't see any unexpected pinned pagetables.
 */
void xen_mm_unpin_all(void)
{}

static void xen_enter_mmap(struct mm_struct *mm)
{}

static void drop_mm_ref_this_cpu(void *info)
{}

#ifdef CONFIG_SMP
/*
 * Another CPU may still have its %cr3 pointing at the pagetable, so
 * we need to repoint it somewhere else before we can unpin it.
 */
static void xen_drop_mm_ref(struct mm_struct *mm)
{}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	drop_mm_ref_this_cpu(mm);
}
#endif
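
/*
 * Sketch of the SMP case described above xen_drop_mm_ref(); an assumption
 * about the shape, not the original body (the real code also consults
 * mm_cpumask() and batches the IPIs).  Find every CPU whose last-loaded cr3
 * (xen_current_cr3) still points at this mm's pgd and make it switch away.
 * example_drop_mm_ref is a made-up name for this sketch.
 */
static void __maybe_unused example_drop_mm_ref(struct mm_struct *mm)
{
	unsigned long pgd_pa = __pa(mm->pgd);
	int cpu;

	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == pgd_pa)
			smp_call_function_single(cpu, drop_mm_ref_this_cpu,
						 mm, 1);
	}
}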

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces them to be read-only and controls all updates to
 * them.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to init_mm, then
 * unpin the old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may still be using the
 * pagetable because of lazy TLB flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
static void xen_exit_mmap(struct mm_struct *mm)
{}
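
/*
 * Sketch of the teardown sequence just described; an assumption about the
 * shape, not the original body.  Kick other CPUs off the pagetable first,
 * then unpin it (if it is still pinned) so the rest of teardown can treat it
 * as ordinary memory.  example_exit_mmap is a made-up name for this sketch.
 */
static void __maybe_unused example_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	xen_drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);
	if (xen_page_pinned(mm->pgd))
		xen_pgd_unpin(mm);
	spin_unlock(&mm->page_table_lock);
}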

static void xen_post_allocator_init(void);

static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{}

static void __init xen_cleanhighmap(unsigned long vaddr,
				    unsigned long vaddr_end)
{}

/*
 * Make a page range writeable and free it.
 */
static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
{}

static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
{}

static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
{}

static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
{}

static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
{}

/*
 * Since it is well isolated, we can (and since it is perhaps large, we should)
 * also free the page tables mapping the initial P->M table.
 */
static void __init xen_cleanmfnmap(unsigned long vaddr)
{}

static void __init xen_pagetable_p2m_free(void)
{}

static void __init xen_pagetable_cleanhighmap(void)
{}

static void __init xen_pagetable_p2m_setup(void)
{}

static void __init xen_pagetable_init(void)
{}

static noinstr void xen_write_cr2(unsigned long cr2)
{}

static noinline void xen_flush_tlb(void)
{}

static void xen_flush_tlb_one_user(unsigned long addr)
{}

static void xen_flush_tlb_multi(const struct cpumask *cpus,
				const struct flush_tlb_info *info)
{}

static unsigned long xen_read_cr3(void)
{}

static void set_current_cr3(void *v)
{}

static void __xen_write_cr3(bool kernel, unsigned long cr3)
{}
static void xen_write_cr3(unsigned long cr3)
{}

/*
 * At the start of the day, when Xen launches a guest, it has already
 * built pagetables for the guest. We diligently look over them
 * in xen_setup_kernel_pagetable and graft them as appropriate into
 * init_top_pgt and its friends. Then, when we are happy, we load
 * the new init_top_pgt - and continue on.
 *
 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
 * up the rest of the pagetables. When it has completed, it loads the cr3.
 * N.B. baremetal would also start at 'start_kernel' (with the early
 * #PF handler creating the bootstrap pagetables), so we are running
 * under the same assumptions about what to do when write_cr3 is
 * executed at this point.
 *
 * Since there are no user pagetables at all, we have two variants
 * of xen_write_cr3: the early bootup one (this function) and the late
 * one (xen_write_cr3). The reason we need both is that in 64-bit the
 * Linux kernel and user-space both run in ring 3, while the
 * hypervisor is in ring 0.
 */
static void __init xen_write_cr3_init(unsigned long cr3)
{}

static int xen_pgd_alloc(struct mm_struct *mm)
{}

static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{}

/*
 * Init-time set_pte while constructing initial pagetables, which
 * doesn't allow RO page table pages to be remapped RW.
 *
 * If there is no MFN for this PFN then this page is initially
 * ballooned out so clear the PTE (as in decrease_reservation() in
 * drivers/xen/balloon.c).
 *
 * Many of these PTE updates are done on unpinned and writable pages
 * and doing a hypercall for these is unnecessary and expensive.  At
 * this point it is not possible to tell if a page is pinned, so
 * always write the PTE directly and rely on Xen trapping and
 * emulating any updates as necessary.
 */
static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
{}

__visible pte_t xen_make_pte_init(pteval_t pte)
{}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);

/* Early in boot, while setting up the initial pagetable, assume
   everything is pinned. */
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{}

/* Used for pmd and pud */
static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
{}

/* Early release_pte assumes that all pts are pinned, since there's
   only init_mm and anything attached to that is pinned. */
static void __init xen_release_pte_init(unsigned long pfn)
{}

static void __init xen_release_pmd_init(unsigned long pfn)
{}

static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{}

static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
{}

/* This needs to make sure the new pte page is pinned iff it's being
   attached to a pinned pagetable. */
static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
				    unsigned level)
{}
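
/*
 * Sketch of the rule just described; an assumption about the shape, not the
 * original body (the real code batches the hypercalls and also tracks the
 * split-ptlock case).  If the owning pagetable is pinned, the new page must
 * be marked pinned, made read-only, and (for a pte page) pinned with Xen
 * before it can be linked in.  example_alloc_ptpage is a made-up name.
 */
static void __maybe_unused example_alloc_ptpage(struct mm_struct *mm,
						unsigned long pfn,
						unsigned level)
{
	if (xen_page_pinned(mm->pgd)) {
		SetPagePinned(pfn_to_page(pfn));
		__set_pfn_prot(pfn, PAGE_KERNEL_RO);
		if (level == PT_PTE)
			__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
	}
}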

static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{}

static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{}

/* This should never happen until we're OK to use struct page */
static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
{}

static void xen_release_pte(unsigned long pfn)
{}

static void xen_release_pmd(unsigned long pfn)
{}

static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{}

static void xen_release_pud(unsigned long pfn)
{}

/*
 * Like __va(), but returns the address in the kernel mapping (which is
 * all we have until the physical memory mapping has been set up).
 */
static void * __init __ka(phys_addr_t paddr)
{}

/* Convert a machine address to physical address */
static unsigned long __init m2p(phys_addr_t maddr)
{}

/* Convert a machine address to kernel virtual */
static void * __init m2v(phys_addr_t maddr)
{}
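
/*
 * Illustrative composition of the two helpers above; an assumption, not the
 * original bodies.  Machine -> physical goes through the m2p table via
 * mfn_to_pfn(), and machine -> virtual is then just the kernel mapping of
 * that physical address.  The example_* names are invented for this sketch.
 */
static unsigned long __init __maybe_unused example_m2p(phys_addr_t maddr)
{
	return PFN_PHYS(mfn_to_pfn(PFN_DOWN(maddr)));
}

static void * __init __maybe_unused example_m2v(phys_addr_t maddr)
{
	return __ka(example_m2p(maddr));
}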

/* Set the page permissions on identity-mapped pages */
static void __init set_page_prot_flags(void *addr, pgprot_t prot,
				       unsigned long flags)
{}
static void __init set_page_prot(void *addr, pgprot_t prot)
{}

void __init xen_setup_machphys_mapping(void)
{}

static void __init convert_pfn_mfn(void *v)
{}
static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
				 unsigned long addr)
{}
/*
 * Set up the initial kernel pagetable.
 *
 * We can construct this by grafting the Xen provided pagetable into
 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
 * level2_ident_pgt, and level2_kernel_pgt.  This means that only the
 * kernel has a physical mapping to start with - but that's enough to
 * get __va working.  We need to fill in the rest of the physical
 * mapping once some sort of allocator has been set up.
 */
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{}

/*
 * Read a value from a physical address.
 */
static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
{}

/*
 * Translate a virtual address to a physical one without relying on mapped
 * page tables. Don't rely on big pages being aligned in (guest) physical
 * space!
 */
static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
{}

/*
 * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
 * this area.
 */
void __init xen_relocate_p2m(void)
{}

void __init xen_reserve_special_pages(void)
{}

void __init xen_pt_check_e820(void)
{}

static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;

static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{}

static void xen_enter_lazy_mmu(void)
{}

static void xen_flush_lazy_mmu(void)
{}

static void __init xen_post_allocator_init(void)
{}

static void xen_leave_lazy_mmu(void)
{}

static const typeof(pv_ops) xen_mmu_ops __initconst =;

void __init xen_init_mmu_ops(void)
{}

/* Protected by xen_reservation_lock. */
#define MAX_CONTIG_ORDER
static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];

#define VOID_PTE
static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
				unsigned long *in_frames,
				unsigned long *out_frames)
{}

/*
 * Update the pfn-to-mfn mappings for a virtual address range, either to
 * point to an array of mfns, or contiguously from a single starting
 * mfn.
 */
static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
				     unsigned long *mfns,
				     unsigned long first_mfn)
{}

/*
 * Perform the hypercall to exchange a region of our pfns to point to
 * memory with the required contiguous alignment.  Takes the pfns as
 * input, and populates mfns as output.
 *
 * Returns a success code indicating whether the hypervisor was able to
 * satisfy the request or not.
 */
static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
			       unsigned long *pfns_in,
			       unsigned long extents_out,
			       unsigned int order_out,
			       unsigned long *mfns_out,
			       unsigned int address_bits)
{}
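
/*
 * Sketch of the exchange described above; an assumption about the shape, not
 * the original body (error and partial-success handling is omitted, and the
 * nonzero-on-full-success return convention is this sketch's own).  It is a
 * single XENMEM_exchange memory_op whose "in" reservation names our current
 * frames and whose "out" reservation describes the desired extent size,
 * order and address width.  example_exchange_memory is a made-up name.
 */
static int __maybe_unused example_exchange_memory(unsigned long extents_in,
						  unsigned int order_in,
						  unsigned long *pfns_in,
						  unsigned long extents_out,
						  unsigned int order_out,
						  unsigned long *mfns_out,
						  unsigned int address_bits)
{
	struct xen_memory_exchange exchange = {
		.in = {
			.nr_extents   = extents_in,
			.extent_order = order_in,
			.domid        = DOMID_SELF
		},
		.out = {
			.nr_extents   = extents_out,
			.extent_order = order_out,
			.address_bits = address_bits,
			.domid        = DOMID_SELF
		}
	};

	set_xen_guest_handle(exchange.in.extent_start, pfns_in);
	set_xen_guest_handle(exchange.out.extent_start, mfns_out);

	if (HYPERVISOR_memory_op(XENMEM_exchange, &exchange))
		return 0;

	/* success only if every input extent was exchanged */
	return exchange.nr_exchanged == extents_in;
}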

int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
				 unsigned int address_bits,
				 dma_addr_t *dma_handle)
{}

void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
{}

static noinline void xen_flush_tlb_all(void)
{}

#define REMAP_BATCH_SIZE

struct remap_data {};

static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data)
{}

int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
		  xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot,
		  unsigned int domid, bool no_translate)
{}
EXPORT_SYMBOL_GPL(xen_remap_pfn);

#ifdef CONFIG_VMCORE_INFO
phys_addr_t paddr_vmcoreinfo_note(void)
{}
#endif /* CONFIG_VMCORE_INFO */