memtype.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Page Attribute Table (PAT) support: handle memory caching attributes in page tables.
 *
 * Authors: Venkatesh Pallipadi <[email protected]>
 *          Suresh B Siddha <[email protected]>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 *
 * Basic principles:
 *
 * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and
 * the kernel to set one of a handful of 'caching type' attributes for physical
 * memory ranges: uncached, write-combining, write-through, write-protected,
 * and the most commonly used and default attribute: write-back caching.
 *
 * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is
 * a hardware interface to enumerate a limited number of physical memory ranges
 * and set their caching attributes explicitly, programmed into the CPU via MSRs.
 * Even modern CPUs have MTRRs enabled - but these are typically not touched
 * by the kernel or by user-space (such as the X server), we rely on PAT for any
 * additional cache attribute logic.
 *
 * PAT doesn't work via explicit memory ranges, but uses page table entries to add
 * cache attribute information to the mapped memory range: there's 3 bits used,
 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the
 * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT).
 *
 * ( There's a metric ton of finer details, such as compatibility with CPU quirks
 *   that only support 4 types of PAT entries, and interaction with MTRRs, see
 *   below for details. )
 */

#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <linux/debugfs.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/cacheinfo.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/fcntl.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/memtype.h>
#include <asm/io.h>

#include "memtype.h"
#include "../mm_internal.h"

#undef pr_fmt
#define pr_fmt(fmt) …

static bool __read_mostly pat_disabled = !IS_ENABLED(…);
static u64 __ro_after_init pat_msr_val;

/*
 * PAT support is enabled by default, but can be disabled for
 * various user-requested or hardware-forced reasons:
 */
static void __init pat_disable(const char *msg_reason)
{ … }

static int __init nopat(char *str)
{ … }
early_param(…);

bool pat_enabled(void)
{ … }
EXPORT_SYMBOL_GPL(…);

int pat_debug_enable;

static int __init pat_debug_setup(char *str)
{ … }
__setup(…);

#ifdef CONFIG_X86_PAT
/*
 * X86 PAT uses page flags arch_1 and uncached together to keep track of
 * memory type of pages that have backing page struct.
 *
 * X86 PAT supports 4 different memory types:
 *  - _PAGE_CACHE_MODE_WB
 *  - _PAGE_CACHE_MODE_WC
 *  - _PAGE_CACHE_MODE_UC_MINUS
 *  - _PAGE_CACHE_MODE_WT
 *
 * _PAGE_CACHE_MODE_WB is the default type.
 */

#define _PGMT_WB …
#define _PGMT_WC …
#define _PGMT_UC_MINUS …
#define _PGMT_WT …
#define _PGMT_MASK …
#define _PGMT_CLEAR_MASK …

static inline enum page_cache_mode get_page_memtype(struct page *pg)
{ … }

static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{ … }
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	return -1;
}
static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
}
#endif

enum { … };

#define CM …

static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
						      char *msg)
{ … }

#undef CM

/*
 * Update the cache mode to pgprot translation tables according to PAT
 * configuration.
 * Using lower indices is preferred, so we start with highest index.
 */
static void __init init_cache_modes(u64 pat)
{ … }

void pat_cpu_init(void)
{ … }

/**
 * pat_bp_init - Initialize the PAT MSR value and PAT table
 *
 * This function initializes PAT MSR value and PAT table with an OS-defined
 * value to enable additional cache attributes, WC, WT and WP.
 *
 * This function prepares the calls of pat_cpu_init() via cache_cpu_init()
 * on all CPUs.
 */
void __init pat_bp_init(void)
{ … }

static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */

/*
 * Does intersection of PAT memory type and MTRR memory type and returns
 * the resulting memory type as PAT understands it.
 * (Type in pat and mtrr will not have same value)
 * The intersection is based on "Effective Memory Type" tables in IA-32
 * SDM vol 3a
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end,
				     enum page_cache_mode req_type)
{ … }

struct pagerange_state { … };

static int
pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
{ … }

static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{ … }

/*
 * For RAM pages, we use page flags to mark the pages with appropriate type.
 * The page flags are limited to four types, WB (default), WC, WT and UC-.
 * WP request fails with -EINVAL, and UC gets redirected to UC-.  Setting
 * a new memory type is only allowed for a page mapped with the default WB
 * type.
 *
 * Here we do two passes:
 * - Find the memtype of all the pages in the range, look for any conflicts.
 * - In case of no conflicts, set the new memtype for pages in the range.
 */
static int reserve_ram_pages_type(u64 start, u64 end,
				  enum page_cache_mode req_type,
				  enum page_cache_mode *new_type)
{ … }

static int free_ram_pages_type(u64 start, u64 end)
{ … }

static u64 sanitize_phys(u64 address)
{ … }

/*
 * req_type typically has one of the:
 * - _PAGE_CACHE_MODE_WB
 * - _PAGE_CACHE_MODE_WC
 * - _PAGE_CACHE_MODE_UC_MINUS
 * - _PAGE_CACHE_MODE_UC
 * - _PAGE_CACHE_MODE_WT
 *
 * If new_type is NULL, function will return an error if it cannot reserve the
 * region with req_type. If new_type is non-NULL, function will return
 * available type in new_type in case of no error. In case of any error
 * it will return a negative return value.
 */
int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type,
		    enum page_cache_mode *new_type)
{ … }

int memtype_free(u64 start, u64 end)
{ … }


/**
 * lookup_memtype - Looks up the memory type for a physical address
 * @paddr: physical address of which memory type needs to be looked up
 *
 * Only to be called when PAT is enabled
 *
 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 * or _PAGE_CACHE_MODE_WT.
 */
static enum page_cache_mode lookup_memtype(u64 paddr)
{ … }

/**
 * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
 * of @pfn cannot be overridden by UC MTRR memory type.
 *
 * Only to be called when PAT is enabled.
 *
 * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC.
 * Returns false in other cases.
 */
bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
{ … }
EXPORT_SYMBOL_GPL(…);

/**
 * memtype_reserve_io - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: A pointer to memtype, with requested type. On success, requested
 * or any other compatible type that was available for the region is returned
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
int memtype_reserve_io(resource_size_t start, resource_size_t end,
			enum page_cache_mode *type)
{ … }

/**
 * memtype_free_io - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void memtype_free_io(resource_size_t start, resource_size_t end)
{ … }

#ifdef CONFIG_X86_PAT
int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{ … }
EXPORT_SYMBOL(…);

void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
{ … }
EXPORT_SYMBOL(…);
#endif

pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t vma_prot)
{ … }

#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{ … }
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	u64 from = ((u64)pfn) << PAGE_SHIFT;
	u64 to = from + size;
	u64 cursor = from;

	if (!pat_enabled())
		return 1;

	while (cursor < to) {
		if (!devmem_is_allowed(pfn))
			return 0;
		cursor += PAGE_SIZE;
		pfn++;
	}
	return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t *vma_prot)
{ … }

/*
 * Change the memory type for the physical address range in kernel identity
 * mapping space if that range is a part of identity map.
 */
int memtype_kernel_map_sync(u64 base, unsigned long size,
			    enum page_cache_mode pcm)
{ … }

/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserved non RAM regions only and after successful memtype_reserve,
 * this func also keeps identity mapping (if any) in sync with this new prot.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
				int strict_prot)
{ … }

/*
 * Internal interface to free a range of physical memory.
 * Frees non RAM regions only.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{ … }

static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
		resource_size_t *phys)
{ … }

static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
		pgprot_t *pgprot)
{ … }

/*
 * track_pfn_copy is called when vma that is covering the pfnmap gets
 * copied through copy_page_range().
 *
 * If the vma has a linear pfn mapping for the entire range, we get the prot
 * from pte and reserve the entire vma range with single reserve_pfn_range call.
 */
int track_pfn_copy(struct vm_area_struct *vma)
{ … }

/*
 * prot is passed in as a parameter for the new mapping. If the vma has
 * a linear pfn mapping for the entire range, or no vma is provided,
 * reserve the entire pfn + size range with single reserve_pfn_range
 * call.
 */
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
		    unsigned long pfn, unsigned long addr, unsigned long size)
{ … }

void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
{ … }

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 */
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
		 unsigned long size, bool mm_wr_locked)
{ … }

/*
 * untrack_pfn_clear is called if the following situation fits:
 *
 * 1) while mremapping a pfnmap for a new region,  with the old vma after
 * its pfnmap page table has been removed.  The new vma has a new pfnmap
 * to the same pfn & cache type with VM_PAT set.
 * 2) while duplicating vm area, the new vma fails to copy the pgtable from
 * old vma.
 */
void untrack_pfn_clear(struct vm_area_struct *vma)
{ … }

pgprot_t pgprot_writecombine(pgprot_t prot)
{ … }
EXPORT_SYMBOL_GPL(…);

pgprot_t pgprot_writethrough(pgprot_t prot)
{ … }
EXPORT_SYMBOL_GPL(…);

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

/*
 * We are allocating a temporary printout-entry to be passed
 * between seq_start()/next() and seq_show():
 */
static struct memtype *memtype_get_idx(loff_t pos)
{ … }

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{ … }

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{ … }

static void memtype_seq_stop(struct seq_file *seq, void *v)
{ … }

static int memtype_seq_show(struct seq_file *seq, void *v)
{ … }

static const struct seq_operations memtype_seq_ops = …;

static int memtype_seq_open(struct inode *inode, struct file *file)
{ … }

static const struct file_operations memtype_fops = …;

static int __init pat_memtype_list_init(void)
{ … }
late_initcall(pat_memtype_list_init);

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
linux/arch/x86/mm/pat/memtype.c