linux/drivers/ras/amd/fmpm.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * FRU (Field-Replaceable Unit) Memory Poison Manager
 *
 * Copyright (c) 2024, Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Authors:
 *	Naveen Krishna Chatradhi <[email protected]>
 *	Muralidhara M K <[email protected]>
 *	Yazen Ghannam <[email protected]>
 *
 * Implementation notes, assumptions, and limitations:
 *
 * - FRU memory poison section and memory poison descriptor definitions are not yet
 *   included in the UEFI specification, so they are defined here. Once they are
 *   part of the specification, they may be moved to linux/cper.h, if appropriate.
 *
 * - Platforms based on AMD MI300 systems will be the first to use these structures.
 *   There are a number of assumptions made here that will need to be generalized
 *   to support other platforms.
 *
 *   AMD MI300-based platform(s) assumptions:
 *   - Memory errors are reported through x86 MCA.
 *   - The entire DRAM row containing a memory error should be retired.
 *   - There will be (1) FRU memory poison section per CPER.
 *   - The FRU will be the CPU package (processor socket).
 *   - The default number of memory poison descriptor entries should be (8).
 *   - The platform will use ACPI ERST for persistent storage.
 *   - All FRU records should be saved to persistent storage. Module init will
 *     fail if any FRU record is not successfully written.
 *
 * - Boot time memory retirement may occur later than ideal due to dependencies
 *   on other libraries and drivers. This leaves a gap where bad memory may be
 *   accessed during early boot stages.
 *
 * - Enough memory should be pre-allocated for each FRU record to be able to hold
 *   the expected number of descriptor entries. This mostly empty record is
 *   written to storage at init time. Subsequent writes to the same record
 *   should allow the Platform to update the stored record in-place. Otherwise,
 *   if the record is extended, then the Platform may need to perform costly memory
 *   management operations on the storage. For example, the Platform may spend time
 *   in Firmware copying and invalidating memory on a relatively slow SPI ROM.
 */

#define pr_fmt(fmt)

#include <linux/cper.h>
#include <linux/ras.h>
#include <linux/cpu.h>

#include <acpi/apei.h>

#include <asm/cpu_device_id.h>
#include <asm/mce.h>

#include "../debugfs.h"

#include "atl/internal.h"

#define INVALID_CPU

/* Validation Bits */
#define FMP_VALID_ARCH_TYPE
#define FMP_VALID_ARCH
#define FMP_VALID_ID_TYPE
#define FMP_VALID_ID
#define FMP_VALID_LIST_ENTRIES
#define FMP_VALID_LIST

/* FRU Architecture Types */
#define FMP_ARCH_TYPE_X86_CPUID_1_EAX

/* FRU ID Types */
#define FMP_ID_TYPE_X86_PPIN

/* FRU Memory Poison Section */
struct cper_sec_fru_mem_poison {} __packed;

/* FRU Descriptor ID Types */
#define FPD_HW_ID_TYPE_MCA_IPID

/* FRU Descriptor Address Types */
#define FPD_ADDR_TYPE_MCA_ADDR

/* Memory Poison Descriptor */
struct cper_fru_poison_desc {} __packed;

/* Collection of headers and sections for easy pointer use. */
struct fru_rec {} __packed;

/*
 * Pointers to the complete CPER record of each FRU.
 *
 * Memory allocation will include padded space for descriptor entries.
 */
static struct fru_rec **fru_records;

/* Array of system physical addresses (SPA). */
static u64 *spa_entries;

static struct dentry *fmpm_dfs_dir;
static struct dentry *fmpm_dfs_entries;

#define CPER_CREATOR_FMP

#define CPER_SECTION_TYPE_FMP

/**
 * DOC: max_nr_entries (byte)
 * Maximum number of descriptor entries possible for each FRU.
 *
 * Values between '1' and '255' are valid.
 * No input or '0' will default to FMPM_DEFAULT_MAX_NR_ENTRIES.
 */
static u8 max_nr_entries;
module_param(max_nr_entries, byte, 0644);
MODULE_PARM_DESC();

#define FMPM_DEFAULT_MAX_NR_ENTRIES

/* Maximum number of FRUs in the system. */
#define FMPM_MAX_NR_FRU
static unsigned int max_nr_fru;

/* Total length of record including headers and list of descriptor entries. */
static size_t max_rec_len;

#define FMPM_MAX_REC_LEN

/* Total number of SPA entries across all FRUs. */
static unsigned int spa_nr_entries;

/*
 * Protect the local records cache in fru_records and prevent concurrent
 * writes to storage. This is only needed after init once notifier block
 * registration is done.
 *
 * The majority of a record is fixed at module init and will not change
 * during run time. The entries within a record will be updated as new
 * errors are reported. The mutex should be held whenever the entries are
 * accessed during run time.
 */
static DEFINE_MUTEX(fmpm_update_mutex);

#define for_each_fru(i, rec)

static inline u32 get_fmp_len(struct fru_rec *rec)
{}

static struct fru_rec *get_fru_record(u64 fru_id)
{}

/*
 * Sum up all bytes within the FRU Memory Poison Section including the Memory
 * Poison Descriptor entries.
 *
 * Don't include the old checksum here. It's a u32 value, so summing each of its
 * bytes will give the wrong total.
 */
static u32 do_fmp_checksum(struct cper_sec_fru_mem_poison *fmp, u32 len)
{}

static int update_record_on_storage(struct fru_rec *rec)
{}

static bool rec_has_valid_entries(struct fru_rec *rec)
{}

static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new)
{}

static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd)
{}

static void save_spa(struct fru_rec *rec, unsigned int entry,
		     u64 addr, u64 id, unsigned int cpu)
{}

static void update_fru_record(struct fru_rec *rec, struct mce *m)
{}

static void retire_dram_row(u64 addr, u64 id, u32 cpu)
{}

static int fru_handle_mem_poison(struct notifier_block *nb, unsigned long val, void *data)
{}

static struct notifier_block fru_mem_poison_nb =;

static void retire_mem_fmp(struct fru_rec *rec)
{}

static void retire_mem_records(void)
{}

/* Set the CPER Record Header and CPER Section Descriptor fields. */
static void set_rec_fields(struct fru_rec *rec)
{}

static int save_new_records(void)
{}

/* Check that the record matches expected types for the current system. */
static bool fmp_is_usable(struct fru_rec *rec)
{}

static bool fmp_is_valid(struct fru_rec *rec)
{}

static struct fru_rec *get_valid_record(struct fru_rec *old)
{}

/*
 * Fetch saved records from persistent storage.
 *
 * For each found record:
 * - If it was not created by this module, then ignore it.
 * - If it is valid, then copy its data to the local cache.
 * - If it is not valid, then erase it.
 */
static int get_saved_records(void)
{}

static void set_fmp_fields(struct fru_rec *rec, unsigned int cpu)
{}

static int init_fmps(void)
{}

static int get_system_info(void)
{}

static void free_records(void)
{}

static int allocate_records(void)
{}

static void *fmpm_start(struct seq_file *f, loff_t *pos)
{}

static void *fmpm_next(struct seq_file *f, void *data, loff_t *pos)
{}

static void fmpm_stop(struct seq_file *f, void *data)
{}

#define SHORT_WIDTH
#define U64_WIDTH
#define TIMESTAMP_WIDTH
#define LONG_WIDTH
#define U64_PAD
#define TS_PAD
static int fmpm_show(struct seq_file *f, void *data)
{}

static const struct seq_operations fmpm_seq_ops =;

static int fmpm_open(struct inode *inode, struct file *file)
{}

static const struct file_operations fmpm_fops =;

static void setup_debugfs(void)
{}

static const struct x86_cpu_id fmpm_cpuids[] =;
MODULE_DEVICE_TABLE(x86cpu, fmpm_cpuids);

static int __init fru_mem_poison_init(void)
{}

static void __exit fru_mem_poison_exit(void)
{}

module_init();
module_exit(fru_mem_poison_exit);

MODULE_LICENSE();
MODULE_DESCRIPTION();