// SPDX-License-Identifier: GPL-2.0-or-later /* * FRU (Field-Replaceable Unit) Memory Poison Manager * * Copyright (c) 2024, Advanced Micro Devices, Inc. * All Rights Reserved. * * Authors: * Naveen Krishna Chatradhi <[email protected]> * Muralidhara M K <[email protected]> * Yazen Ghannam <[email protected]> * * Implementation notes, assumptions, and limitations: * * - FRU memory poison section and memory poison descriptor definitions are not yet * included in the UEFI specification. So they are defined here. Afterwards, they * may be moved to linux/cper.h, if appropriate. * * - Platforms based on AMD MI300 systems will be the first to use these structures. * There are a number of assumptions made here that will need to be generalized * to support other platforms. * * AMD MI300-based platform(s) assumptions: * - Memory errors are reported through x86 MCA. * - The entire DRAM row containing a memory error should be retired. * - There will be (1) FRU memory poison section per CPER. * - The FRU will be the CPU package (processor socket). * - The default number of memory poison descriptor entries should be (8). * - The platform will use ACPI ERST for persistent storage. * - All FRU records should be saved to persistent storage. Module init will * fail if any FRU record is not successfully written. * * - Boot time memory retirement may occur later than ideal due to dependencies * on other libraries and drivers. This leaves a gap where bad memory may be * accessed during early boot stages. * * - Enough memory should be pre-allocated for each FRU record to be able to hold * the expected number of descriptor entries. This, mostly empty, record is * written to storage during init time. Subsequent writes to the same record * should allow the Platform to update the stored record in-place. Otherwise, * if the record is extended, then the Platform may need to perform costly memory * management operations on the storage. 
For example, the Platform may spend time * in Firmware copying and invalidating memory on a relatively slow SPI ROM. */ #define pr_fmt(fmt) … #include <linux/cper.h> #include <linux/ras.h> #include <linux/cpu.h> #include <acpi/apei.h> #include <asm/cpu_device_id.h> #include <asm/mce.h> #include "../debugfs.h" #include "atl/internal.h" #define INVALID_CPU … /* Validation Bits */ #define FMP_VALID_ARCH_TYPE … #define FMP_VALID_ARCH … #define FMP_VALID_ID_TYPE … #define FMP_VALID_ID … #define FMP_VALID_LIST_ENTRIES … #define FMP_VALID_LIST … /* FRU Architecture Types */ #define FMP_ARCH_TYPE_X86_CPUID_1_EAX … /* FRU ID Types */ #define FMP_ID_TYPE_X86_PPIN … /* FRU Memory Poison Section */ struct cper_sec_fru_mem_poison { … } __packed; /* FRU Descriptor ID Types */ #define FPD_HW_ID_TYPE_MCA_IPID … /* FRU Descriptor Address Types */ #define FPD_ADDR_TYPE_MCA_ADDR … /* Memory Poison Descriptor */ struct cper_fru_poison_desc { … } __packed; /* Collection of headers and sections for easy pointer use. */ struct fru_rec { … } __packed; /* * Pointers to the complete CPER record of each FRU. * * Memory allocation will include padded space for descriptor entries. */ static struct fru_rec **fru_records; /* system physical addresses array */ static u64 *spa_entries; static struct dentry *fmpm_dfs_dir; static struct dentry *fmpm_dfs_entries; #define CPER_CREATOR_FMP … #define CPER_SECTION_TYPE_FMP … /** * DOC: max_nr_entries (byte) * Maximum number of descriptor entries possible for each FRU. * * Values between '1' and '255' are valid. * No input or '0' will default to FMPM_DEFAULT_MAX_NR_ENTRIES. */ static u8 max_nr_entries; module_param(max_nr_entries, byte, 0644); MODULE_PARM_DESC(…) …; #define FMPM_DEFAULT_MAX_NR_ENTRIES … /* Maximum number of FRUs in the system. */ #define FMPM_MAX_NR_FRU … static unsigned int max_nr_fru; /* Total length of record including headers and list of descriptor entries. 
*/ static size_t max_rec_len; #define FMPM_MAX_REC_LEN … /* Total number of SPA entries across all FRUs. */ static unsigned int spa_nr_entries; /* * Protect the local records cache in fru_records and prevent concurrent * writes to storage. This is only needed after init once notifier block * registration is done. * * The majority of a record is fixed at module init and will not change * during run time. The entries within a record will be updated as new * errors are reported. The mutex should be held whenever the entries are * accessed during run time. */ static DEFINE_MUTEX(fmpm_update_mutex); #define for_each_fru(i, rec) … static inline u32 get_fmp_len(struct fru_rec *rec) { … } static struct fru_rec *get_fru_record(u64 fru_id) { … } /* * Sum up all bytes within the FRU Memory Poison Section including the Memory * Poison Descriptor entries. * * Don't include the old checksum here. It's a u32 value, so summing each of its * bytes will give the wrong total. */ static u32 do_fmp_checksum(struct cper_sec_fru_mem_poison *fmp, u32 len) { … } static int update_record_on_storage(struct fru_rec *rec) { … } static bool rec_has_valid_entries(struct fru_rec *rec) { … } static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new) { … } static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd) { … } static void save_spa(struct fru_rec *rec, unsigned int entry, u64 addr, u64 id, unsigned int cpu) { … } static void update_fru_record(struct fru_rec *rec, struct mce *m) { … } static void retire_dram_row(u64 addr, u64 id, u32 cpu) { … } static int fru_handle_mem_poison(struct notifier_block *nb, unsigned long val, void *data) { … } static struct notifier_block fru_mem_poison_nb = …; static void retire_mem_fmp(struct fru_rec *rec) { … } static void retire_mem_records(void) { … } /* Set the CPER Record Header and CPER Section Descriptor fields. 
*/ static void set_rec_fields(struct fru_rec *rec) { … } static int save_new_records(void) { … } /* Check that the record matches expected types for the current system. */ static bool fmp_is_usable(struct fru_rec *rec) { … } static bool fmp_is_valid(struct fru_rec *rec) { … } static struct fru_rec *get_valid_record(struct fru_rec *old) { … } /* * Fetch saved records from persistent storage. * * For each found record: * - If it was not created by this module, then ignore it. * - If it is valid, then copy its data to the local cache. * - If it is not valid, then erase it. */ static int get_saved_records(void) { … } static void set_fmp_fields(struct fru_rec *rec, unsigned int cpu) { … } static int init_fmps(void) { … } static int get_system_info(void) { … } static void free_records(void) { … } static int allocate_records(void) { … } static void *fmpm_start(struct seq_file *f, loff_t *pos) { … } static void *fmpm_next(struct seq_file *f, void *data, loff_t *pos) { … } static void fmpm_stop(struct seq_file *f, void *data) { … } #define SHORT_WIDTH … #define U64_WIDTH … #define TIMESTAMP_WIDTH … #define LONG_WIDTH … #define U64_PAD … #define TS_PAD … static int fmpm_show(struct seq_file *f, void *data) { … } static const struct seq_operations fmpm_seq_ops = …; static int fmpm_open(struct inode *inode, struct file *file) { … } static const struct file_operations fmpm_fops = …; static void setup_debugfs(void) { … } static const struct x86_cpu_id fmpm_cpuids[] = …; MODULE_DEVICE_TABLE(x86cpu, fmpm_cpuids); static int __init fru_mem_poison_init(void) { … } static void __exit fru_mem_poison_exit(void) { … } module_init(…) …; module_exit(fru_mem_poison_exit); MODULE_LICENSE(…) …; MODULE_DESCRIPTION(…) …;