/* * Copyright 2019 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * */ #include "amdgpu_ras_eeprom.h" #include "amdgpu.h" #include "amdgpu_ras.h" #include <linux/bits.h> #include "atom.h" #include "amdgpu_eeprom.h" #include "amdgpu_atomfirmware.h" #include <linux/debugfs.h> #include <linux/uaccess.h> #include "amdgpu_reset.h" /* These are memory addresses as would be seen by one or more EEPROM * chips strung on the I2C bus, usually by manipulating pins 1-3 of a * set of EEPROM devices. They form a continuous memory space. * * The I2C device address includes the device type identifier, 1010b, * which is a reserved value and indicates that this is an I2C EEPROM * device. It also includes the top 3 bits of the 19 bit EEPROM memory * address, namely bits 18, 17, and 16. This makes up the 7 bit * address sent on the I2C bus with bit 0 being the direction bit, * which is not represented here, and sent by the hardware directly. 
* * For instance, * 50h = 1010000b => device type identifier 1010b, bits 18:16 = 000b, address 0. * 54h = 1010100b => --"--, bits 18:16 = 100b, address 40000h. * 56h = 1010110b => --"--, bits 18:16 = 110b, address 60000h. * Depending on the size of the I2C EEPROM device(s), bits 18:16 may * address memory in a device or a device on the I2C bus, depending on * the status of pins 1-3. See top of amdgpu_eeprom.c. * * The RAS table lives either at address 0 or address 40000h of EEPROM. */ #define EEPROM_I2C_MADDR_0 … #define EEPROM_I2C_MADDR_4 … /* * The 2 macros below represent the actual size in bytes that * those entities occupy in the EEPROM memory. * RAS_TABLE_RECORD_SIZE is different from sizeof(eeprom_table_record) which * uses uint64 to store 6-byte fields such as retired_page. */ #define RAS_TABLE_HEADER_SIZE … #define RAS_TABLE_RECORD_SIZE … /* Table hdr is 'AMDR' */ #define RAS_TABLE_HDR_VAL … /* Bad GPU tag 'BADG' */ #define RAS_TABLE_HDR_BAD … /* * EEPROM Table structure v1 * --------------------------------- * | | * | EEPROM TABLE HEADER | * | ( size 20 Bytes ) | * | | * --------------------------------- * | | * | BAD PAGE RECORD AREA | * | | * --------------------------------- */ /* Assume 2-Mbit size EEPROM and take up the whole space. 
*/ #define RAS_TBL_SIZE_BYTES … #define RAS_TABLE_START … #define RAS_HDR_START … #define RAS_RECORD_START … #define RAS_MAX_RECORD_COUNT … /* * EEPROM Table structure v2.1 * --------------------------------- * | | * | EEPROM TABLE HEADER | * | ( size 20 Bytes ) | * | | * --------------------------------- * | | * | EEPROM TABLE RAS INFO | * | (available info size 4 Bytes) | * | ( reserved size 252 Bytes ) | * | | * --------------------------------- * | | * | BAD PAGE RECORD AREA | * | | * --------------------------------- */ /* EEPROM Table V2_1 */ #define RAS_TABLE_V2_1_INFO_SIZE … #define RAS_TABLE_V2_1_INFO_START … #define RAS_RECORD_START_V2_1 … #define RAS_MAX_RECORD_COUNT_V2_1 … /* Given a zero-based index of an EEPROM RAS record, yields the EEPROM * offset off of RAS_TABLE_START. That is, this is something you can * add to control->i2c_address, and then tell I2C layer to read * from/write to there. _N is the so-called absolute index, * because it starts right after the table header. */ #define RAS_INDEX_TO_OFFSET(_C, _N) … #define RAS_OFFSET_TO_INDEX(_C, _O) … /* Given a 0-based relative record index, 0, 1, 2, ..., etc., off * of "fri", return the absolute record index off of the end of * the table header. 
*/ #define RAS_RI_TO_AI(_C, _I) … #define RAS_NUM_RECS(_tbl_hdr) … #define RAS_NUM_RECS_V2_1(_tbl_hdr) … #define to_amdgpu_device(x) … static bool __is_ras_eeprom_supported(struct amdgpu_device *adev) { … } static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev, struct amdgpu_ras_eeprom_control *control) { … } static void __encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header *hdr, unsigned char *buf) { … } static void __decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header *hdr, unsigned char *buf) { … } static int __write_table_header(struct amdgpu_ras_eeprom_control *control) { … } static void __encode_table_ras_info_to_buf(struct amdgpu_ras_eeprom_table_ras_info *rai, unsigned char *buf) { … } static void __decode_table_ras_info_from_buf(struct amdgpu_ras_eeprom_table_ras_info *rai, unsigned char *buf) { … } static int __write_table_ras_info(struct amdgpu_ras_eeprom_control *control) { … } static u8 __calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control *control) { … } static u8 __calc_ras_info_byte_sum(const struct amdgpu_ras_eeprom_control *control) { … } static int amdgpu_ras_eeprom_correct_header_tag( struct amdgpu_ras_eeprom_control *control, uint32_t header) { … } static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control) { … } /** * amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table * @control: pointer to control structure * * Reset the contents of the header of the RAS EEPROM table. * Return 0 on success, -errno on error. 
*/ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) { … } static void __encode_table_record_to_buf(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *record, unsigned char *buf) { … } static void __decode_table_record_from_buf(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *record, unsigned char *buf) { … } bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) { … } /** * __amdgpu_ras_eeprom_write -- write indexed from buffer to EEPROM * @control: pointer to control structure * @buf: pointer to buffer containing data to write * @fri: start writing at this index * @num: number of records to write * * The caller must hold the table mutex in @control. * Return 0 on success, -errno otherwise. */ static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, u8 *buf, const u32 fri, const u32 num) { … } static int amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *record, const u32 num) { … } static int amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) { … } /** * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table * @control: pointer to control structure * @record: array of records to append * @num: number of records in @record array * * Append @num records to the table, calculate the checksum and write * the table back to EEPROM. The maximum number of records that * can be appended is between 1 and control->ras_max_record_count, * regardless of how many records are already stored in the table. * * Return 0 on success or if EEPROM is not supported, -errno on error. 
*/ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *record, const u32 num) { … } /** * __amdgpu_ras_eeprom_read -- read indexed from EEPROM into buffer * @control: pointer to control structure * @buf: pointer to buffer to read into * @fri: first record index, start reading at this index, absolute index * @num: number of records to read * * The caller must hold the table mutex in @control. * Return 0 on success, -errno otherwise. */ static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, u8 *buf, const u32 fri, const u32 num) { … } /** * amdgpu_ras_eeprom_read -- read EEPROM * @control: pointer to control structure * @record: array of records to read into * @num: number of records in @record * * Reads num records from the RAS table in EEPROM and * writes the data into @record array. * * Returns 0 on success, -errno on error. */ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *record, const u32 num) { … } uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control) { … } static ssize_t amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { … } const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = …; static const char *tbl_hdr_str = …; static const char *tbl_hdr_fmt = …; #define tbl_hdr_fmt_size … static const char *rec_hdr_str = …; static const char *rec_hdr_fmt = …; #define rec_hdr_fmt_size … static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = …; static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control) { … } void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control) { … } static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { … } static ssize_t amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { … } 
const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = …; /** * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum * @control: pointer to control structure * * Check the checksum of the RAS table stored in EEPROM. * * Return 0 if the checksum is correct, * positive if it is not correct, and * -errno on I/O error. */ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control) { … } static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control) { … } int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) { … }