linux/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>
#include <linux/pm_runtime.h>
#include <linux/list_sort.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "nbio_v4_3.h"
#include "nbio_v7_9.h"
#include "atom.h"
#include "amdgpu_reset.h"
#include "amdgpu_psp.h"

#ifdef CONFIG_X86_MCE_AMD
#include <asm/mce.h>

static bool notifier_registered;
#endif
static const char *RAS_FS_NAME =;

const char *ras_error_string[] =;

const char *ras_block_string[] =;

const char *ras_mca_block_string[] =;

struct amdgpu_ras_block_list {};

const char *get_ras_block_str(struct ras_common_if *ras_block)
{}

#define ras_block_str(_BLOCK_)

#define ras_err_str(i)

#define RAS_DEFAULT_FLAGS

/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT

/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER

#define MAX_UMC_POISON_POLLING_TIME_ASYNC

#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL

#define MAX_FLUSH_RETIRE_DWORK_TIMES

enum amdgpu_ras_retire_page_reservation {};

atomic_t amdgpu_ras_in_intr =;

static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
				uint64_t addr);
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr);
#ifdef CONFIG_X86_MCE_AMD
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
struct mce_notifier_adev_list {};
static struct mce_notifier_adev_list mce_adev_list;
#endif

void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
{}

static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
{}

static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
{}

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{}

static const struct file_operations amdgpu_ras_debugfs_ops =;

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{}

static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev,
				struct ras_debug_if *data)
{}

/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * The control interface accepts struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members, they are block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, e.g., GFX, SDMA.
 * name: the name of IP.
 *
 * inject has three more members than head, they are address, value and mask.
 * As their names indicate, inject operation will write the
 * value to the address.
 *
 * The second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *
 * - 0: disable RAS on the block. Take ::head as its data.
 * - 1: enable RAS on the block. Take ::head as its data.
 * - 2: inject errors on the block. Take ::inject as its data.
 *
 * How to use the interface?
 *
 * In a program
 *
 * Copy the struct ras_debug_if in your code and initialize it.
 * Write the struct to the control interface.
 *
 * From shell
 *
 * .. code-block:: bash
 *
 *	echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *	echo "enable  <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *	echo "inject  <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *
 * Where N, is the card which you want to affect.
 *
 * "disable" requires only the block.
 * "enable" requires the block and error type.
 * "inject" requires the block, error type, address, and value.
 *
 * The block is one of: umc, sdma, gfx, etc.
 *	see ras_block_string[] for details
 *
 * The error type is one of: ue, ce and poison where,
 *	ue is multi-uncorrectable
 *	ce is single-correctable
 *	poison is poison
 *
 * The sub-block is the sub-block index; pass 0 if there is no sub-block.
 * The address and value are hexadecimal numbers, leading 0x is optional.
 * The mask means instance mask, is optional, default value is 0x1.
 *
 * For instance,
 *
 * .. code-block:: bash
 *
 *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result of the operation?
 *
 * To check disable/enable, see "ras" features at,
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * To check inject, see the corresponding error count at,
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
 *
 * .. note::
 *	Operations are only allowed on blocks which are supported.
 *	Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
 *	to see which blocks support RAS on a particular asic.
 *
 */
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
					     const char __user *buf,
					     size_t size, loff_t *pos)
{}

/**
 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
 *
 * Some boards contain an EEPROM which is used to persistently store a list of
 * bad pages which experienced ECC errors in VRAM.  This interface provides
 * a way to reset the EEPROM, e.g., after testing error injection.
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *	echo 1 > ../ras/ras_eeprom_reset
 *
 * will reset EEPROM table to 0 entries.
 *
 */
static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
					       const char __user *buf,
					       size_t size, loff_t *pos)
{}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops =;

static const struct file_operations amdgpu_ras_debugfs_eeprom_ops =;

/**
 * DOC: AMDGPU RAS sysfs Error Count Interface
 *
 * It allows the user to read the error count for each IP block on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * It outputs multiple lines which report the uncorrected (ue) and corrected
 * (ce) error counts.
 *
 * The format of one line is below,
 *
 * [ce|ue]: count
 *
 * Example:
 *
 * .. code-block:: bash
 *
 *	ue: 0
 *	ce: 1
 *
 */
static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{}

/* obj begin */

#define get_obj(obj)
#define alive_obj(obj)

static inline void put_obj(struct ras_manager *obj)
{}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{}

/* return an obj equal to head, or the first when head is NULL */
struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
					 struct ras_common_if *head)
{}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{}

/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{}

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{}
/* feature ctl end */

static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
		enum amdgpu_ras_block block)
{}

static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
					enum amdgpu_ras_block block, uint32_t sub_block_index)
{}

static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
{}

static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
					      struct ras_manager *ras_mgr,
					      struct ras_err_data *err_data,
					      struct ras_query_context *qctx,
					      const char *blk_name,
					      bool is_ue,
					      bool is_de)
{}

static inline bool err_data_has_source_info(struct ras_err_data *data)
{}

static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
					     struct ras_query_if *query_if,
					     struct ras_err_data *err_data,
					     struct ras_query_context *qctx)
{}

static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
{}

static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
{}

int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
			const struct aca_info *aca_info, void *data)
{}

int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
{}

static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
					 enum aca_error_type type, struct ras_err_data *err_data,
					 struct ras_query_context *qctx)
{}

ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
				  struct aca_handle *handle, char *buf, void *data)
{}

static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
						struct ras_query_if *info,
						struct ras_err_data *err_data,
						struct ras_query_context *qctx,
						unsigned int error_query_mode)
{}

/* query/inject/cure begin */
static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev,
						    struct ras_query_if *info,
						    enum ras_event_type type)
{}

int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
{}

int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
		enum amdgpu_ras_block block)
{}

int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
		enum amdgpu_ras_block block)
{}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{}

/**
 * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
 * @adev: pointer to AMD GPU device
 * @ce_count: pointer to an integer to be set to the count of correctable errors.
 * @ue_count: pointer to an integer to be set to the count of uncorrectable errors.
 * @query_info: pointer to ras_query_if
 *
 * Return 0 for query success or do nothing, otherwise return an error
 * on failures
 */
static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
					       unsigned long *ce_count,
					       unsigned long *ue_count,
					       struct ras_query_if *query_info)
{}

/**
 * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
 * @adev: pointer to AMD GPU device
 * @ce_count: pointer to an integer to be set to the count of correctable errors.
 * @ue_count: pointer to an integer to be set to the count of uncorrectable
 * errors.
 * @query_info: pointer to ras_query_if if the query request is only for a
 * specific ip block; if info is NULL, then the query request is for
 * all the ip blocks that support query ras error counters/status
 *
 * If set, @ce_count or @ue_count, count and return the corresponding
 * error counts in those integer pointers. Return 0 if the device
 * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
 */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
				 unsigned long *ce_count,
				 unsigned long *ue_count,
				 struct ras_query_if *query_info)
{}
/* query/inject/cure end */


/* sysfs begin */

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{}

/**
 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
 *
 * It allows user to read the bad pages of vram on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 *
 * It outputs multiple lines, and each line stands for one gpu page.
 *
 * The format of one line is below,
 * gpu pfn : gpu page size : flags
 *
 * gpu pfn and gpu page size are printed in hex format.
 * flags can be one of below character,
 *
 * R: reserved, this gpu page is reserved and cannot be used.
 *
 * P: pending for reserve, this gpu page is marked as bad, will be reserved
 * in next window of page_reserve.
 *
 * F: unable to reserve. This gpu page cannot be reserved for some reason.
 *
 * Examples:
 *
 * .. code-block:: bash
 *
 *	0x00000001 : 0x00001000 : R
 *	0x00000002 : 0x00001000 : P
 *
 */

static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{}

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{}

static ssize_t amdgpu_ras_sysfs_version_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{}

static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{}

static struct {} dump_event[] =;

static ssize_t amdgpu_ras_sysfs_event_state_show(struct device *dev,
						 struct device_attribute *attr, char *buf)
{}

static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
{}

static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev)
{}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_common_if *head)
{}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{}
/* sysfs end */

/**
 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
 *
 * Normally when there is an uncorrectable error, the driver will reset
 * the GPU to recover.  However, in the event of an unrecoverable error,
 * the driver provides an interface to reboot the system automatically
 * in that event.
 *
 * The following file in debugfs provides that interface:
 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *	echo true > .../ras/auto_reboot
 *
 */
/* debugfs begin */
static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{}

static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
				      struct ras_fs_if *head,
				      struct dentry *dir)
{}

static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev)
{}

void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
{}

/* debugfs end */

/* ras fs */
static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
		amdgpu_ras_sysfs_badpages_read, NULL, 0);
static DEVICE_ATTR(features, S_IRUGO,
		amdgpu_ras_sysfs_features_read, NULL);
static DEVICE_ATTR(version, 0444,
		amdgpu_ras_sysfs_version_show, NULL);
static DEVICE_ATTR(schema, 0444,
		amdgpu_ras_sysfs_schema_show, NULL);
static DEVICE_ATTR(event_state, 0444,
		   amdgpu_ras_sysfs_event_state_show, NULL);
static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{}
/* ras fs end */

/* ih begin */

/* For hardware that cannot enable the bif ring for both ras_controller_irq
 * and ras_err_event_athub_irq ih cookies, the driver has to poll the status
 * register to check whether the interrupt is triggered or not, and properly
 * ack the interrupt if it is there
 */
void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
{}

static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
				struct amdgpu_iv_entry *entry)
{}

static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
				struct amdgpu_iv_entry *entry)
{}

static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
				struct amdgpu_iv_entry *entry)
{}

static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_common_if *head)
{}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_common_if *head)
{}

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{}
/* ih end */

/* traversal all IPs except NBIO to query error counter */
static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev, enum ras_event_type type)
{}

/* Parse RdRspStatus and WrRspStatus */
static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
					  struct ras_query_if *info)
{}

static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
{}

/* recovery begin */

/* return 0 on success.
 * caller need free bps.
 */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
{}

static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
				   struct amdgpu_hive_info *hive, bool status)
{}

bool amdgpu_ras_in_recovery(struct amdgpu_device *adev)
{}

static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device *adev)
{}

static void amdgpu_ras_do_recovery(struct work_struct *work)
{}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{}

/* it deal with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		struct eeprom_table_record *bps, int pages)
{}

/*
 * write error record array to eeprom, the function should be
 * protected by recovery_lock
 * new_cnt: new added UE count, excluding reserved bad pages, can be NULL
 */
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
		unsigned long *new_cnt)
{}

/*
 * read error record array in eeprom and reserve enough space for
 * storing new bad pages
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{}

static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
				uint64_t addr)
{}

/*
 * check if an address belongs to bad page
 *
 * Note: this check is only for umc block
 */
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr)
{}

static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
					  uint32_t max_count)
{}

int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
		enum amdgpu_ras_block block, uint16_t pasid,
		pasid_notify pasid_fn, void *data, uint32_t reset)
{}

static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev,
		struct ras_poison_msg *poison_msg)
{}

static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
{}

static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
{}

static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
				uint32_t delayed_ms)
{}

static void amdgpu_ras_do_page_retirement(struct work_struct *work)
{}

static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
				uint32_t poison_creation_count)
{}

static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev)
{}

static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
			uint32_t msg_count, uint32_t *gpu_reset)
{}

static int amdgpu_ras_page_retirement_thread(void *param)
{}

int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{}
/* recovery end */

static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
{}

/*
 * this is workaround for vega20 workstation sku,
 * force enable gfx ras, ignore vbios gfx ras flag
 * due to GC EDC can not write
 */
static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
{}

/* Query ras capablity via atomfirmware interface */
static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev)
{}

/* Query poison mode from umc/df IP callbacks */
static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
{}

/*
 * check hardware's ras ability which will be saved in hw_supported.
 * if hardware does not support ras, we can skip some ras initialization and
 * forbid some ras operations from IP.
 * if software itself, say a boot parameter, limits the ras ability, we still
 * need to allow IP to do some limited operations, like disable. In such case,
 * we have to initialize ras as normal. but need check if operation is
 * allowed or not in each function.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
{}

static void amdgpu_ras_counte_dw(struct work_struct *work)
{}

static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
{}

static void ras_event_mgr_init(struct ras_event_manager *mgr)
{}

static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
{}

static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
{}

int amdgpu_ras_init(struct amdgpu_device *adev)
{}

int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
{}

static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
					struct ras_common_if *ras_block)
{}

bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
{}

/* helper function to handle common stuff in ip late init phase */
int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block)
{}

static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block)
{}

/* helper function to remove ras fs node and interrupt handler */
void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block)
{}

static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block)
{}

/* do some init work after IP late init as dependence.
 * and it runs in resume/gpu reset/booting up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{}

int amdgpu_ras_late_init(struct amdgpu_device *adev)
{}

/* do some fini work before IP fini as dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{}

bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev)
{}

void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
{}

static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev)
{}

int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
				     const void *caller)
{}

u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type)
{}

void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{}

bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
{}

void amdgpu_release_ras_context(struct amdgpu_device *adev)
{}

#ifdef CONFIG_X86_MCE_AMD
static struct amdgpu_device *find_adev(uint32_t node_id)
{}

#define GET_MCA_IPID_GPUID(m)
#define GET_UMC_INST(m)
#define GET_CHAN_INDEX(m)
#define GPU_ID_OFFSET

static int amdgpu_bad_page_notifier(struct notifier_block *nb,
				    unsigned long val, void *data)
{}

static struct notifier_block amdgpu_bad_page_nb =;

static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
{}
#endif

struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
{}

int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
{}

/* check if ras is supported on block, say, sdma, gfx */
int amdgpu_ras_is_supported(struct amdgpu_device *adev,
		unsigned int block)
{}

int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
{}

int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
{}

int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable)
{}

bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev)
{}

bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
				     unsigned int *error_query_mode)
{}

/* Register each ip ras block into amdgpu ras */
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
		struct amdgpu_ras_block_object *ras_block_obj)
{}

void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name)
{}

bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev,
					 const struct amdgpu_ras_err_status_reg_entry *reg_entry,
					 uint32_t instance,
					 uint32_t *memory_id)
{}

bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
				       const struct amdgpu_ras_err_status_reg_entry *reg_entry,
				       uint32_t instance,
				       unsigned long *err_cnt)
{}

void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
					   uint32_t reg_list_size,
					   const struct amdgpu_ras_memory_id_entry *mem_list,
					   uint32_t mem_list_size,
					   uint32_t instance,
					   uint32_t err_type,
					   unsigned long *err_count)
{}

void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
					   uint32_t reg_list_size,
					   uint32_t instance)
{}

int amdgpu_ras_error_data_init(struct ras_err_data *err_data)
{}

static void amdgpu_ras_error_node_release(struct ras_err_node *err_node)
{}

void amdgpu_ras_error_data_fini(struct ras_err_data *err_data)
{}

static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data,
							     struct amdgpu_smuio_mcm_config_info *mcm_info)
{}

static struct ras_err_node *amdgpu_ras_error_node_new(void)
{}

static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct list_head *b)
{}

static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
				struct amdgpu_smuio_mcm_config_info *mcm_info)
{}

void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr)
{}

void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr)
{}

int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
		struct amdgpu_smuio_mcm_config_info *mcm_info,
		struct ras_err_addr *err_addr, u64 count)
{}

int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
		struct amdgpu_smuio_mcm_config_info *mcm_info,
		struct ras_err_addr *err_addr, u64 count)
{}

int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
		struct amdgpu_smuio_mcm_config_info *mcm_info,
		struct ras_err_addr *err_addr, u64 count)
{}

#define mmMP0_SMN_C2PMSG_92
#define mmMP0_SMN_C2PMSG_126
static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
						 u32 instance)
{}

static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
					   u32 instance)
{}

void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
{}

int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
{}

void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
				const char *fmt, ...)
{}