/* linux/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c */

/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "soc15.h"
#include "df/df_3_6_offset.h"
#include "xgmi/xgmi_4_0_0_smn.h"
#include "xgmi/xgmi_4_0_0_sh_mask.h"
#include "xgmi/xgmi_6_1_0_sh_mask.h"
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"

#include "amdgpu_reset.h"

/*
 * NOTE(review): the values of these SMN register/mask macros are elided in
 * this stubbed view; the real definitions carry PCS error status/mask
 * register offsets for the XGMI3x16 and GOPX1 links — confirm against the
 * full source before relying on them.
 */
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS
#define smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK
#define smnPCS_GOPX1_PCS_ERROR_STATUS
#define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK

/* Serializes hive lookup/creation across devices probing concurrently. */
static DEFINE_MUTEX(xgmi_mutex);

/* Maximum devices per hive; value elided in this stubbed view. */
#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE

/* Global list of all known XGMI hives; presumably guarded by xgmi_mutex
 * above — verify against the full amdgpu_get_xgmi_hive() body. */
static LIST_HEAD(xgmi_hive_list);

/*
 * Per-ASIC SMN register tables used to query and clear PCS (physical
 * coding sublayer) error status on XGMI and WAFL links.  All initializers
 * are elided in this stubbed view ("=;"), so only the table roles are
 * documented here.
 */

/* Vega 20: XGMI PCS error-status registers. */
static const int xgmi_pcs_err_status_reg_vg20[] =;

/* Vega 20: WAFL PCS error-status registers. */
static const int wafl_pcs_err_status_reg_vg20[] =;

/* Arcturus: XGMI PCS error-status registers. */
static const int xgmi_pcs_err_status_reg_arct[] =;

/* Arcturus WAFL registers: same as vg20. */
static const int wafl_pcs_err_status_reg_arct[] =;

/* Aldebaran: XGMI3x16 PCS error-status and noncorrectable-mask registers. */
static const int xgmi3x16_pcs_err_status_reg_aldebaran[] =;

static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[] =;

/* NOTE(review): "walf" below looks like a typo for "wafl" but matches the
 * established identifier naming; kept as-is for consistency with callers. */
static const int walf_pcs_err_status_reg_aldebaran[] =;

static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] =;

/* XGMI v6.4: PCS error-status and noncorrectable-mask registers. */
static const int xgmi3x16_pcs_err_status_reg_v6_4[] =;

static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] =;

/* XGMI v6.4: MCA bank base addresses used by the ACA/RAS paths below. */
static const u64 xgmi_v6_4_0_mca_base_array[] =;

/* Human-readable names for the 32 possible extended RAS error codes. */
static const char *xgmi_v6_4_0_ras_error_code_ext[32] =;

/* Field-decode tables mapping PCS status bits to RAS error descriptions. */
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] =;

static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] =;

static const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] =;

/**
 * DOC: AMDGPU XGMI Support
 *
 * XGMI is a high speed interconnect that joins multiple GPU cards
 * into a homogeneous memory space that is organized by a collective
 * hive ID and individual node IDs, both of which are 64-bit numbers.
 *
 * The file xgmi_device_id contains the unique per GPU device ID and
 * is stored in the /sys/class/drm/card${cardno}/device/ directory.
 *
 * Inside the device directory a sub-directory 'xgmi_hive_info' is
 * created which contains the hive ID and the list of nodes.
 *
 * The hive ID is stored in:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
 *
 * The node information is stored in numbered directories:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
 *
 * Each device has its own xgmi_hive_info directory with a mirror
 * set of node sub-directories.
 *
 * The XGMI memory space is built by contiguously adding the power of
 * two padded VRAM space from each node to each other.
 *
 */

/* sysfs attribute exposing the hive ID (initializer elided in this view). */
static struct attribute amdgpu_xgmi_hive_id =;

/* NULL-terminated attribute list for the hive kobject. */
static struct attribute *amdgpu_xgmi_hive_attrs[] =;
/* NOTE(review): argument elided here; upstream passes the attrs list name. */
ATTRIBUTE_GROUPS();

/* sysfs ->show() dispatcher for hive attributes (body elided): formats the
 * value for @attr into @buf and returns the byte count or negative errno. */
static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj,
	struct attribute *attr, char *buf)
{}

/* kobject release callback: frees the hive when its refcount drops to zero
 * (body elided in this stubbed view). */
static void amdgpu_xgmi_hive_release(struct kobject *kobj)
{}

/* sysfs ops wiring amdgpu_xgmi_show_attrs as the ->show() handler. */
static const struct sysfs_ops amdgpu_xgmi_hive_ops =;

/* kobj_type tying together release, sysfs ops and default attr groups. */
static const struct kobj_type amdgpu_xgmi_hive_type =;

/* sysfs show callback for the per-device xgmi_device_id attribute
 * (registered via DEVICE_ATTR below); body elided. */
static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{}

/* sysfs show callback for the xgmi_physical_id attribute; body elided. */
static ssize_t amdgpu_xgmi_show_physical_id(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{}

/* sysfs show callback for xgmi_num_hops: hop count to each peer; see the
 * num_hops bit-layout comment above amdgpu_xgmi_get_hops_count(). */
static ssize_t amdgpu_xgmi_show_num_hops(struct device *dev,
					struct device_attribute *attr,
					char *buf)
{}

/* sysfs show callback for xgmi_num_links; body elided. */
static ssize_t amdgpu_xgmi_show_num_links(struct device *dev,
					struct device_attribute *attr,
					char *buf)
{}

/* sysfs show callback for xgmi_port_num: connected port numbers per link;
 * body elided. */
static ssize_t amdgpu_xgmi_show_connected_port_num(struct device *dev,
					struct device_attribute *attr,
					char *buf)
{}

/* Helper macro building a DF FICAA (indirect access) value; expansion
 * elided in this stubbed view. */
#define AMDGPU_XGMI_SET_FICAA(o)
/* sysfs show callback for xgmi_error: reads and reports the accumulated
 * XGMI error count via DF indirect access; body elided. */
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{}


/*
 * Read-only per-device sysfs attributes exposing XGMI identity, topology
 * and error state, backed by the show callbacks above.  Use octal 0444
 * uniformly (checkpatch prefers octal over S_IRUGO; the two are
 * identical in value) instead of the previous S_IRUGO/0444 mix.
 */
static DEVICE_ATTR(xgmi_device_id, 0444, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_physical_id, 0444, amdgpu_xgmi_show_physical_id, NULL);
static DEVICE_ATTR(xgmi_error, 0444, amdgpu_xgmi_show_error, NULL);
static DEVICE_ATTR(xgmi_num_hops, 0444, amdgpu_xgmi_show_num_hops, NULL);
static DEVICE_ATTR(xgmi_num_links, 0444, amdgpu_xgmi_show_num_links, NULL);
static DEVICE_ATTR(xgmi_port_num, 0444, amdgpu_xgmi_show_connected_port_num, NULL);

/* Creates the per-device sysfs files and links them under the hive's
 * directory; returns 0 or negative errno (body elided). */
static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					 struct amdgpu_hive_info *hive)
{}

/* Tears down the per-device sysfs files created by
 * amdgpu_xgmi_sysfs_add_dev_info() (body elided). */
static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{}



/* Looks up (or creates) the hive this device belongs to, taking a
 * reference; pair with amdgpu_put_xgmi_hive().  Body elided. */
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
{}

/* Drops a hive reference taken by amdgpu_get_xgmi_hive(); the kobject
 * release callback frees the hive on the last put.  Body elided. */
void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
{}

/* Requests an XGMI link performance state (@pstate) for the device;
 * returns 0 or negative errno.  Body elided. */
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{}

/* Pushes updated topology information for @adev to the rest of @hive;
 * returns 0 or negative errno.  Body elided. */
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{}


/*
 * NOTE psp_xgmi_node_info.num_hops layout is as follows:
 * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved)
 * num_hops[5:3] = reserved
 * num_hops[2:0] = number of hops
 */
/* Returns the hop count between @adev and @peer_adev, decoded from the
 * num_hops field whose bit layout is documented above.  Body elided. */
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
		struct amdgpu_device *peer_adev)
{}

/* Returns the number of XGMI links between @adev and @peer_adev.
 * Body elided in this stubbed view. */
int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
		struct amdgpu_device *peer_adev)
{}

/*
 * Devices that support extended data require the entire hive to initialize with
 * the shared memory buffer flag set.
 *
 * Hive locks and conditions apply - see amdgpu_xgmi_add_device
 */
/* Re-initializes every node in @hive with the shared-memory/extended-data
 * flag per @set_extended_data; see the locking note above.  Body elided. */
static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
							bool set_extended_data)
{}

/* Copies topology information about @peer_adev into @adev's view of the
 * hive.  Body elided in this stubbed view. */
static void amdgpu_xgmi_fill_topology_info(struct amdgpu_device *adev,
	struct amdgpu_device *peer_adev)
{}

/* Adds @adev to its XGMI hive during device init: hive lookup, topology
 * exchange, sysfs population.  Returns 0 or negative errno; body elided. */
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{}

/* Removes @adev from its hive on teardown, undoing amdgpu_xgmi_add_device().
 * Returns 0 or negative errno; body elided. */
int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{}

/* ACA bank parser for XGMI v6.4: decodes @bank contents for the given SMU
 * query @type into @data.  Body elided in this stubbed view. */
static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
				       enum aca_smu_type type, void *data)
{}

/* ACA bank ops wiring xgmi_v6_4_0_aca_bank_parser (initializer elided). */
static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops =;

/* ACA registration info for the XGMI v6.4 block (initializer elided). */
static const struct aca_info xgmi_v6_4_0_aca_info =;

/* Late RAS init for the XGMI block; returns 0 or negative errno.
 * Body elided in this stubbed view. */
static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{}

/* Translates @addr into a device-relative physical address within the
 * hive-wide XGMI address space.  Body elided. */
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
					   uint64_t addr)
{}

/* Clears the PCS error-status register at SMN offset @pcs_status_reg.
 * Body elided in this stubbed view. */
static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{}

/* Legacy (pre-v6.4) path: clears all XGMI/WAFL PCS error-status registers
 * using the per-ASIC tables above.  Body elided. */
static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev)
{}

/* Resets the error count of one MCA bank (@mca_base) for XGMI instance
 * @xgmi_inst.  Body elided. */
static void __xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst, u64 mca_base)
{}

/* Resets error counts for all MCA banks of XGMI instance @xgmi_inst
 * (iterates xgmi_v6_4_0_mca_base_array).  Body elided. */
static void xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst)
{}

/* v6.4 entry point: resets RAS error counts across all XGMI instances.
 * Body elided in this stubbed view. */
static void xgmi_v6_4_0_reset_ras_error_count(struct amdgpu_device *adev)
{}

/* Dispatches to the legacy or v6.4 reset path based on the device's XGMI
 * IP version.  Body elided. */
static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{}

/* Decodes one PCS status word (@value, optionally masked by @mask_value
 * when @check_mask) against the xgmi/wafl field tables selected by
 * @is_xgmi_pcs, accumulating into @ue_count/@ce_count.  Body elided.
 * NOTE(review): parameter indentation is inconsistent (mixed tab depth);
 * left byte-identical here. */
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
					      uint32_t value,
						  uint32_t mask_value,
					      uint32_t *ue_count,
					      uint32_t *ce_count,
					      bool is_xgmi_pcs,
						  bool check_mask)
{}

/* Legacy (pre-v6.4) path: walks the per-ASIC PCS register tables and fills
 * @ras_error_status with UE/CE counts.  Body elided. */
static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev,
						     void *ras_error_status)
{}

/* Classifies an MCA @status word as an ACA error type (e.g. UE vs CE).
 * Body elided in this stubbed view. */
static enum aca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev, u64 status)
{}

/* Reads one MCA bank at @mca_base and records its error count into
 * @err_data, tagged with @mcm_info.  Body elided. */
static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info,
					    u64 mca_base, struct ras_err_data *err_data)
{}

/* Queries error counts for all MCA banks of XGMI instance @xgmi_inst into
 * @err_data.  Body elided. */
static void xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, int xgmi_inst, struct ras_err_data *err_data)
{}

/* v6.4 entry point: queries RAS error counts across all XGMI instances
 * into @ras_error_status.  Body elided. */
static void xgmi_v6_4_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status)
{}

/* Dispatches to the legacy or v6.4 query path based on the device's XGMI
 * IP version.  Body elided. */
static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
					      void *ras_error_status)
{}

/* Trigger XGMI/WAFL error */
/* RAS error-injection hook for XGMI/WAFL; @inject_if describes the fault,
 * @instance_mask selects instances.  Returns 0 or negative errno; body
 * elided in this stubbed view. */
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
			void *inject_if, uint32_t instance_mask)
{}

/* RAS hardware ops table wiring the query/reset/inject callbacks above
 * into the common amdgpu RAS framework (initializer elided). */
struct amdgpu_ras_block_hw_ops  xgmi_ras_hw_ops =;

/* XGMI RAS block object registered with the RAS core (initializer elided). */
struct amdgpu_xgmi_ras xgmi_ras =;

/* Software-side RAS init for the XGMI block; registers xgmi_ras with the
 * RAS core.  Returns 0 or negative errno; body elided. */
int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
{}