/* linux/drivers/gpu/drm/i915/i915_gpu_error.c */

/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <[email protected]>
 *    Keith Packard <[email protected]>
 *    Mika Kuoppala <[email protected]>
 *
 */

#include <linux/ascii85.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/nmi.h>
#include <linux/pagevec.h>
#include <linux/scatterlist.h>
#include <linux/string_helpers.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_cache.h>
#include <drm/drm_print.h>

#include "display/intel_dmc.h"
#include "display/intel_overlay.h"

#include "gem/i915_gem_context.h"
#include "gem/i915_gem_lmem.h"
#include "gt/intel_engine_regs.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_mcr.h"
#include "gt/intel_gt_pm.h"
#include "gt/intel_gt_regs.h"
#include "gt/uc/intel_guc_capture.h"

#include "i915_driver.h"
#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_memcpy.h"
#include "i915_reg.h"
#include "i915_scatterlist.h"
#include "i915_sysfs.h"
#include "i915_utils.h"

#define ALLOW_FAIL
#define ATOMIC_MAYFAIL

static void __sg_set_buf(struct scatterlist *sg,
			 void *addr, unsigned int len, loff_t it)
{}

static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{}

__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
			       const char *fmt, va_list args)
{}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{}

#define err_printf(e, ...)
#define err_puts(e, s)

static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{}

/* single threaded page allocator with a reserved stash for emergencies */
static void pool_fini(struct folio_batch *fbatch)
{}

static int pool_refill(struct folio_batch *fbatch, gfp_t gfp)
{}

static int pool_init(struct folio_batch *fbatch, gfp_t gfp)
{}

static void *pool_alloc(struct folio_batch *fbatch, gfp_t gfp)
{}

static void pool_free(struct folio_batch *fbatch, void *addr)
{}

#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

struct i915_vma_compress {};

static bool compress_init(struct i915_vma_compress *c)
{}

static bool compress_start(struct i915_vma_compress *c)
{}

static void *compress_next_page(struct i915_vma_compress *c,
				struct i915_vma_coredump *dst)
{}

static int compress_page(struct i915_vma_compress *c,
			 void *src,
			 struct i915_vma_coredump *dst,
			 bool wc)
{}

static int compress_flush(struct i915_vma_compress *c,
			  struct i915_vma_coredump *dst)
{}

static void compress_finish(struct i915_vma_compress *c)
{}

static void compress_fini(struct i915_vma_compress *c)
{}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{}

#else

/*
 * Fallback state used when CONFIG_DRM_I915_COMPRESS_ERROR is disabled:
 * no zlib stream, just a batch of pre-allocated pages that VMA contents
 * are copied into verbatim.
 */
struct i915_vma_compress {
	struct folio_batch pool;
};

/*
 * Prime the page pool used for uncompressed VMA copies.
 * Returns true on success, false if the pool could not be filled.
 */
static bool compress_init(struct i915_vma_compress *c)
{
	return pool_init(&c->pool, ALLOW_FAIL) == 0;
}

/*
 * Per-capture start hook; nothing to (re)initialise when compression
 * is compiled out, so this always succeeds.
 */
static bool compress_start(struct i915_vma_compress *c)
{
	return true;
}

/*
 * Copy one page of VMA contents into the coredump, uncompressed.
 *
 * @c:   page pool to draw the destination page from
 * @src: kernel address of the source page
 * @dst: coredump the copied page is appended to (via its page_list)
 * @wc:  true if @src maps write-combined memory
 *
 * Returns 0 on success or -ENOMEM if no page could be taken from the pool.
 */
static int compress_page(struct i915_vma_compress *c,
			 void *src,
			 struct i915_vma_coredump *dst,
			 bool wc)
{
	void *copy = pool_alloc(&c->pool, ALLOW_FAIL);

	if (!copy)
		return -ENOMEM;

	/*
	 * Try the WC-aware fast copy first; fall back to a plain memcpy()
	 * when the source is not WC or the fast path declines.
	 */
	if (!wc || !i915_memcpy_from_wc(copy, src, PAGE_SIZE))
		memcpy(copy, src, PAGE_SIZE);

	list_add_tail(&virt_to_page(copy)->lru, &dst->page_list);

	/* Captures can span many pages; be kind to the scheduler. */
	cond_resched();

	return 0;
}

/*
 * No compression stream to flush in the uncompressed path; always
 * reports success.
 */
static int compress_flush(struct i915_vma_compress *c,
			  struct i915_vma_coredump *dst)
{
	return 0;
}

/* Per-capture end hook; a no-op without compression support. */
static void compress_finish(struct i915_vma_compress *c)
{
}

/* Release the pages remaining in the copy pool. */
static void compress_fini(struct i915_vma_compress *c)
{
	pool_fini(&c->pool);
}

/*
 * Emit the marker that tells userspace decoders how the following VMA
 * data is encoded; "~" denotes the uncompressed format.
 */
static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, "~");
}

#endif

static void error_print_instdone(struct drm_i915_error_state_buf *m,
				 const struct intel_engine_coredump *ee)
{}

static void error_print_request(struct drm_i915_error_state_buf *m,
				const char *prefix,
				const struct i915_request_coredump *erq)
{}

static void error_print_context(struct drm_i915_error_state_buf *m,
				const char *header,
				const struct i915_gem_context_coredump *ctx)
{}

static struct i915_vma_coredump *
__find_vma(struct i915_vma_coredump *vma, const char *name)
{}

static struct i915_vma_coredump *
intel_gpu_error_find_batch(const struct intel_engine_coredump *ee)
{}

static void error_print_engine(struct drm_i915_error_state_buf *m,
			       const struct intel_engine_coredump *ee)
{}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{}

static void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,
				      const struct intel_engine_cs *engine,
				      const struct i915_vma_coredump *vma)
{}

static void err_print_capabilities(struct drm_i915_error_state_buf *m,
				   struct i915_gpu_coredump *error)
{}

static void err_print_params(struct drm_i915_error_state_buf *m,
			     const struct i915_params *params)
{}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
			    struct drm_i915_private *i915)
{}

static void err_print_guc_ctb(struct drm_i915_error_state_buf *m,
			      const char *name,
			      const struct intel_ctb_coredump *ctb)
{}

static void err_print_uc(struct drm_i915_error_state_buf *m,
			 const struct intel_uc_coredump *error_uc)
{}

static void err_free_sgl(struct scatterlist *sgl)
{}

static void err_print_gt_info(struct drm_i915_error_state_buf *m,
			      struct intel_gt_coredump *gt)
{}

static void err_print_gt_display(struct drm_i915_error_state_buf *m,
				 struct intel_gt_coredump *gt)
{}

static void err_print_gt_global_nonguc(struct drm_i915_error_state_buf *m,
				       struct intel_gt_coredump *gt)
{}

static void err_print_gt_global(struct drm_i915_error_state_buf *m,
				struct intel_gt_coredump *gt)
{}

static void err_print_gt_fences(struct drm_i915_error_state_buf *m,
				struct intel_gt_coredump *gt)
{}

static void err_print_gt_engines(struct drm_i915_error_state_buf *m,
				 struct intel_gt_coredump *gt)
{}

static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
			       struct i915_gpu_coredump *error)
{}

static int err_print_to_sgl(struct i915_gpu_coredump *error)
{}

ssize_t i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
					 char *buf, loff_t off, size_t rem)
{}

static void i915_vma_coredump_free(struct i915_vma_coredump *vma)
{}

static void cleanup_params(struct i915_gpu_coredump *error)
{}

static void cleanup_uc(struct intel_uc_coredump *uc)
{}

static void cleanup_gt(struct intel_gt_coredump *gt)
{}

void __i915_gpu_coredump_free(struct kref *error_ref)
{}

static struct i915_vma_coredump *
i915_vma_coredump_create(const struct intel_gt *gt,
			 const struct i915_vma_resource *vma_res,
			 struct i915_vma_compress *compress,
			 const char *name)

{}

static void gt_record_fences(struct intel_gt_coredump *gt)
{}

static void engine_record_registers(struct intel_engine_coredump *ee)
{}

static void record_request(const struct i915_request *request,
			   struct i915_request_coredump *erq)
{}

static void engine_record_execlists(struct intel_engine_coredump *ee)
{}

static bool record_context(struct i915_gem_context_coredump *e,
			   struct intel_context *ce)
{}

struct intel_engine_capture_vma {};

static struct intel_engine_capture_vma *
capture_vma_snapshot(struct intel_engine_capture_vma *next,
		     struct i915_vma_resource *vma_res,
		     gfp_t gfp, const char *name)
{}

static struct intel_engine_capture_vma *
capture_vma(struct intel_engine_capture_vma *next,
	    struct i915_vma *vma,
	    const char *name,
	    gfp_t gfp)
{}

static struct intel_engine_capture_vma *
capture_user(struct intel_engine_capture_vma *capture,
	     const struct i915_request *rq,
	     gfp_t gfp)
{}

static void add_vma(struct intel_engine_coredump *ee,
		    struct i915_vma_coredump *vma)
{}

static struct i915_vma_coredump *
create_vma_coredump(const struct intel_gt *gt, struct i915_vma *vma,
		    const char *name, struct i915_vma_compress *compress)
{}

static void add_vma_coredump(struct intel_engine_coredump *ee,
			     const struct intel_gt *gt,
			     struct i915_vma *vma,
			     const char *name,
			     struct i915_vma_compress *compress)
{}

struct intel_engine_coredump *
intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags)
{}

static struct intel_engine_capture_vma *
engine_coredump_add_context(struct intel_engine_coredump *ee,
			    struct intel_context *ce,
			    gfp_t gfp)
{}

struct intel_engine_capture_vma *
intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
				  struct i915_request *rq,
				  gfp_t gfp)
{}

void
intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
			      struct intel_engine_capture_vma *capture,
			      struct i915_vma_compress *compress)
{}

static struct intel_engine_coredump *
capture_engine(struct intel_engine_cs *engine,
	       struct i915_vma_compress *compress,
	       u32 dump_flags)
{}

static void
gt_record_engines(struct intel_gt_coredump *gt,
		  intel_engine_mask_t engine_mask,
		  struct i915_vma_compress *compress,
		  u32 dump_flags)
{}

static void gt_record_guc_ctb(struct intel_ctb_coredump *saved,
			      const struct intel_guc_ct_buffer *ctb,
			      const void *blob_ptr, struct intel_guc *guc)
{}

static struct intel_uc_coredump *
gt_record_uc(struct intel_gt_coredump *gt,
	     struct i915_vma_compress *compress)
{}

/* Capture display registers. */
static void gt_record_display_regs(struct intel_gt_coredump *gt)
{}

/* Capture all other registers that GuC doesn't capture. */
static void gt_record_global_nonguc_regs(struct intel_gt_coredump *gt)
{}

/*
 * Capture all registers that relate to workload submission.
 * NOTE: In GuC submission, when GuC resets an engine, it can dump these for us
 */
static void gt_record_global_regs(struct intel_gt_coredump *gt)
{}

static void gt_record_info(struct intel_gt_coredump *gt)
{}

/*
 * Generate a semi-unique error code. The code is not meant to have meaning, The
 * code's only purpose is to try to prevent false duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static u32 generate_ecode(const struct intel_engine_coredump *ee)
{}

static const char *error_msg(struct i915_gpu_coredump *error)
{}

static void capture_gen(struct i915_gpu_coredump *error)
{}

struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
{}

#define DAY_AS_SECONDS(x)

struct intel_gt_coredump *
intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags)
{}

struct i915_vma_compress *
i915_vma_capture_prepare(struct intel_gt_coredump *gt)
{}

void i915_vma_capture_finish(struct intel_gt_coredump *gt,
			     struct i915_vma_compress *compress)
{}

static struct i915_gpu_coredump *
__i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags)
{}

static struct i915_gpu_coredump *
i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags)
{}

void i915_error_state_store(struct i915_gpu_coredump *error)
{}

/**
 * i915_capture_error_state - capture an error record for later analysis
 * @gt: intel_gt which originated the hang
 * @engine_mask: hung engines
 * @dump_flags: dump flags
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error.  Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
void i915_capture_error_state(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask, u32 dump_flags)
{}

static struct i915_gpu_coredump *
i915_first_error_state(struct drm_i915_private *i915)
{}

void i915_reset_error_state(struct drm_i915_private *i915)
{}

void i915_disable_error_state(struct drm_i915_private *i915, int err)
{}

#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
/*
 * Capture a GPU error state and dump the whole thing to the kernel log,
 * line by line, for debugging when no userspace tool can read the sysfs
 * error file (e.g. early hangs in CI).
 *
 * Every emitted line is prefixed "[Capture/<capture#>.<line#>]" so that
 * interleaved dumps from concurrent captures can be reassembled.  Lines
 * longer than MAX_CHUNK are split, with '}'/'{' marking continuation
 * fragments and '>'/'<' marking a complete (newline-terminated) line.
 */
void intel_klog_error_capture(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask)
{
	/* Monotonic capture counter across all calls (no locking —
	 * presumably captures are rare enough that races are tolerable;
	 * NOTE(review): confirm callers cannot race here). */
	static int g_count;
	struct drm_i915_private *i915 = gt->i915;
	struct i915_gpu_coredump *error;
	intel_wakeref_t wakeref;
	size_t buf_size = PAGE_SIZE * 128;
	size_t pos_err;
	char *buf, *ptr, *next;
	int l_count = g_count++;
	int line = 0;

	/* Can't allocate memory during a reset */
	if (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		drm_err(&gt->i915->drm, "[Capture/%d.%d] Inside GT reset, skipping error capture :(\n",
			l_count, line++);
		return;
	}

	/* Only one error state is kept; drop any stale capture so the new
	 * one is actually stored. */
	error = READ_ONCE(i915->gpu_error.first_error);
	if (error) {
		drm_err(&i915->drm, "[Capture/%d.%d] Clearing existing error capture first...\n",
			l_count, line++);
		i915_reset_error_state(i915);
	}

	/* Hold a runtime-PM wakeref while reading registers for capture. */
	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
		error = i915_gpu_coredump(gt, engine_mask, CORE_DUMP_FLAG_NONE);

	if (IS_ERR(error)) {
		drm_err(&i915->drm, "[Capture/%d.%d] Failed to capture error capture: %ld!\n",
			l_count, line++, PTR_ERR(error));
		return;
	}

	buf = kvmalloc(buf_size, GFP_KERNEL);
	if (!buf) {
		drm_err(&i915->drm, "[Capture/%d.%d] Failed to allocate buffer for error capture!\n",
			l_count, line++);
		i915_gpu_coredump_put(error);
		return;
	}

	drm_info(&i915->drm, "[Capture/%d.%d] Dumping i915 error capture for %ps...\n",
		 l_count, line++, __builtin_return_address(0));

	/* Largest string length safe to print via dmesg */
#define MAX_CHUNK

	/* Stream the coredump through buf in buf_size-1 slices, printing
	 * each slice a line (or sub-MAX_CHUNK chunk) at a time. */
	pos_err = 0;
	while (1) {
		ssize_t got = i915_gpu_coredump_copy_to_buffer(error, buf, pos_err, buf_size - 1);

		if (got <= 0)
			break;

		/* NUL-terminate the slice so string ops below are bounded. */
		buf[got] = 0;
		pos_err += got;

		ptr = buf;
		while (got > 0) {
			size_t count;
			char tag[2];

			/* tag[] brackets the printed fragment: '>'..'<' for a
			 * complete line, '}'..'{' for a fragment that continues
			 * in the next print. */
			next = strnchr(ptr, got, '\n');
			if (next) {
				count = next - ptr;
				*next = 0; /* punch a NUL over the '\n' */
				tag[0] = '>';
				tag[1] = '<';
			} else {
				count = got;
				tag[0] = '}';
				tag[1] = '{';
			}

			if (count > MAX_CHUNK) {
				/* Line too long for one dmesg print: emit it in
				 * MAX_CHUNK pieces, temporarily NUL-terminating
				 * each piece in place. */
				size_t pos;
				char *ptr2 = ptr;

				for (pos = MAX_CHUNK; pos < count; pos += MAX_CHUNK) {
					char chr = ptr[pos];

					ptr[pos] = 0;
					drm_info(&i915->drm, "[Capture/%d.%d] }%s{\n",
						 l_count, line++, ptr2);
					ptr[pos] = chr; /* restore the byte */
					ptr2 = ptr + pos;

					/*
					 * If spewing large amounts of data via a serial console,
					 * this can be a very slow process. So be friendly and try
					 * not to cause 'softlockup on CPU' problems.
					 */
					cond_resched();
				}

				if (ptr2 < (ptr + count))
					drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
						 l_count, line++, tag[0], ptr2, tag[1]);
				else if (tag[0] == '>')
					/* Line ended exactly on a chunk boundary;
					 * still emit the close marker. */
					drm_info(&i915->drm, "[Capture/%d.%d] ><\n",
						 l_count, line++);
			} else {
				drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
					 l_count, line++, tag[0], ptr, tag[1]);
			}

			/* Advance past the fragment (and the consumed '\n',
			 * if any). */
			ptr = next;
			got -= count;
			if (next) {
				ptr++;
				got--;
			}

			/* As above. */
			cond_resched();
		}

		if (got)
			drm_info(&i915->drm, "[Capture/%d.%d] Got %zd bytes remaining!\n",
				 l_count, line++, got);
	}

	kvfree(buf);

	drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd bytes\n", l_count, line++, pos_err);
}
#endif

static ssize_t gpu_state_read(struct file *file, char __user *ubuf,
			      size_t count, loff_t *pos)
{}

static int gpu_state_release(struct inode *inode, struct file *file)
{}

static int i915_gpu_info_open(struct inode *inode, struct file *file)
{}

static const struct file_operations i915_gpu_info_fops =;

static ssize_t
i915_error_state_write(struct file *filp,
		       const char __user *ubuf,
		       size_t cnt,
		       loff_t *ppos)
{}

static int i915_error_state_open(struct inode *inode, struct file *file)
{}

static const struct file_operations i915_error_state_fops =;

void i915_gpu_error_debugfs_register(struct drm_i915_private *i915)
{}

static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
				struct bin_attribute *attr, char *buf,
				loff_t off, size_t count)
{}

static ssize_t error_state_write(struct file *file, struct kobject *kobj,
				 struct bin_attribute *attr, char *buf,
				 loff_t off, size_t count)
{}

static const struct bin_attribute error_state_attr =;

void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915)
{}

void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915)
{}