// SPDX-License-Identifier: MIT /* * Copyright © 2023 Intel Corporation */ #include "xe_devcoredump.h" #include "xe_devcoredump_types.h" #include <linux/devcoredump.h> #include <generated/utsrelease.h> #include <drm/drm_managed.h> #include "xe_device.h" #include "xe_exec_queue.h" #include "xe_force_wake.h" #include "xe_gt.h" #include "xe_gt_printk.h" #include "xe_guc_ct.h" #include "xe_guc_submit.h" #include "xe_hw_engine.h" #include "xe_sched_job.h" #include "xe_vm.h" /** * DOC: Xe device coredump * * Devices overview: * Xe uses dev_coredump infrastructure for exposing the crash errors in a * standardized way. * devcoredump exposes a temporary device under /sys/class/devcoredump/ * which is linked with our card device directly. * The core dump can be accessed either from * /sys/class/drm/card<n>/device/devcoredump/ or from * /sys/class/devcoredump/devcd<m> where * /sys/class/devcoredump/devcd<m>/failing_device is a link to * /sys/class/drm/card<n>/device/. * * Snapshot at hang: * The 'data' file is printed with a drm_printer pointer at devcoredump read * time. For this reason, we need to take snapshots from when the hang has * happened, and not only when the user is reading the file. Otherwise the * information is outdated since the resets might have happened in between. * * 'First' failure snapshot: * In general, the first hang is the most critical one since the following hangs * can be a consequence of the initial hang. For this reason we only take the * snapshot of the 'first' failure and ignore subsequent calls of this function, * at least while the coredump device is alive. Dev_coredump has a delayed work * queue that will eventually delete the device and free all the dump * information. 
*/

#ifdef CONFIG_DEV_COREDUMP

/* 1 hour timeout */
#define XE_COREDUMP_TIMEOUT_JIFFIES …

/*
 * Going by the signature: takes a (const) coredump object and returns the
 * xe_device it is associated with.
 * NOTE(review): body not visible here; how the device is derived from
 * @coredump should be confirmed against the full source.
 */
static struct xe_device *coredump_to_xe(const struct xe_devcoredump *coredump) { … }

/*
 * Going by the signature: takes an exec queue and returns a struct xe_guc
 * pointer for it.
 * NOTE(review): body not visible here; confirm which GuC instance is
 * returned for @q.
 */
static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q) { … }

/*
 * Handler with the work_struct callback signature.
 * NOTE(review): the name suggests it completes the parts of the snapshot
 * that are deferred to a worker; body not visible here, confirm.
 */
static void xe_devcoredump_deferred_snap_work(struct work_struct *work) { … }

/*
 * Read handler taking a destination @buffer, an @offset and @count, plus an
 * opaque @data pointer with its @datalen; returns an ssize_t.
 * NOTE(review): presumably the dev_coredump read callback that prints the
 * stored snapshot through a drm_printer at read time, as described in the
 * DOC section of this file. Confirm, including the return value on error.
 */
static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, size_t count, void *data, size_t datalen) { … }

/*
 * Free handler taking the opaque @data pointer.
 * NOTE(review): presumably the dev_coredump free callback that releases the
 * dump information once the coredump device is deleted (see the DOC section
 * of this file). Body not visible here, confirm.
 */
static void xe_devcoredump_free(void *data) { … }

/*
 * Snapshot helper operating on @coredump for the given @job.
 * NOTE(review): presumably captures the hang-time state that the read
 * handler prints later; which state is captured is not visible here.
 */
static void devcoredump_snapshot(struct xe_devcoredump *coredump, struct xe_sched_job *job) { … }

/**
 * xe_devcoredump - Take the required snapshots and initialize coredump device.
 * @job: The faulty xe_sched_job, where the issue was detected.
 *
 * This function should be called at the crash time within the serialized
 * gt_reset. It is skipped if we still have the core dump device available
 * with the information of the 'first' snapshot.
 */
void xe_devcoredump(struct xe_sched_job *job) { … }

/*
 * Teardown handler taking an opaque @arg.
 * NOTE(review): the void * signature and the drm_managed.h include suggest a
 * managed cleanup action registered at init time; body not visible here,
 * confirm what @arg is and what gets released.
 */
static void xe_driver_devcoredump_fini(void *arg) { … }

/*
 * Initialise devcoredump support for @xe; returns an int.
 * NOTE(review): presumably 0 on success and a negative errno on failure;
 * body not visible here, confirm.
 */
int xe_devcoredump_init(struct xe_device *xe) { … }

#endif