kfd_process.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2014-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/mutex.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/mmu_context.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <linux/compat.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/pm_runtime.h>
#include "amdgpu_amdkfd.h"
#include "amdgpu.h"

struct mm_struct;

#include "kfd_priv.h"
#include "kfd_device_queue_manager.h"
#include "kfd_svm.h"
#include "kfd_smi_events.h"
#include "kfd_debug.h"

/*
 * List of struct kfd_process (field kfd_process).
 * Unique/indexed by mm_struct*
 */
DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE);
DEFINE_MUTEX(…) …;

DEFINE_SRCU(…);

/* For process termination handling */
static struct workqueue_struct *kfd_process_wq;

/* Ordered, single-threaded workqueue for restoring evicted
 * processes. Restoring multiple processes concurrently under memory
 * pressure can lead to processes blocking each other from validating
 * their BOs and result in a live-lock situation where processes
 * remain evicted indefinitely.
 */
static struct workqueue_struct *kfd_restore_wq;

static struct kfd_process *find_process(const struct task_struct *thread,
					bool ref);
static void kfd_process_ref_release(struct kref *ref);
static struct kfd_process *create_process(const struct task_struct *thread);

static void evict_process_worker(struct work_struct *work);
static void restore_process_worker(struct work_struct *work);

static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd);

struct kfd_procfs_tree { … };

static struct kfd_procfs_tree procfs;

/*
 * Structure for SDMA activity tracking
 */
struct kfd_sdma_activity_handler_workarea { … };

struct temp_sdma_queue_list { … };

static void kfd_sdma_activity_worker(struct work_struct *work)
{ … }

/**
 * kfd_get_cu_occupancy - Collect number of waves in flight on this device
 * by the current process. Translates the acquired wave count into the number
 * of compute units that are occupied.
 *
 * @attr: Handle of attribute that allows reporting of wave count. The attribute
 * handle encapsulates the GPU device it is associated with, thereby allowing
 * collection of waves in flight, etc.
 * @buffer: Handle of user provided buffer updated with wave count
 *
 * Return: Number of bytes written to user buffer or an error value
 */
static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
{ … }

static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr,
			       char *buffer)
{ … }

static void kfd_procfs_kobj_release(struct kobject *kobj)
{ … }

static const struct sysfs_ops kfd_procfs_ops = …;

static const struct kobj_type procfs_type = …;

void kfd_procfs_init(void)
{ … }

void kfd_procfs_shutdown(void)
{ … }

static ssize_t kfd_procfs_queue_show(struct kobject *kobj,
				     struct attribute *attr, char *buffer)
{ … }

static ssize_t kfd_procfs_stats_show(struct kobject *kobj,
				     struct attribute *attr, char *buffer)
{ … }

static ssize_t kfd_sysfs_counters_show(struct kobject *kobj,
				       struct attribute *attr, char *buf)
{ … }

static struct attribute attr_queue_size = …;

static struct attribute attr_queue_type = …;

static struct attribute attr_queue_gpuid = …;

static struct attribute *procfs_queue_attrs[] = …;
ATTRIBUTE_GROUPS(…);

static const struct sysfs_ops procfs_queue_ops = …;

static const struct kobj_type procfs_queue_type = …;

static const struct sysfs_ops procfs_stats_ops = …;

static const struct kobj_type procfs_stats_type = …;

static const struct sysfs_ops sysfs_counters_ops = …;

static const struct kobj_type sysfs_counters_type = …;

int kfd_procfs_add_queue(struct queue *q)
{ … }

static void kfd_sysfs_create_file(struct kobject *kobj, struct attribute *attr,
				 char *name)
{ … }

static void kfd_procfs_add_sysfs_stats(struct kfd_process *p)
{ … }

static void kfd_procfs_add_sysfs_counters(struct kfd_process *p)
{ … }

static void kfd_procfs_add_sysfs_files(struct kfd_process *p)
{ … }

void kfd_procfs_del_queue(struct queue *q)
{ … }

int kfd_process_create_wq(void)
{ … }

void kfd_process_destroy_wq(void)
{ … }

static void kfd_process_free_gpuvm(struct kgd_mem *mem,
			struct kfd_process_device *pdd, void **kptr)
{ … }

/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
 *	This function should only be called right after the process
 *	is created and while kfd_processes_mutex is still being held
 *	to avoid concurrency. Because of that exclusivity, we do
 *	not need to take p->mutex.
 */
static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd,
				   uint64_t gpu_va, uint32_t size,
				   uint32_t flags, struct kgd_mem **mem, void **kptr)
{ … }

/* kfd_process_device_reserve_ib_mem - Reserve memory inside the
 *	process for IB usage. The memory reserved is for KFD to submit
 *	IBs to AMDGPU from the kernel. If the memory is reserved
 *	successfully, ib_kaddr will hold the CPU/kernel
 *	address. Check ib_kaddr before accessing the memory.
 */
static int kfd_process_device_reserve_ib_mem(struct kfd_process_device *pdd)
{ … }

static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
{ … }

struct kfd_process *kfd_create_process(struct task_struct *thread)
{ … }

struct kfd_process *kfd_get_process(const struct task_struct *thread)
{ … }

static struct kfd_process *find_process_by_mm(const struct mm_struct *mm)
{ … }

static struct kfd_process *find_process(const struct task_struct *thread,
					bool ref)
{ … }

void kfd_unref_process(struct kfd_process *p)
{ … }

/* This increments the process->ref counter. */
struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid)
{ … }

static void kfd_process_device_free_bos(struct kfd_process_device *pdd)
{ … }

/*
 * Just kunmap and unpin signal BO here. It will be freed in
 * kfd_process_free_outstanding_kfd_bos()
 */
static void kfd_process_kunmap_signal_bo(struct kfd_process *p)
{ … }

static void kfd_process_free_outstanding_kfd_bos(struct kfd_process *p)
{ … }

static void kfd_process_destroy_pdds(struct kfd_process *p)
{ … }

static void kfd_process_remove_sysfs(struct kfd_process *p)
{ … }

/* No process locking is needed in this function, because the process
 * is not findable any more. We must assume that no other thread is
 * using it any more, otherwise we couldn't safely free the process
 * structure in the end.
 */
static void kfd_process_wq_release(struct work_struct *work)
{ … }

static void kfd_process_ref_release(struct kref *ref)
{ … }

static struct mmu_notifier *kfd_process_alloc_notifier(struct mm_struct *mm)
{ … }

static void kfd_process_free_notifier(struct mmu_notifier *mn)
{ … }

static void kfd_process_notifier_release_internal(struct kfd_process *p)
{ … }

static void kfd_process_notifier_release(struct mmu_notifier *mn,
					struct mm_struct *mm)
{ … }

static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = …;

/*
 * This code handles the case when the driver is being unloaded before all
 * mm_structs are released. We need to safely free the kfd_process structures
 * and avoid race conditions with the mmu_notifier that might try to free them.
 */
void kfd_cleanup_processes(void)
{ … }

int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
{ … }

static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
{ … }

static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
{ … }

void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
				  uint64_t tba_addr,
				  uint64_t tma_addr)
{ … }

bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
{ … }

void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
				     bool enabled)
{ … }

/*
 * On return the kfd_process is fully operational and will be freed when the
 * mm is released
 */
static struct kfd_process *create_process(const struct task_struct *thread)
{ … }

struct kfd_process_device *kfd_get_process_device_data(struct kfd_node *dev,
							struct kfd_process *p)
{ … }

struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
							struct kfd_process *p)
{ … }

/**
 * kfd_process_device_init_vm - Initialize a VM for a process-device
 *
 * @pdd: The process-device
 * @drm_file: Optional pointer to a DRM file descriptor
 *
 * If @drm_file is specified, it will be used to acquire the VM from
 * that file descriptor. If successful, the @pdd takes ownership of
 * the file descriptor.
 *
 * If @drm_file is NULL, a new VM is created.
 *
 * Return: 0 on success, -errno on failure.
 */
int kfd_process_device_init_vm(struct kfd_process_device *pdd,
			       struct file *drm_file)
{ … }

/*
 * Direct the IOMMU to bind the process (specifically the pasid->mm)
 * to the device.
 * Unbinding occurs when the process dies or the device is removed.
 *
 * Assumes that the process lock is held.
 */
struct kfd_process_device *kfd_bind_process_to_device(struct kfd_node *dev,
							struct kfd_process *p)
{ … }

/* Create specific handle mapped to mem from process local memory idr
 * Assumes that the process lock is held.
 */
int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd,
					void *mem)
{ … }

/* Translate specific handle from process local memory idr
 * Assumes that the process lock is held.
 */
void *kfd_process_device_translate_handle(struct kfd_process_device *pdd,
					int handle)
{ … }

/* Remove specific handle from process local memory idr
 * Assumes that the process lock is held.
 */
void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
					int handle)
{ … }

/* This increments the process->ref counter. */
struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid)
{ … }

/* This increments the process->ref counter. */
struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
{ … }

/* kfd_process_evict_queues - Evict all user queues of a process
 *
 * Eviction is reference-counted per process-device. This means multiple
 * evictions from different sources can be nested safely.
 */
int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
{ … }

/* kfd_process_restore_queues - Restore all user queues of a process */
int kfd_process_restore_queues(struct kfd_process *p)
{ … }

int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, uint32_t gpu_id)
{ … }

int
kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
			    uint32_t *gpuid, uint32_t *gpuidx)
{ … }

static int signal_eviction_fence(struct kfd_process *p)
{ … }

static void evict_process_worker(struct work_struct *work)
{ … }

static int restore_process_helper(struct kfd_process *p)
{ … }

static void restore_process_worker(struct work_struct *work)
{ … }

void kfd_suspend_all_processes(void)
{ … }

int kfd_resume_all_processes(void)
{ … }

int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process,
			  struct vm_area_struct *vma)
{ … }

/* assumes caller holds process lock. */
int kfd_process_drain_interrupts(struct kfd_process_device *pdd)
{ … }

void kfd_process_close_interrupt_drain(unsigned int pasid)
{ … }

struct send_exception_work_handler_workarea { … };

static void send_exception_work_handler(struct work_struct *work)
{ … }

int kfd_send_exception_to_runtime(struct kfd_process *p,
			unsigned int queue_id,
			uint64_t error_reason)
{ … }

struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, uint32_t gpu_id)
{ … }

int kfd_process_get_user_gpu_id(struct kfd_process *p, uint32_t actual_gpu_id)
{ … }

#if defined(CONFIG_DEBUG_FS)

int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
{ … }

#endif
linux/drivers/gpu/drm/amd/amdkfd/kfd_process.c