// SPDX-License-Identifier: MIT
/*
 * Copyright 2014-2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <linux/dma-buf.h>
#include <linux/list.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/fdtable.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/drm_exec.h>

#include "amdgpu_object.h"
#include "amdgpu_gem.h"
#include "amdgpu_vm.h"
#include "amdgpu_hmm.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_dma_buf.h"
#include <uapi/linux/kfd_ioctl.h>
#include "amdgpu_xgmi.h"
#include "kfd_priv.h"
#include "kfd_smi_events.h"

/* Userptr restore delay, just long enough to allow consecutive VM
 * changes to accumulate
 */
#define AMDGPU_USERPTR_RESTORE_DELAY_MS …
#define AMDGPU_RESERVE_MEM_LIMIT …

/*
 * Align VRAM availability to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
 * BO chunk
 */
#define VRAM_AVAILABLITY_ALIGN …

/* Impose limit on how much memory KFD can use */
static struct { … } kfd_mem_limit;

static const char * const domain_bit_to_string[] = …;

#define domain_string(domain) …

static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work);

static bool kfd_mem_is_attached(struct amdgpu_vm *avm, struct kgd_mem *mem)
{ … }

/**
 * reuse_dmamap() - Check whether adev can share the original
 * userptr BO
 *
 * If both adev and bo_adev are in direct mapping or
 * in the same iommu group, they can share the original BO.
 *
 * @adev: Device which may or may not share the original BO
 * @bo_adev: Device to which the allocated BO belongs
 *
 * Return: true if adev can share the original userptr BO,
 * false otherwise.
 */
static bool reuse_dmamap(struct amdgpu_device *adev, struct amdgpu_device *bo_adev)
{ … }

/* Set memory usage limits. Currently, the limits are
 * System (TTM + userptr) memory - 15/16th System RAM
 * TTM memory - 3/8th System RAM
 */
void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
{ … }

void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
{ … }

/* Estimate page table size needed to represent a given memory size
 *
 * With 4KB pages, we need one 8 byte PTE for each 4KB of memory
 * (factor 512, >> 9). With 2MB pages, we need one 8 byte PTE for 2MB
 * of memory (factor 256K, >> 18). ROCm user mode tries to optimize
 * for 2MB pages for TLB efficiency. However, small allocations and
 * fragmented system memory still need some 4KB pages. We choose a
 * compromise that should work in most cases without reserving too
 * much memory for page tables unnecessarily (factor 16K, >> 14).
 */
#define ESTIMATE_PT_SIZE(mem_size) …
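/*
 * Worked example (illustrative only, based on the compromise factor in the
 * comment above, not on the elided macro body): estimating the page table
 * size for 1 GiB of memory gives 1 GiB >> 14 = 64 KiB, compared with 2 MiB
 * if everything used 4KB pages (>> 9) and 4 KiB if everything used 2MB
 * pages (>> 18).
 */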
/**
 * amdgpu_amdkfd_reserve_mem_limit() - Decrease available memory by size
 * of buffer.
 *
 * @adev: Device to which the allocated BO belongs
 * @size: Size of buffer, in bytes, encapsulated by BO. This should be
 * equivalent to amdgpu_bo_size(BO)
 * @alloc_flag: Flag used in allocating a BO as noted above
 * @xcp_id: xcp_id is used to get xcp from xcp manager, one xcp is
 * managed as one compute node for the application in the driver
 *
 * Return:
 * returns -ENOMEM in case of error, ZERO otherwise
 */
int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag, int8_t xcp_id)
{ … }

void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag, int8_t xcp_id)
{ … }

void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
{ … }

/**
 * create_dmamap_sg_bo() - Creates an amdgpu_bo object to reflect information
 * about USERPTR or DOORBELL or MMIO BO.
 *
 * @adev: Device for which dmamap BO is being created
 * @mem: BO of peer device that is being DMA mapped. Provides parameters
 * in building the dmamap BO
 * @bo_out: Output parameter updated with handle of dmamap BO
 */
static int create_dmamap_sg_bo(struct amdgpu_device *adev, struct kgd_mem *mem, struct amdgpu_bo **bo_out)
{ … }

/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence from BO's
 * reservation object.
 *
 * @bo: [IN] Remove eviction fence(s) from this BO
 * @ef: [IN] This eviction fence is removed if it
 * is present in the shared list.
 *
 * NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held.
 */
static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo, struct amdgpu_amdkfd_fence *ef)
{ … }

int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
{ … }

static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain, bool wait)
{ … }

int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, uint32_t domain, struct dma_fence *fence)
{ … }

static int amdgpu_amdkfd_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
{ … }

/* vm_validate_pt_pd_bos - Validate page table and directory BOs
 *
 * Page directories are not updated here because huge page handling
 * during page table updates can invalidate page directory entries
 * again. Page directories are only updated after updating page
 * tables.
 */
static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm, struct ww_acquire_ctx *ticket)
{ … }

static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
{ … }

static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)
{ … }

/**
 * create_sg_table() - Create an sg_table for a contiguous DMA addr range
 * @addr: The starting address to point to
 * @size: Size of memory area in bytes being pointed to
 *
 * Allocates an instance of sg_table and initializes it to point to memory
 * area specified by input parameters. The address used to build is assumed
 * to be DMA mapped, if needed.
 *
 * DOORBELL or MMIO BOs use only one scatterlist node in their sg_table
 * because they are physically contiguous.
 *
 * Return: Initialized instance of SG Table or NULL
 */
static struct sg_table *create_sg_table(uint64_t addr, uint32_t size)
{ … }
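/*
 * Minimal sketch of the idea behind create_sg_table() (illustrative only;
 * the actual implementation above is elided). A single scatterlist node is
 * enough because the range is contiguous and already DMA mapped:
 *
 *	struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL);
 *
 *	if (!sg)
 *		return NULL;
 *	if (sg_alloc_table(sg, 1, GFP_KERNEL)) {
 *		kfree(sg);
 *		return NULL;
 *	}
 *	sg_dma_address(sg->sgl) = addr;
 *	sg->sgl->length = size;
 *	sg_dma_len(sg->sgl) = size;
 *	return sg;
 */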
static int kfd_mem_dmamap_userptr(struct kgd_mem *mem, struct kfd_mem_attachment *attachment)
{ … }

static int kfd_mem_dmamap_dmabuf(struct kfd_mem_attachment *attachment)
{ … }

/**
 * kfd_mem_dmamap_sg_bo() - Create DMA mapped sg_table to access DOORBELL or MMIO BO
 * @mem: SG BO of the DOORBELL or MMIO resource on the owning device
 * @attachment: Virtual address attachment of the BO on accessing device
 *
 * An access request from the device that owns the DOORBELL does not require DMA mapping.
 * This is because the request doesn't go through the PCIe root complex, i.e. it instead
 * loops back. The need to DMA map arises only when accessing a peer device's DOORBELL.
 *
 * In contrast, all access requests for MMIO need to be DMA mapped without regard to
 * device ownership. This is because access requests for MMIO go through the PCIe root
 * complex.
 *
 * This is accomplished in two steps:
 *   - Obtain DMA mapped address of DOORBELL or MMIO memory that could be used
 *     in updating requesting device's page table
 *   - Signal TTM to mark memory pointed to by requesting device's BO as GPU
 *     accessible. This allows an update of requesting device's page table
 *     with entries associated with DOORBELL or MMIO memory
 *
 * This method is invoked in the following contexts:
 *   - Mapping of DOORBELL or MMIO BO of same or peer device
 *   - Validating an evicted DOORBELL or MMIO BO on device seeking access
 *
 * Return: ZERO if successful, NON-ZERO otherwise
 */
static int kfd_mem_dmamap_sg_bo(struct kgd_mem *mem, struct kfd_mem_attachment *attachment)
{ … }

static int kfd_mem_dmamap_attachment(struct kgd_mem *mem, struct kfd_mem_attachment *attachment)
{ … }

static void kfd_mem_dmaunmap_userptr(struct kgd_mem *mem, struct kfd_mem_attachment *attachment)
{ … }

static void kfd_mem_dmaunmap_dmabuf(struct kfd_mem_attachment *attachment)
{ … }

/**
 * kfd_mem_dmaunmap_sg_bo() - Free DMA mapped sg_table of DOORBELL or MMIO BO
 * @mem: SG BO of the DOORBELL or MMIO resource on the owning device
 * @attachment: Virtual address attachment of the BO on accessing device
 *
 * The method performs the following steps:
 *   - Signal TTM to mark memory pointed to by BO as GPU inaccessible
 *   - Free SG Table that is used to encapsulate DMA mapped memory of
 *     peer device's DOORBELL or MMIO memory
 *
 * This method is invoked in the following contexts:
 *   Unmapping of DOORBELL or MMIO BO on a device having access to its memory
 *   Eviction of DOORBELL or MMIO BO on device having access to its memory
 *
 * Return: void
 */
static void kfd_mem_dmaunmap_sg_bo(struct kgd_mem *mem, struct kfd_mem_attachment *attachment)
{ … }

static void kfd_mem_dmaunmap_attachment(struct kgd_mem *mem, struct kfd_mem_attachment *attachment)
{ … }

static int kfd_mem_export_dmabuf(struct kgd_mem *mem)
{ … }

static int kfd_mem_attach_dmabuf(struct amdgpu_device *adev, struct kgd_mem *mem, struct amdgpu_bo **bo)
{ … }

/* kfd_mem_attach - Add a BO to a VM
 *
 * Everything that needs to be done only once when a BO is first added
 * to a VM. It can later be mapped and unmapped many times without
 * repeating these steps.
 *
 * 0. Create BO for DMA mapping, if needed
 * 1. Allocate and initialize BO VA entry data structure
 * 2. Add BO to the VM
 * 3. Determine ASIC-specific PTE flags
 * 4. Alloc page tables and directories if needed
 * 4a. Validate new page tables and directories
 */
static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem, struct amdgpu_vm *vm, bool is_aql)
{ … }

static void kfd_mem_detach(struct kfd_mem_attachment *attachment)
{ … }
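/*
 * Typical lifecycle around kfd_mem_attach()/kfd_mem_detach() (illustrative
 * sketch only; everything other than the functions declared in this file is
 * an assumption and error handling is abbreviated):
 *
 *	ret = kfd_mem_attach(adev, mem, avm, is_aql);
 *	if (ret)
 *		return ret;
 *	// the resulting attachment(s) can now be mapped and unmapped many
 *	// times via map_bo_to_gpuvm()/unmap_bo_from_gpuvm() defined below
 *	kfd_mem_detach(attachment);
 */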
static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, struct amdkfd_process_info *process_info, bool userptr)
{ … }

static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem, struct amdkfd_process_info *process_info)
{ … }

/* Initializes user pages. It registers the MMU notifier and validates
 * the userptr BO in the GTT domain.
 *
 * The BO must already be on the userptr_valid_list. Otherwise an
 * eviction and restore may happen that leaves the new BO unmapped
 * with the user mode queues running.
 *
 * Takes the process_info->lock to protect against concurrent restore
 * workers.
 *
 * Returns 0 for success, negative errno for errors.
 */
static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr, bool criu_resume)
{ … }

/* Reserving a BO and its page table BOs must happen atomically to
 * avoid deadlocks. Some operations update multiple VMs at once. Track
 * all the reservation info in a context structure. Optionally a sync
 * object can track VM updates.
 */
struct bo_vm_reservation_context { … };

enum bo_vm_match { … };

/**
 * reserve_bo_and_vm - reserve a BO and a VM unconditionally.
 * @mem: KFD BO structure.
 * @vm: the VM to reserve.
 * @ctx: the struct that will be used in unreserve_bo_and_vms().
 */
static int reserve_bo_and_vm(struct kgd_mem *mem, struct amdgpu_vm *vm, struct bo_vm_reservation_context *ctx)
{ … }

/**
 * reserve_bo_and_cond_vms - reserve a BO and some VMs conditionally
 * @mem: KFD BO structure.
 * @vm: the VM to reserve. If NULL, then all VMs associated with the BO
 * are used. Otherwise, a single VM associated with the BO.
 * @map_type: the mapping status that will be used to filter the VMs.
 * @ctx: the struct that will be used in unreserve_bo_and_vms().
 *
 * Returns 0 for success, negative for failure.
 */
static int reserve_bo_and_cond_vms(struct kgd_mem *mem, struct amdgpu_vm *vm, enum bo_vm_match map_type, struct bo_vm_reservation_context *ctx)
{ … }

/**
 * unreserve_bo_and_vms - Unreserve BO and VMs from a reservation context
 * @ctx: Reservation context to unreserve
 * @wait: Optionally wait for a sync object representing pending VM updates
 * @intr: Whether the wait is interruptible
 *
 * Also frees any resources allocated in
 * reserve_bo_and_(cond_)vm(s). Returns the status from
 * amdgpu_sync_wait.
 */
static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, bool wait, bool intr)
{ … }
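/*
 * Typical reservation pattern (illustrative sketch only; error handling is
 * abbreviated and names other than the helpers declared above are assumed):
 *
 *	struct bo_vm_reservation_context ctx;
 *	int ret;
 *
 *	ret = reserve_bo_and_vm(mem, vm, &ctx);
 *	if (ret)
 *		return ret;
 *	// ... validate/map/unmap while the BO and its page table BOs are reserved ...
 *	ret = unreserve_bo_and_vms(&ctx, true, false);
 */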
static void unmap_bo_from_gpuvm(struct kgd_mem *mem, struct kfd_mem_attachment *entry, struct amdgpu_sync *sync)
{ … }

static int update_gpuvm_pte(struct kgd_mem *mem, struct kfd_mem_attachment *entry, struct amdgpu_sync *sync)
{ … }

static int map_bo_to_gpuvm(struct kgd_mem *mem, struct kfd_mem_attachment *entry, struct amdgpu_sync *sync, bool no_update_pte)
{ … }

static int process_validate_vms(struct amdkfd_process_info *process_info, struct ww_acquire_ctx *ticket)
{ … }

static int process_sync_pds_resv(struct amdkfd_process_info *process_info, struct amdgpu_sync *sync)
{ … }

static int process_update_pds(struct amdkfd_process_info *process_info, struct amdgpu_sync *sync)
{ … }

static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, struct dma_fence **ef)
{ … }

/**
 * amdgpu_amdkfd_gpuvm_pin_bo() - Pins a BO using the following criteria
 * @bo: Handle of buffer object being pinned
 * @domain: Domain into which BO should be pinned
 *
 *   - USERPTR BOs are UNPINNABLE and will return an error
 *   - All other BO types (GTT, VRAM, MMIO and DOORBELL) will have their
 *     PIN count incremented. It is valid to PIN a BO multiple times
 *
 * Return: ZERO if successful in pinning, Non-Zero in case of error.
 */
static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
{ … }

/**
 * amdgpu_amdkfd_gpuvm_unpin_bo() - Unpins a BO using the following criteria
 * @bo: Handle of buffer object being unpinned
 *
 *   - Is an illegal request for USERPTR BOs and is ignored
 *   - All other BO types (GTT, VRAM, MMIO and DOORBELL) will have their
 *     PIN count decremented. Calls to UNPIN must balance calls to PIN
 */
static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
{ … }

int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev, struct amdgpu_vm *avm, u32 pasid)
{ … }

int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, struct amdgpu_vm *avm, void **process_info, struct dma_fence **ef)
{ … }

void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, struct amdgpu_vm *vm)
{ … }

void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev, void *drm_priv)
{ … }

uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv)
{ … }

void amdgpu_amdkfd_block_mmu_notifications(void *p)
{ … }

int amdgpu_amdkfd_criu_resume(void *p)
{ … }

size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev, uint8_t xcp_id)
{ … }

int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(struct amdgpu_device *adev, uint64_t va, uint64_t size, void *drm_priv, struct kgd_mem **mem, uint64_t *offset, uint32_t flags, bool criu_resume)
{ … }

int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv, uint64_t *size)
{ … }

int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv)
{ … }

int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
{ … }

int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv)
{ … }

int amdgpu_amdkfd_gpuvm_sync_memory(struct amdgpu_device *adev, struct kgd_mem *mem, bool intr)
{ … }

/**
 * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count
 * @bo: Buffer object to be mapped
 *
 * Before returning, the BO reference count is incremented. To release the
 * reference and unpin/unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
 */
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
{ … }
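/*
 * Illustrative usage (sketch only; the caller is assumed to release the
 * mapping and the reference later with amdgpu_amdkfd_free_gtt_mem(), as
 * noted in the comment above):
 *
 *	ret = amdgpu_amdkfd_map_gtt_bo_to_gart(bo);
 *	if (ret)
 *		return ret;
 *	// ... the BO is now GART mapped and its reference count is elevated ...
 */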
/**
 * amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel() - Map a GTT BO for kernel CPU access
 *
 * @mem: Buffer object to be mapped for CPU access
 * @kptr[out]: pointer in kernel CPU address space
 * @size[out]: size of the buffer
 *
 * Pins the BO and maps it for kernel CPU access. The eviction fence is removed
 * from the BO, since pinned BOs cannot be evicted. The BO must remain on the
 * validate_list, so the GPU mapping can be restored after a page table was
 * evicted.
 *
 * Return: 0 on success, error code on failure
 */
int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem, void **kptr, uint64_t *size)
{ … }

/**
 * amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel() - Unmap a GTT BO for kernel CPU access
 *
 * @mem: Buffer object to be unmapped for CPU access
 *
 * Removes the kernel CPU mapping and unpins the BO. It does not restore the
 * eviction fence, so this function should only be used for cleanup before the
 * BO is destroyed.
 */
void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem)
{ … }

int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev, struct kfd_vm_fault_info *mem)
{ … }

static int import_obj_create(struct amdgpu_device *adev, struct dma_buf *dma_buf, struct drm_gem_object *obj, uint64_t va, void *drm_priv, struct kgd_mem **mem, uint64_t *size, uint64_t *mmap_offset)
{ … }

int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd, uint64_t va, void *drm_priv, struct kgd_mem **mem, uint64_t *size, uint64_t *mmap_offset)
{ … }

int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem, struct dma_buf **dma_buf)
{ … }

/* Evict a userptr BO by stopping the queues if necessary
 *
 * Runs in MMU notifier, may be in RECLAIM_FS context. This means it
 * cannot do any memory allocations, and cannot take any locks that
 * are held elsewhere while allocating memory.
 *
 * It doesn't do anything to the BO itself. The real work happens in
 * restore, where we get updated page addresses. This function only
 * ensures that GPU access to the BO is stopped.
 */
int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, unsigned long cur_seq, struct kgd_mem *mem)
{ … }

/* Update invalid userptr BOs
 *
 * Moves invalidated (evicted) userptr BOs from userptr_valid_list to
 * userptr_inval_list and updates user pages for all BOs that have
 * been invalidated since their last update.
 */
static int update_invalid_user_pages(struct amdkfd_process_info *process_info, struct mm_struct *mm)
{ … }

/* Validate invalid userptr BOs
 *
 * Validates BOs on the userptr_inval_list. Also updates GPUVM page tables
 * with new page addresses and waits for the page table updates to complete.
 */
static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
{ … }

/* Confirm that all user pages are valid while holding the notifier lock
 *
 * Moves valid BOs from the userptr_inval_list back to userptr_valid_list.
 */
static int confirm_valid_user_pages_locked(struct amdkfd_process_info *process_info)
{ … }

/* Worker callback to restore evicted userptr BOs
 *
 * Tries to update and validate all userptr BOs. If successful and no
 * concurrent evictions happened, the queues are restarted. Otherwise,
 * reschedule for another attempt later.
 */
static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
{ … }
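/*
 * Scheduling sketch (illustrative only; assumes amdkfd_process_info has a
 * delayed_work member named restore_userptr_work, which is not shown in this
 * file): a failed or interrupted restore attempt is typically retried by
 * rescheduling the worker after the accumulation delay defined above:
 *
 *	schedule_delayed_work(&process_info->restore_userptr_work,
 *			      msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
 */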
static void replace_eviction_fence(struct dma_fence __rcu **ef, struct dma_fence *new_ef)
{ … }

/**
 * amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
 * KFD process identified by process_info
 *
 * @process_info: amdkfd_process_info of the KFD process
 *
 * After memory eviction, the restore thread calls this function. The function
 * should be called when the process is still valid. BO restore involves:
 *
 * 1. Release the old eviction fence and create a new one
 * 2. Get two copies of the PD BO list from all the VMs. Keep one copy as pd_list.
 * 3. Use the second PD list and kfd_bo_list to create a list (ctx.list) of
 *    BOs that need to be reserved.
 * 4. Reserve all the BOs
 * 5. Validate PD and PT BOs.
 * 6. Validate all KFD BOs using kfd_bo_list, map them and add a new fence
 * 7. Add the fence to all PD and PT BOs.
 * 8. Unreserve all BOs
 */
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef)
{ … }

int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem)
{ … }

int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem)
{ … }

/* Returns GPU-specific tiling mode information */
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, struct tile_config *config)
{ … }

bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem)
{ … }

#if defined(CONFIG_DEBUG_FS)

int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data)
{ … }

#endif