// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched/task.h>
#include <linux/dynamic_debug.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/drm_exec.h>

#include "amdgpu_sync.h"
#include "amdgpu_object.h"
#include "amdgpu_vm.h"
#include "amdgpu_hmm.h"
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"
#include "kfd_smi_events.h"

#ifdef dev_fmt
#undef dev_fmt
#endif
#define dev_fmt(fmt) …

#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS …

/* Long enough to ensure no retry fault comes after svm range is restored and
 * page table is updated.
 */
#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING …

#if IS_ENABLED(CONFIG_DYNAMIC_DEBUG)
#define dynamic_svm_range_dump(svms) …
#else
#define dynamic_svm_range_dump …
#endif

/* A giant svm range is split into smaller ranges based on this value. It is
 * determined as the minimum over all dGPUs/APUs of 1/32 of the VRAM size,
 * clamped between 2MB and 1GB and aligned to a power of 2 in MB.
 */
static uint64_t max_svm_range_pages;

struct criu_svm_metadata { … };

static void svm_range_evict_svm_bo_worker(struct work_struct *work);
static bool
svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
				    const struct mmu_notifier_range *range,
				    unsigned long cur_seq);
static int
svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
		   uint64_t *bo_s, uint64_t *bo_l);
static const struct mmu_interval_notifier_ops svm_range_mn_ops = …;
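/*
 * Illustrative sketch (not driver code): one way to express the sizing rule
 * described in the comment above max_svm_range_pages, assuming clamp() from
 * <linux/minmax.h>, rounddown_pow_of_two() from <linux/log2.h> and PAGE_SHIFT.
 * The helper name and the rounding choice are illustrative; the driver's
 * actual computation lives in svm_range_set_max_pages(), whose body is elided
 * here.
 *
 *	static uint64_t example_svm_max_range_pages(uint64_t vram_bytes)
 *	{
 *		uint64_t pages = (vram_bytes / 32) >> PAGE_SHIFT;
 *
 *		pages = clamp(pages, 1ULL << (21 - PAGE_SHIFT),	// at least 2MB
 *			      1ULL << (30 - PAGE_SHIFT));	// at most 1GB
 *		return rounddown_pow_of_two(pages);
 *	}
 *
 * A per-process limit would then be the minimum of this value over all
 * attached GPUs.
 */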
/**
 * svm_range_unlink - unlink svm_range from lists and interval tree
 * @prange: svm range structure to be removed
 *
 * Remove the svm_range from the svms and svm_bo lists and the svms
 * interval tree.
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_unlink(struct svm_range *prange) { … }

static void svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange) { … }

/**
 * svm_range_add_to_svms - add svm range to svms
 * @prange: svm range structure to be added
 *
 * Add the svm range to svms interval tree and link list
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_add_to_svms(struct svm_range *prange) { … }

static void svm_range_remove_notifier(struct svm_range *prange) { … }

static bool svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr) { … }

static int svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, unsigned long offset,
				 unsigned long npages, unsigned long *hmm_pfns, uint32_t gpuidx) { … }

static int svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, unsigned long offset,
			     unsigned long npages, unsigned long *hmm_pfns) { … }

void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr, unsigned long offset,
			     unsigned long npages) { … }

void svm_range_dma_unmap(struct svm_range *prange) { … }

static void svm_range_free(struct svm_range *prange, bool do_unmap) { … }

static void svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
					     uint8_t *granularity, uint32_t *flags) { … }

static struct svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, uint64_t last,
				       bool update_mem_usage) { … }

static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo) { … }

static void svm_range_bo_release(struct kref *kref) { … }

static void svm_range_bo_wq_release(struct work_struct *work) { … }

static void svm_range_bo_release_async(struct kref *kref) { … }

void svm_range_bo_unref_async(struct svm_range_bo *svm_bo) { … }

static void svm_range_bo_unref(struct svm_range_bo *svm_bo) { … }

static bool svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange) { … }

static struct svm_range_bo *svm_range_bo_new(void) { … }

int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, bool clear) { … }

void svm_range_vram_node_free(struct svm_range *prange) { … }

struct kfd_node *svm_range_get_node_by_id(struct svm_range *prange, uint32_t gpu_id) { … }

struct kfd_process_device *svm_range_get_pdd_by_node(struct svm_range *prange, struct kfd_node *node) { … }

static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo) { … }

static int svm_range_check_attr(struct kfd_process *p, uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) { … }

static void svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, uint32_t nattr,
				  struct kfd_ioctl_svm_attribute *attrs, bool *update_mapping) { … }

static bool svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange, uint32_t nattr,
				    struct kfd_ioctl_svm_attribute *attrs) { … }

/**
 * svm_range_debug_dump - print all range information from svms
 * @svms: svm range list header
 *
 * debug output svm range start, end, prefetch location from svms
 * interval tree and link list
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_debug_dump(struct svm_range_list *svms) { … }

static void *svm_range_copy_array(void *psrc, size_t size, uint64_t num_elements, uint64_t offset,
				  uint64_t *vram_pages) { … }

static int svm_range_copy_dma_addrs(struct svm_range *dst, struct svm_range *src) { … }

static int
svm_range_split_array(void *ppnew, void *ppold, size_t size,
		      uint64_t old_start, uint64_t old_n,
		      uint64_t new_start, uint64_t new_n,
		      uint64_t *new_vram_pages)
{ … }

static int svm_range_split_pages(struct svm_range *new, struct svm_range *old, uint64_t start, uint64_t last) { … }

static int svm_range_split_nodes(struct svm_range *new, struct svm_range *old, uint64_t start, uint64_t last) { … }

/**
 * svm_range_split_adjust - split range and adjust
 *
 * @new: new range
 * @old: the old range
 * @start: the start address, in pages, that the old range is adjusted to
 * @last: the last address, in pages, that the old range is adjusted to
 *
 * Copy system memory dma_addr or vram ttm_res from the old range to the new
 * range, from new_start up to size new->npages. The remaining old range is
 * from start to last.
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory
 */
static int svm_range_split_adjust(struct svm_range *new, struct svm_range *old, uint64_t start, uint64_t last) { … }

/**
 * svm_range_split - split a range into 2 ranges
 *
 * @prange: the svm range to split
 * @start: the remaining range start address in pages
 * @last: the remaining range last address in pages
 * @new: the result new range generated
 *
 * Two cases only:
 * case 1: if start == prange->start
 *         prange ==> prange[start, last]
 *         new range [last + 1, prange->last]
 *
 * case 2: if last == prange->last
 *         prange ==> prange[start, last]
 *         new range [prange->start, start - 1]
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
 */
static int svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last, struct svm_range **new) { … }

static int svm_range_split_tail(struct svm_range *prange, uint64_t new_last, struct list_head *insert_list,
				struct list_head *remap_list) { … }

static int svm_range_split_head(struct svm_range *prange, uint64_t new_start, struct list_head *insert_list,
				struct list_head *remap_list) { … }

static void svm_range_add_child(struct svm_range *prange, struct mm_struct *mm, struct svm_range *pchild,
				enum svm_work_list_ops op) { … }

static bool svm_nodes_in_same_hive(struct kfd_node *node_a, struct kfd_node *node_b) { … }

static uint64_t svm_range_get_pte_flags(struct kfd_node *node, struct svm_range *prange, int domain) { … }

static int svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, uint64_t start,
				    uint64_t last, struct dma_fence **fence) { … }

static int svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, unsigned long last,
				     uint32_t trigger) { … }

static int svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange, unsigned long offset,
				unsigned long npages, bool readonly, dma_addr_t *dma_addr,
				struct amdgpu_device *bo_adev, struct dma_fence **fence, bool flush_tlb) { … }

static int svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, unsigned long npages,
				 bool readonly, unsigned long *bitmap, bool wait, bool flush_tlb) { … }

struct svm_validate_context { … };

static int svm_range_reserve_bos(struct svm_validate_context *ctx, bool intr) { … }

static void svm_range_unreserve_bos(struct svm_validate_context *ctx) { … }

static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx) { … }
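/*
 * Illustrative sketch (not driver code): the interval arithmetic behind the
 * two cases in the svm_range_split() kernel-doc above, on a simplified
 * standalone struct. Bounds checking against [old->start, old->last] and the
 * dma_addr/ttm_res copying are omitted; error values assume <errno.h>.
 *
 *	struct example_range { uint64_t start, last; };	// inclusive, in pages
 *
 *	static int example_split(struct example_range *old, uint64_t start,
 *				 uint64_t last, struct example_range *new)
 *	{
 *		if (start == old->start && last == old->last)
 *			return -EINVAL;		// nothing left to split off
 *		if (start == old->start) {	// case 1: new takes the tail
 *			new->start = last + 1;
 *			new->last = old->last;
 *		} else if (last == old->last) {	// case 2: new takes the head
 *			new->start = old->start;
 *			new->last = start - 1;
 *		} else {
 *			return -EINVAL;		// one boundary must stay fixed
 *		}
 *		old->start = start;
 *		old->last = last;
 *		return 0;
 *	}
 */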
/*
 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
 *
 * To prevent concurrent destruction or change of range attributes, the
 * svm_read_lock must be held. The caller must not hold the svm_write_lock
 * because that would block concurrent evictions and lead to deadlocks. To
 * serialize concurrent migrations or validations of the same range, the
 * prange->migrate_mutex must be held.
 *
 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
 * eviction fence).
 *
 * The following sequence ensures race-free validation and GPU mapping:
 *
 * 1. Reserve page table (and SVM BO if range is in VRAM)
 * 2. hmm_range_fault to get page addresses (if system memory)
 * 3. DMA-map pages (if system memory)
 * 4-a. Take notifier lock
 * 4-b. Check that pages are still valid (mmu_interval_read_retry)
 * 4-c. Check that the range was not split or otherwise invalidated
 * 4-d. Update GPU page table
 * 4-e. Release notifier lock
 * 5. Release page table (and SVM BO) reservation
 */
static int svm_range_validate_and_map(struct mm_struct *mm, unsigned long map_start, unsigned long map_last,
				      struct svm_range *prange, int32_t gpuidx, bool intr, bool wait,
				      bool flush_tlb) { … }

/**
 * svm_range_list_lock_and_flush_work - flush pending deferred work
 *
 * @svms: the svm range list
 * @mm: the mm structure
 *
 * Context: Returns with mmap write lock held, pending deferred work flushed
 */
void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_struct *mm) { … }

static void svm_range_restore_work(struct work_struct *work) { … }

/**
 * svm_range_evict - evict svm range
 * @prange: svm range structure
 * @mm: current process mm_struct
 * @start: starting address of the range being evicted, in pages
 * @last: last address of the range being evicted, in pages
 * @event: mmu notifier event when range is evicted or migrated
 *
 * Stop all queues of the process to ensure GPU doesn't access the memory, then
 * return to let the CPU evict the buffer and proceed with the CPU page table
 * update.
 *
 * No lock is needed to sync the CPU page table invalidation with GPU execution.
 * If an invalidation happens while the restore work is running, the restore
 * work restarts to make sure the latest CPU page mapping is reflected on the
 * GPU, then starts the queues.
 */
static int svm_range_evict(struct svm_range *prange, struct mm_struct *mm, unsigned long start,
			   unsigned long last, enum mmu_notifier_event event) { … }

static struct svm_range *svm_range_clone(struct svm_range *old) { … }

void svm_range_set_max_pages(struct amdgpu_device *adev) { … }

static int svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last, uint64_t max_pages,
			       struct list_head *insert_list, struct list_head *update_list) { … }
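/*
 * Illustrative sketch (not driver code): the sequence documented above at
 * svm_range_validate_and_map() follows the generic HMM plus
 * mmu_interval_notifier pattern from Documentation/mm/hmm.rst. A minimal,
 * simplified form of that pattern is shown below; example_fault_and_map() and
 * driver_lock are placeholders, and the GPU page table update (step 4-d) is
 * only a comment.
 *
 *	static int example_fault_and_map(struct mm_struct *mm,
 *					 struct mmu_interval_notifier *mni,
 *					 struct hmm_range *range,
 *					 struct mutex *driver_lock)
 *	{
 *		int ret;
 *
 *	again:
 *		range->notifier_seq = mmu_interval_read_begin(mni);
 *		mmap_read_lock(mm);
 *		ret = hmm_range_fault(range);		// step 2: get page addresses
 *		mmap_read_unlock(mm);
 *		if (ret) {
 *			if (ret == -EBUSY)
 *				goto again;		// pages changed while faulting
 *			return ret;
 *		}
 *		mutex_lock(driver_lock);		// step 4-a: take notifier lock
 *		if (mmu_interval_read_retry(mni, range->notifier_seq)) {
 *			mutex_unlock(driver_lock);	// step 4-b: stale, start over
 *			goto again;
 *		}
 *		// step 4-d: update the GPU page table using range->hmm_pfns here
 *		mutex_unlock(driver_lock);		// step 4-e: release notifier lock
 *		return 0;
 *	}
 */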
/**
 * svm_range_add - add svm range and handle overlap
 * @p: the process to add the range to (its svms)
 * @start: page size aligned
 * @size: page size aligned
 * @nattr: number of attributes
 * @attrs: array of attributes
 * @update_list: output, the ranges that need validation and GPU mapping update
 * @insert_list: output, the ranges that need to be inserted into svms
 * @remove_list: output, the ranges that are replaced and need removal from svms
 * @remap_list: output, remap unaligned svm ranges
 *
 * Check if the virtual address range overlaps with any existing ranges, split
 * partly overlapping ranges and add new ranges in the gaps. All changes should
 * be applied to the range_list and interval tree transactionally. If any range
 * split or allocation fails, the entire update fails. Therefore any existing
 * overlapping svm_ranges are cloned and the original svm_ranges are left
 * unchanged.
 *
 * If the transaction succeeds, the caller can update and insert clones and
 * new ranges, then free the originals.
 *
 * Otherwise the caller can free the clones and new ranges, while the old
 * svm_ranges remain unchanged.
 *
 * Context: Process context, caller must hold svms->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, uint32_t nattr,
			 struct kfd_ioctl_svm_attribute *attrs, struct list_head *update_list,
			 struct list_head *insert_list, struct list_head *remove_list,
			 struct list_head *remap_list) { … }

static void svm_range_update_notifier_and_interval_tree(struct mm_struct *mm, struct svm_range *prange) { … }

static void svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange,
				     struct mm_struct *mm) { … }

static void svm_range_drain_retry_fault(struct svm_range_list *svms) { … }

static void svm_range_deferred_list_work(struct work_struct *work) { … }

void svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, struct mm_struct *mm,
			     enum svm_work_list_ops op) { … }

void schedule_deferred_list_work(struct svm_range_list *svms) { … }

static void svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, struct svm_range *prange,
				  unsigned long start, unsigned long last) { … }

static void svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, unsigned long start,
				     unsigned long last) { … }

/**
 * svm_range_cpu_invalidate_pagetables - interval notifier callback
 * @mni: mmu_interval_notifier struct
 * @range: mmu_notifier_range struct
 * @cur_seq: value to pass to mmu_interval_set_seq()
 *
 * If the event is MMU_NOTIFY_UNMAP, this is a CPU unmap of the range;
 * otherwise it comes from migration or a CPU page invalidation callback.
 *
 * For an unmap event, unmap the range from GPUs, remove the prange from svms
 * in a deferred work thread, and split the prange if only part of it is
 * unmapped.
 *
 * For an invalidation event, if GPU retry fault is not enabled, evict the
 * queues, then schedule svm_range_restore_work to update the GPU mapping and
 * resume the queues. If GPU retry fault is enabled, unmap the svm range from
 * the GPU; the retry fault will update the GPU mapping to recover.
 *
 * Context: mmap lock and notifier_invalidate_start lock are held for an
 * invalidate event; prange lock is held if this is from migration
 */
static bool svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
						const struct mmu_notifier_range *range,
						unsigned long cur_seq) { … }

/**
 * svm_range_from_addr - find svm range from fault address
 * @svms: svm range list header
 * @addr: address to search range interval tree, in pages
 * @parent: parent range if range is on child list
 *
 * Context: The caller must hold svms->lock
 *
 * Return: the svm_range found or NULL
 */
struct svm_range *svm_range_from_addr(struct svm_range_list *svms, unsigned long addr,
				      struct svm_range **parent) { … }
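/*
 * Illustrative sketch (not driver code): the general shape of an
 * mmu_interval_notifier invalidate callback as described above for
 * svm_range_cpu_invalidate_pagetables(). Only mmu_notifier_range_blockable(),
 * mmu_interval_set_seq() and the event check mirror real obligations; the
 * driver_lock_range()/driver_unlock_range() helpers and the branch bodies are
 * placeholders.
 *
 *	static bool example_invalidate(struct mmu_interval_notifier *mni,
 *				       const struct mmu_notifier_range *range,
 *				       unsigned long cur_seq)
 *	{
 *		if (!mmu_notifier_range_blockable(range))
 *			return false;			// cannot sleep, let caller retry
 *
 *		driver_lock_range(mni);			// placeholder for range/notifier lock
 *		mmu_interval_set_seq(mni, cur_seq);	// invalidate the cached sequence count
 *
 *		if (range->event == MMU_NOTIFY_UNMAP) {
 *			// unmap from GPUs, schedule deferred removal/split of the range
 *		} else {
 *			// evict queues and schedule restore, or rely on GPU retry faults
 *		}
 *		driver_unlock_range(mni);		// placeholder
 *		return true;
 *	}
 */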
/* svm_range_best_restore_location - decide the best fault restore location
 * @prange: svm range structure
 * @node: the kfd node on which the vm fault happened
 *
 * This is only called when xnack is on, to decide the best location to restore
 * the range mapping after a GPU vm fault. The caller uses the best location to
 * migrate if the actual loc is not the best location, then updates the GPU
 * page table mapping to the best location.
 *
 * If the preferred loc is accessible by the faulting GPU, use the preferred loc.
 * If the vm fault gpu idx is on the range ACCESSIBLE bitmap, best_loc is the
 * vm fault gpu.
 * If the vm fault gpu idx is on the range ACCESSIBLE_IN_PLACE bitmap, then
 *   if the range actual loc is cpu, best_loc is cpu;
 *   if the vm fault gpu is in the same xgmi hive as the range actual loc gpu,
 *   best_loc is the range actual loc.
 * Otherwise, the GPU has no access and best_loc is -1.
 *
 * Return:
 * -1 means the vm fault GPU has no access
 * 0 for CPU or GPU id
 */
static int32_t svm_range_best_restore_location(struct svm_range *prange, struct kfd_node *node,
					       int32_t *gpuidx) { … }

static int svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, unsigned long *start,
					  unsigned long *last, bool *is_heap_stack) { … }

static int svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last,
				      uint64_t *bo_s, uint64_t *bo_l) { … }

static struct svm_range *svm_range_create_unregistered_range(struct kfd_node *node, struct kfd_process *p,
							      struct mm_struct *mm, int64_t addr) { … }

/* svm_range_skip_recover - decide if prange can be recovered
 * @prange: svm range structure
 *
 * The GPU vm retry fault handler skips recovering the range in these cases:
 * 1. prange is on the deferred list to be removed after unmap; this is a stale
 *    fault, and the deferred list work will drain it before freeing the prange.
 * 2. prange is on the deferred list to add the interval notifier after split, or
 * 3. prange is a child range split from a parent prange; recover it later,
 *    after the interval notifier is added.
 *
 * Return: true to skip recover, false to recover
 */
static bool svm_range_skip_recover(struct svm_range *prange) { … }

static void svm_range_count_fault(struct kfd_node *node, struct kfd_process *p, int32_t gpuidx) { … }

static bool svm_fault_allowed(struct vm_area_struct *vma, bool write_fault) { … }

int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, uint32_t vmid, uint32_t node_id,
			    uint64_t addr, bool write_fault) { … }

int svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled) { … }

void svm_range_list_fini(struct kfd_process *p) { … }

int svm_range_list_init(struct kfd_process *p) { … }

/**
 * svm_range_check_vm - check if virtual address range mapped already
 * @p: current kfd_process
 * @start: range start address, in pages
 * @last: range last address, in pages
 * @bo_s: mapping start address in pages if address range already mapped
 * @bo_l: mapping last address in pages if address range already mapped
 *
 * The purpose is to avoid virtual address ranges already allocated by the
 * kfd_ioctl_alloc_memory_of_gpu ioctl. It checks each pdd in the kfd_process.
 *
 * Context: Process context
 *
 * Return: 0 - OK, if the range is not mapped.
 * Otherwise error code:
 * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu
 * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by
 * a signal. Release all buffer reservations and return to user-space.
 */
static int svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
			      uint64_t *bo_s, uint64_t *bo_l) { … }

/**
 * svm_range_is_valid - check if the virtual address range is valid
 * @p: current kfd_process
 * @start: range start address, in pages
 * @size: range size, in pages
 *
 * A valid virtual address range means it belongs to one or more VMAs.
 *
 * Context: Process context
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size) { … }
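/*
 * Illustrative sketch (not driver code): one plausible way to do the VMA
 * coverage check described above for svm_range_is_valid(), using byte
 * addresses for simplicity. It assumes the caller holds the mmap read lock,
 * relies only on the core-kernel find_vma() helper, and omits the additional
 * VMA flag checks the driver may apply.
 *
 *	static int example_range_is_valid(struct mm_struct *mm,
 *					  unsigned long start, unsigned long end)
 *	{
 *		unsigned long addr = start;
 *		struct vm_area_struct *vma;
 *
 *		while (addr < end) {
 *			vma = find_vma(mm, addr);	// first VMA with vm_end > addr
 *			if (!vma || addr < vma->vm_start)
 *				return -EFAULT;		// hole: no VMA backs this address
 *			addr = vma->vm_end;		// continue past this VMA
 *		}
 *		return 0;
 *	}
 */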
/**
 * svm_range_best_prefetch_location - decide the best prefetch location
 * @prange: svm range structure
 *
 * For xnack off:
 * If the range maps to a single GPU, the best prefetch location is
 * prefetch_loc, which can be CPU or GPU.
 *
 * If the range is ACCESS or ACCESS_IN_PLACE by mGPUs, the best prefetch
 * location is the prefetch_loc GPU only if the mGPUs are connected in the same
 * XGMI hive; otherwise the best prefetch location is always CPU, because a GPU
 * cannot coherently map the VRAM of other GPUs even with a large-BAR PCIe
 * connection.
 *
 * For xnack on:
 * If the range is not ACCESS_IN_PLACE by mGPUs, the best prefetch location is
 * prefetch_loc; other GPU accesses will generate vm faults and trigger
 * migration.
 *
 * If the range is ACCESS_IN_PLACE by mGPUs, the best prefetch location is the
 * prefetch_loc GPU only if the mGPUs are connected in the same XGMI hive;
 * otherwise the best prefetch location is always CPU.
 *
 * Context: Process context
 *
 * Return:
 * 0 for CPU or GPU id
 */
static uint32_t svm_range_best_prefetch_location(struct svm_range *prange) { … }

/* svm_range_trigger_migration - start page migration if prefetch loc changed
 * @mm: current process mm_struct
 * @prange: svm range structure
 * @migrated: output, true if migration is triggered
 *
 * If the range prefetch_loc is a GPU and the actual loc is cpu (0), migrate
 * the range from ram to vram.
 * If the range prefetch_loc is cpu (0) and the actual loc is a GPU, migrate
 * the range from vram to ram.
 *
 * If GPU vm fault retry is not enabled, migration interacts with the MMU
 * notifier and the restore work:
 * 1. migrate_vma_setup invalidates pages, the MMU notifier callback
 *    svm_range_evict stops all queues and schedules the restore work
 * 2. svm_range_restore_work waits for the migration to be done by
 *    a. svm_range_validate_vram taking prange->migrate_mutex
 *    b. svm_range_validate_ram HMM get pages waiting for the CPU fault handler
 *       to return
 * 3. the restore work updates the GPU mappings and resumes all queues.
 *
 * Context: Process context
 *
 * Return:
 * 0 - OK, otherwise - error code of migration
 */
static int svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, bool *migrated) { … }

int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) { … }

static void svm_range_evict_svm_bo_worker(struct work_struct *work) { … }

static int svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, uint64_t start, uint64_t size,
			      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) { … }

static int svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm, uint64_t start, uint64_t size,
			      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) { … }

int kfd_criu_resume_svm(struct kfd_process *p) { … }

int kfd_criu_restore_svm(struct kfd_process *p, uint8_t __user *user_priv_ptr, uint64_t *priv_data_offset,
			 uint64_t max_priv_data_size) { … }

int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, uint64_t *svm_priv_data_size) { … }

int kfd_criu_checkpoint_svm(struct kfd_process *p, uint8_t __user *user_priv_data,
			    uint64_t *priv_data_offset) { … }

int svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start, uint64_t size, uint32_t nattrs,
	      struct kfd_ioctl_svm_attribute *attrs) { … }
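/*
 * Illustrative sketch (not driver code): the prefetch-driven migration
 * direction decision described above at svm_range_trigger_migration(). As in
 * that comment, 0 stands for system memory (CPU); the enum and helper name are
 * illustrative only.
 *
 *	enum example_migration_dir {
 *		EXAMPLE_MIGRATE_NONE,
 *		EXAMPLE_MIGRATE_RAM_TO_VRAM,
 *		EXAMPLE_MIGRATE_VRAM_TO_RAM,
 *	};
 *
 *	static enum example_migration_dir
 *	example_migration_direction(uint32_t best_prefetch_loc, uint32_t actual_loc)
 *	{
 *		if (best_prefetch_loc == actual_loc)
 *			return EXAMPLE_MIGRATE_NONE;		// already at preferred location
 *		if (best_prefetch_loc != 0 && actual_loc == 0)
 *			return EXAMPLE_MIGRATE_RAM_TO_VRAM;	// prefetch to GPU, pages in ram
 *		if (best_prefetch_loc == 0 && actual_loc != 0)
 *			return EXAMPLE_MIGRATE_VRAM_TO_RAM;	// prefetch to CPU, pages in vram
 *		return EXAMPLE_MIGRATE_NONE;			// GPU-to-GPU is not decided here
 *	}
 */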