// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. * * The iopt_pages is the center of the storage and motion of PFNs. Each * iopt_pages represents a logical linear array of full PFNs. The array is 0 * based and has npages in it. Accessors use 'index' to refer to the entry in * this logical array, regardless of its storage location. * * PFNs are stored in a tiered scheme: * 1) iopt_pages::pinned_pfns xarray * 2) An iommu_domain * 3) The origin of the PFNs, i.e. the userspace pointer * * PFN have to be copied between all combinations of tiers, depending on the * configuration. * * When a PFN is taken out of the userspace pointer it is pinned exactly once. * The storage locations of the PFN's index are tracked in the two interval * trees. If no interval includes the index then it is not pinned. * * If access_itree includes the PFN's index then an in-kernel access has * requested the page. The PFN is stored in the xarray so other requestors can * continue to find it. * * If the domains_itree includes the PFN's index then an iommu_domain is storing * the PFN and it can be read back using iommu_iova_to_phys(). To avoid * duplicating storage the xarray is not used if only iommu_domains are using * the PFN's index. * * As a general principle this is designed so that destroy never fails. This * means removing an iommu_domain or releasing a in-kernel access will not fail * due to insufficient memory. In practice this means some cases have to hold * PFNs in the xarray even though they are also being stored in an iommu_domain. * * While the iopt_pages can use an iommu_domain as storage, it does not have an * IOVA itself. Instead the iopt_area represents a range of IOVA and uses the * iopt_pages as the PFN provider. Multiple iopt_areas can share the iopt_pages * and reference their own slice of the PFN array, with sub page granularity. * * In this file the term 'last' indicates an inclusive and closed interval, eg * [0,0] refers to a single PFN. 'end' means an open range, eg [0,0) refers to * no PFNs. * * Be cautious of overflow. An IOVA can go all the way up to U64_MAX, so * last_iova + 1 can overflow. An iopt_pages index will always be much less than * ULONG_MAX so last_index + 1 cannot overflow. */ #include <linux/highmem.h> #include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/kthread.h> #include <linux/overflow.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include "double_span.h" #include "io_pagetable.h" #ifndef CONFIG_IOMMUFD_TEST #define TEMP_MEMORY_LIMIT … #else #define TEMP_MEMORY_LIMIT … #endif #define BATCH_BACKUP_SIZE … /* * More memory makes pin_user_pages() and the batching more efficient, but as * this is only a performance optimization don't try too hard to get it. A 64k * allocation can hold about 26M of 4k pages and 13G of 2M pages in an * pfn_batch. Various destroy paths cannot fail and provide a small amount of * stack memory as a backup contingency. If backup_len is given this cannot * fail. */ static void *temp_kmalloc(size_t *size, void *backup, size_t backup_len) { … } void interval_tree_double_span_iter_update( struct interval_tree_double_span_iter *iter) { … } void interval_tree_double_span_iter_first( struct interval_tree_double_span_iter *iter, struct rb_root_cached *itree1, struct rb_root_cached *itree2, unsigned long first_index, unsigned long last_index) { … } void interval_tree_double_span_iter_next( struct interval_tree_double_span_iter *iter) { … } static void iopt_pages_add_npinned(struct iopt_pages *pages, size_t npages) { … } static void iopt_pages_sub_npinned(struct iopt_pages *pages, size_t npages) { … } static void iopt_pages_err_unpin(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index, struct page **page_list) { … } /* * index is the number of PAGE_SIZE units from the start of the area's * iopt_pages. If the iova is sub page-size then the area has an iova that * covers a portion of the first and last pages in the range. */ static unsigned long iopt_area_index_to_iova(struct iopt_area *area, unsigned long index) { … } static unsigned long iopt_area_index_to_iova_last(struct iopt_area *area, unsigned long index) { … } static void iommu_unmap_nofail(struct iommu_domain *domain, unsigned long iova, size_t size) { … } static void iopt_area_unmap_domain_range(struct iopt_area *area, struct iommu_domain *domain, unsigned long start_index, unsigned long last_index) { … } static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, unsigned long index) { … } /* * A simple datastructure to hold a vector of PFNs, optimized for contiguous * PFNs. This is used as a temporary holding memory for shuttling pfns from one * place to another. Generally everything is made more efficient if operations * work on the largest possible grouping of pfns. eg fewer lock/unlock cycles, * better cache locality, etc */ struct pfn_batch { … }; static void batch_clear(struct pfn_batch *batch) { … } /* * Carry means we carry a portion of the final hugepage over to the front of the * batch */ static void batch_clear_carry(struct pfn_batch *batch, unsigned int keep_pfns) { … } static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns) { … } static int __batch_init(struct pfn_batch *batch, size_t max_pages, void *backup, size_t backup_len) { … } static int batch_init(struct pfn_batch *batch, size_t max_pages) { … } static void batch_init_backup(struct pfn_batch *batch, size_t max_pages, void *backup, size_t backup_len) { … } static void batch_destroy(struct pfn_batch *batch, void *backup) { … } /* true if the pfn was added, false otherwise */ static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) { … } /* * Fill the batch with pfns from the domain. When the batch is full, or it * reaches last_index, the function will return. The caller should use * batch->total_pfns to determine the starting point for the next iteration. */ static void batch_from_domain(struct pfn_batch *batch, struct iommu_domain *domain, struct iopt_area *area, unsigned long start_index, unsigned long last_index) { … } static struct page **raw_pages_from_domain(struct iommu_domain *domain, struct iopt_area *area, unsigned long start_index, unsigned long last_index, struct page **out_pages) { … } /* Continues reading a domain until we reach a discontinuity in the pfns. */ static void batch_from_domain_continue(struct pfn_batch *batch, struct iommu_domain *domain, struct iopt_area *area, unsigned long start_index, unsigned long last_index) { … } /* * This is part of the VFIO compatibility support for VFIO_TYPE1_IOMMU. That * mode permits splitting a mapped area up, and then one of the splits is * unmapped. Doing this normally would cause us to violate our invariant of * pairing map/unmap. Thus, to support old VFIO compatibility disable support * for batching consecutive PFNs. All PFNs mapped into the iommu are done in * PAGE_SIZE units, not larger or smaller. */ static int batch_iommu_map_small(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { … } static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, struct iopt_area *area, unsigned long start_index) { … } static void batch_from_xarray(struct pfn_batch *batch, struct xarray *xa, unsigned long start_index, unsigned long last_index) { … } static void batch_from_xarray_clear(struct pfn_batch *batch, struct xarray *xa, unsigned long start_index, unsigned long last_index) { … } static void clear_xarray(struct xarray *xa, unsigned long start_index, unsigned long last_index) { … } static int pages_to_xarray(struct xarray *xa, unsigned long start_index, unsigned long last_index, struct page **pages) { … } static void batch_from_pages(struct pfn_batch *batch, struct page **pages, size_t npages) { … } static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, unsigned int first_page_off, size_t npages) { … } static void copy_data_page(struct page *page, void *data, unsigned long offset, size_t length, unsigned int flags) { … } static unsigned long batch_rw(struct pfn_batch *batch, void *data, unsigned long offset, unsigned long length, unsigned int flags) { … } /* pfn_reader_user is just the pin_user_pages() path */ struct pfn_reader_user { … }; static void pfn_reader_user_init(struct pfn_reader_user *user, struct iopt_pages *pages) { … } static void pfn_reader_user_destroy(struct pfn_reader_user *user, struct iopt_pages *pages) { … } static int pfn_reader_user_pin(struct pfn_reader_user *user, struct iopt_pages *pages, unsigned long start_index, unsigned long last_index) { … } /* This is the "modern" and faster accounting method used by io_uring */ static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) { … } static void decr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) { … } /* This is the accounting method used for compatibility with VFIO */ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, bool inc, struct pfn_reader_user *user) { … } static int do_update_pinned(struct iopt_pages *pages, unsigned long npages, bool inc, struct pfn_reader_user *user) { … } static void update_unpinned(struct iopt_pages *pages) { … } /* * Changes in the number of pages pinned is done after the pages have been read * and processed. If the user lacked the limit then the error unwind will unpin * everything that was just pinned. This is because it is expensive to calculate * how many pages we have already pinned within a range to generate an accurate * prediction in advance of doing the work to actually pin them. */ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, struct iopt_pages *pages) { … } /* * PFNs are stored in three places, in order of preference: * - The iopt_pages xarray. This is only populated if there is a * iopt_pages_access * - The iommu_domain under an area * - The original PFN source, ie pages->source_mm * * This iterator reads the pfns optimizing to load according to the * above order. */ struct pfn_reader { … }; static int pfn_reader_update_pinned(struct pfn_reader *pfns) { … } /* * The batch can contain a mixture of pages that are still in use and pages that * need to be unpinned. Unpin only pages that are not held anywhere else. */ static void pfn_reader_unpin(struct pfn_reader *pfns) { … } /* Process a single span to load it from the proper storage */ static int pfn_reader_fill_span(struct pfn_reader *pfns) { … } static bool pfn_reader_done(struct pfn_reader *pfns) { … } static int pfn_reader_next(struct pfn_reader *pfns) { … } static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, unsigned long start_index, unsigned long last_index) { … } /* * There are many assertions regarding the state of pages->npinned vs * pages->last_pinned, for instance something like unmapping a domain must only * decrement the npinned, and pfn_reader_destroy() must be called only after all * the pins are updated. This is fine for success flows, but error flows * sometimes need to release the pins held inside the pfn_reader before going on * to complete unmapping and releasing pins held in domains. */ static void pfn_reader_release_pins(struct pfn_reader *pfns) { … } static void pfn_reader_destroy(struct pfn_reader *pfns) { … } static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, unsigned long start_index, unsigned long last_index) { … } struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, bool writable) { … } void iopt_release_pages(struct kref *kref) { … } static void iopt_area_unpin_domain(struct pfn_batch *batch, struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain, unsigned long start_index, unsigned long last_index, unsigned long *unmapped_end_index, unsigned long real_last_index) { … } static void __iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain, unsigned long last_index) { … } static void iopt_area_unfill_partial_domain(struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain, unsigned long end_index) { … } /** * iopt_area_unmap_domain() - Unmap without unpinning PFNs in a domain * @area: The IOVA range to unmap * @domain: The domain to unmap * * The caller must know that unpinning is not required, usually because there * are other domains in the iopt. */ void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain) { … } /** * iopt_area_unfill_domain() - Unmap and unpin PFNs in a domain * @area: IOVA area to use * @pages: page supplier for the area (area->pages is NULL) * @domain: Domain to unmap from * * The domain should be removed from the domains_itree before calling. The * domain will always be unmapped, but the PFNs may not be unpinned if there are * still accesses. */ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain) { … } /** * iopt_area_fill_domain() - Map PFNs from the area into a domain * @area: IOVA area to use * @domain: Domain to load PFNs into * * Read the pfns from the area's underlying iopt_pages and map them into the * given domain. Called when attaching a new domain to an io_pagetable. */ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain) { … } /** * iopt_area_fill_domains() - Install PFNs into the area's domains * @area: The area to act on * @pages: The pages associated with the area (area->pages is NULL) * * Called during area creation. The area is freshly created and not inserted in * the domains_itree yet. PFNs are read and loaded into every domain held in the * area's io_pagetable and the area is installed in the domains_itree. * * On failure all domains are left unchanged. */ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) { … } /** * iopt_area_unfill_domains() - unmap PFNs from the area's domains * @area: The area to act on * @pages: The pages associated with the area (area->pages is NULL) * * Called during area destruction. This unmaps the iova's covered by all the * area's domains and releases the PFNs. */ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) { … } static void iopt_pages_unpin_xarray(struct pfn_batch *batch, struct iopt_pages *pages, unsigned long start_index, unsigned long end_index) { … } /** * iopt_pages_unfill_xarray() - Update the xarry after removing an access * @pages: The pages to act on * @start_index: Starting PFN index * @last_index: Last PFN index * * Called when an iopt_pages_access is removed, removes pages from the itree. * The access should already be removed from the access_itree. */ void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index) { … } /** * iopt_pages_fill_from_xarray() - Fast path for reading PFNs * @pages: The pages to act on * @start_index: The first page index in the range * @last_index: The last page index in the range * @out_pages: The output array to return the pages * * This can be called if the caller is holding a refcount on an * iopt_pages_access that is known to have already been filled. It quickly reads * the pages directly from the xarray. * * This is part of the SW iommu interface to read pages for in-kernel use. */ void iopt_pages_fill_from_xarray(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index, struct page **out_pages) { … } static int iopt_pages_fill_from_domain(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index, struct page **out_pages) { … } static int iopt_pages_fill_from_mm(struct iopt_pages *pages, struct pfn_reader_user *user, unsigned long start_index, unsigned long last_index, struct page **out_pages) { … } /** * iopt_pages_fill_xarray() - Read PFNs * @pages: The pages to act on * @start_index: The first page index in the range * @last_index: The last page index in the range * @out_pages: The output array to return the pages, may be NULL * * This populates the xarray and returns the pages in out_pages. As the slow * path this is able to copy pages from other storage tiers into the xarray. * * On failure the xarray is left unchanged. * * This is part of the SW iommu interface to read pages for in-kernel use. */ int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index, struct page **out_pages) { … } /* * This uses the pfn_reader instead of taking a shortcut by using the mm. It can * do every scenario and is fully consistent with what an iommu_domain would * see. */ static int iopt_pages_rw_slow(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index, unsigned long offset, void *data, unsigned long length, unsigned int flags) { … } /* * A medium speed path that still allows DMA inconsistencies, but doesn't do any * memory allocations or interval tree searches. */ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, unsigned long offset, void *data, unsigned long length, unsigned int flags) { … } /** * iopt_pages_rw_access - Copy to/from a linear slice of the pages * @pages: pages to act on * @start_byte: First byte of pages to copy to/from * @data: Kernel buffer to get/put the data * @length: Number of bytes to copy * @flags: IOMMUFD_ACCESS_RW_* flags * * This will find each page in the range, kmap it and then memcpy to/from * the given kernel buffer. */ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, void *data, unsigned long length, unsigned int flags) { … } static struct iopt_pages_access * iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, unsigned long last) { … } /** * iopt_area_add_access() - Record an in-knerel access for PFNs * @area: The source of PFNs * @start_index: First page index * @last_index: Inclusive last page index * @out_pages: Output list of struct page's representing the PFNs * @flags: IOMMUFD_ACCESS_RW_* flags * * Record that an in-kernel access will be accessing the pages, ensure they are * pinned, and return the PFNs as a simple list of 'struct page *'. * * This should be undone through a matching call to iopt_area_remove_access() */ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, unsigned long last_index, struct page **out_pages, unsigned int flags) { … } /** * iopt_area_remove_access() - Release an in-kernel access for PFNs * @area: The source of PFNs * @start_index: First page index * @last_index: Inclusive last page index * * Undo iopt_area_add_access() and unpin the pages if necessary. The caller * must stop using the PFNs before calling this. */ void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index, unsigned long last_index) { … }