linux/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c

/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008,2010 Intel Corporation
 */

#include <linux/dma-resv.h>
#include <linux/highmem.h>
#include <linux/sync_file.h>
#include <linux/uaccess.h>

#include <drm/drm_auth.h>
#include <drm/drm_syncobj.h>

#include "gem/i915_gem_ioctls.h"
#include "gt/intel_context.h"
#include "gt/intel_gpu_commands.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_buffer_pool.h"
#include "gt/intel_gt_pm.h"
#include "gt/intel_ring.h"

#include "pxp/intel_pxp.h"

#include "i915_cmd_parser.h"
#include "i915_drv.h"
#include "i915_file_private.h"
#include "i915_gem_clflush.h"
#include "i915_gem_context.h"
#include "i915_gem_evict.h"
#include "i915_gem_ioctls.h"
#include "i915_reg.h"
#include "i915_trace.h"
#include "i915_user_extensions.h"

struct eb_vma {};

enum {};

/* __EXEC_OBJECT_ flags > BIT(29) defined in i915_vma.h */
#define __EXEC_OBJECT_HAS_PIN
#define __EXEC_OBJECT_HAS_FENCE
#define __EXEC_OBJECT_USERPTR_INIT
#define __EXEC_OBJECT_NEEDS_MAP
#define __EXEC_OBJECT_NEEDS_BIAS
#define __EXEC_OBJECT_INTERNAL_FLAGS
#define __EXEC_OBJECT_RESERVED

#define __EXEC_HAS_RELOC
#define __EXEC_ENGINE_PINNED
#define __EXEC_USERPTR_USED
#define __EXEC_INTERNAL_FLAGS
#define UPDATE

#define BATCH_OFFSET_BIAS

#define __I915_EXEC_ILLEGAL_FLAGS

/* Catch emission of unexpected errors for CI! */
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
#undef EINVAL
#define EINVAL
#endif

/**
 * DOC: User command execution
 *
 * Userspace submits commands to be executed on the GPU as an instruction
 * stream within a GEM object we call a batchbuffer. These instructions may
 * refer to other GEM objects containing auxiliary state such as kernels,
 * samplers, render targets and even secondary batchbuffers. Userspace does
 * not know where in the GPU memory these objects reside and so before the
 * batchbuffer is passed to the GPU for execution, those addresses in the
 * batchbuffer and auxiliary objects are updated. This is known as relocation,
 * or patching. To try and avoid having to relocate each object on the next
 * execution, userspace is told the location of those objects in this pass,
 * but this remains just a hint as the kernel may choose a new location for
 * any object in the future.
 *
 * At the level of talking to the hardware, submitting a batchbuffer for the
 * GPU to execute amounts to adding content to a buffer from which the HW
 * command streamer is reading.
 *
 * 1. Add a command to load the HW context. For Logical Ring Contexts, i.e.
 *    Execlists, this command is not placed on the same buffer as the
 *    remaining items.
 *
 * 2. Add a command to invalidate caches to the buffer.
 *
 * 3. Add a batchbuffer start command to the buffer; the start command is
 *    essentially a token together with the GPU address of the batchbuffer
 *    to be executed.
 *
 * 4. Add a pipeline flush to the buffer.
 *
 * 5. Add a memory write command to the buffer to record when the GPU
 *    is done executing the batchbuffer. The memory write writes the
 *    global sequence number of the request, ``i915_request::global_seqno``;
 *    the i915 driver uses the current value in the register to determine
 *    if the GPU has completed the batchbuffer.
 *
 * 6. Add a user interrupt command to the buffer. This command instructs
 *    the GPU to issue an interrupt when the command, pipeline flush and
 *    memory write are completed.
 *
 * 7. Inform the hardware of the additional commands added to the buffer
 *    (by updating the tail pointer).
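 *
 * As a rough sketch of steps 3, 5 and 6 above (illustrative only;
 * ``batch_addr`` and ``hwsp_offset`` are placeholders, and the real emission
 * lives in the engine backends, e.g. gen8_emit_bb_start())::
 *
 *	u32 *cs = intel_ring_begin(rq, 8);	// reserve space in the ring
 *	if (IS_ERR(cs))
 *		return PTR_ERR(cs);
 *
 *	*cs++ = MI_BATCH_BUFFER_START_GEN8;		// step 3: jump to batch
 *	*cs++ = lower_32_bits(batch_addr);
 *	*cs++ = upper_32_bits(batch_addr);
 *	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;	// step 5: breadcrumb
 *	*cs++ = hwsp_offset;
 *	*cs++ = 0;
 *	*cs++ = lower_32_bits(rq->fence.seqno);
 *	*cs++ = MI_USER_INTERRUPT;			// step 6: notify CPU
 *	intel_ring_advance(rq, cs);	// step 7 happens later, on submission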
 *
 * Processing an execbuf ioctl is conceptually split up into a few phases.
 *
 * 1. Validation - Ensure all the pointers, handles and flags are valid.
 * 2. Reservation - Assign GPU address space for every object
 * 3. Relocation - Update any addresses to point to the final locations
 * 4. Serialisation - Order the request with respect to its dependencies
 * 5. Construction - Construct a request to execute the batchbuffer
 * 6. Submission (at some point in the future execution)
 *
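 * Condensed into code, the ioctl handler roughly follows that shape (a
 * sketch only; error handling, locking and the slowpaths are all elided, and
 * the phases do not map one-to-one onto single calls)::
 *
 *	err = eb_select_context(&eb);		// 1. validation
 *	err = eb_select_engine(&eb);
 *	err = eb_lookup_vmas(&eb);		// 2. reservation
 *	err = eb_relocate_parse(&eb);		// 3. relocation
 *	eb_requests_create(&eb, in_fence, fd);	// 5. construction
 *	err = eb_submit(&eb);			// 4+6. eb_move_to_gpu() + submit
 *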
 * Reserving resources for the execbuf is the most complicated phase. We
 * neither want to have to migrate the object in the address space, nor do
 * we want to have to update any relocations pointing to this object. Ideally,
 * we want to leave the object where it is and for all the existing relocations
 * to match. If the object is given a new address, or if userspace thinks the
 * object is elsewhere, we have to parse all the relocation entries and update
 * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that
 * all the target addresses in all of its objects match the value in the
 * relocation entries and that they all match the presumed offsets given by the
 * list of execbuffer objects. Using this knowledge, we know that if we haven't
 * moved any buffers, all the relocation entries are valid and we can skip
 * the update. (If userspace is wrong, the likely outcome is an impromptu GPU
 * hang.) The requirements for using I915_EXEC_NO_RELOC are:
 *
 *      The addresses written in the objects must match the corresponding
 *      reloc.presumed_offset which in turn must match the corresponding
 *      execobject.offset.
 *
 *      Any render targets written to in the batch must be flagged with
 *      EXEC_OBJECT_WRITE.
 *
 *      To avoid stalling, execobject.offset should match the current
 *      address of that object within the active context.
 *
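 * For example, a userspace driver relying on NO_RELOC might fill in its exec
 * objects like so (a userspace-side sketch using the uapi structs from
 * include/uapi/drm/i915_drm.h; ``bo_handle``, ``last_known_gpu_addr``,
 * ``written`` and ``ring_id`` are placeholders)::
 *
 *	struct drm_i915_gem_exec_object2 obj = {
 *		.handle = bo_handle,
 *		.offset = last_known_gpu_addr,	// matches presumed_offset
 *		.flags = written ? EXEC_OBJECT_WRITE : 0,
 *	};
 *	struct drm_i915_gem_execbuffer2 execbuf = {
 *		.buffers_ptr = (uintptr_t)&obj,
 *		.buffer_count = 1,
 *		.flags = ring_id | I915_EXEC_NO_RELOC,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 *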
 * The reservation is done in multiple phases. First we try to keep any
 * object already bound in its current location, so long as it meets the
 * constraints imposed by the new execbuffer. Any object left unbound after the
 * first pass is then fitted into any available idle space. If an object does
 * not fit, all objects are removed from the reservation and the process rerun
 * after sorting the objects into a priority order (more difficult to fit
 * objects are tried first). Failing that, the entire VM is cleared and we try
 * to fit the execbuf one last time before concluding that it simply will not
 * fit.
 *
 * A small complication to all of this is that we allow userspace not only to
 * specify an alignment and a size for the object in the address space, but
 * we also allow userspace to specify the exact offset. Such objects are
 * simpler to place (the location is known a priori); all we have to do is make
 * sure the space is available.
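 *
 * Continuing the sketch above, such an exact ("softpin") placement is
 * requested with::
 *
 *	obj.offset = fixed_gpu_addr;		// exact address requested
 *	obj.flags |= EXEC_OBJECT_PINNED;	// fail rather than move it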
 *
 * Once all the objects are in place, patching up the buried pointers to point
 * to the final locations is a fairly simple job of walking over the relocation
 * entry arrays, looking up the right address and rewriting the value into
 * the object. Simple! ... The relocation entries are stored in user memory
 * and so to access them we have to copy them into a local buffer. That copy
 * has to avoid taking any pagefaults as they may lead back to a GEM object
 * requiring the struct_mutex (i.e. recursive deadlock). So once again we split
 * the relocation into multiple passes. First we try to do everything within an
 * atomic context (avoid the pagefaults) which requires that we never wait. If
 * we detect that we may wait, or if we need to fault, then we have to fallback
 * to a slower path. The slowpath has to drop the mutex. (Can you hear alarm
 * bells yet?) Dropping the mutex means that we lose all the state we have
 * built up so far for the execbuf and we must reset any global data. However,
 * we do leave the objects pinned in their final locations - which is a
 * potential issue for concurrent execbufs. Once we have left the mutex, we can
 * allocate and copy all the relocation entries into a large array at our
 * leisure, reacquire the mutex, reclaim all the objects and other state and
 * then proceed to update any incorrect addresses with the objects.
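 *
 * The fast path's no-fault copy boils down to a pattern like this (sketch;
 * ``r`` and ``urelocs`` stand for the kernel and user relocation arrays)::
 *
 *	pagefault_disable();
 *	ret = __copy_from_user_inatomic(r, urelocs, count * sizeof(*r));
 *	pagefault_enable();
 *	if (ret)			// would have faulted: use the slowpath
 *		goto slow;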
 *
 * As we process the relocation entries, we maintain a record of whether the
 * object is being written to. Using NO_RELOC, we expect userspace to provide
 * this information instead. We also check whether we can skip the relocation
 * by comparing the expected value inside the relocation entry with the target's
 * final address. If they differ, we have to map the current object and rewrite
 * the 4 or 8 byte pointer within.
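 *
 * Distilled from eb_relocate_entry() and relocate_entry() below, the
 * per-entry check is roughly (a sketch; ``flushes`` stands for the cache
 * state tracked in struct reloc_cache, and the 8 byte case is elided)::
 *
 *	u64 target_addr = relocation_target(reloc, target);
 *
 *	if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
 *		return 0;			// nothing moved, skip the write
 *
 *	void *vaddr = reloc_vaddr(vma, eb, reloc->offset / PAGE_SIZE);
 *	clflush_write32(vaddr + offset_in_page(reloc->offset),
 *			lower_32_bits(target_addr), flushes);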
 *
 * Serialising an execbuf is quite simple according to the rules of the GEM
 * ABI. Execution within each context is ordered by the order of submission.
 * Writes to any GEM object are in order of submission and are exclusive. Reads
 * from a GEM object are unordered with respect to other reads, but ordered by
 * writes. A write submitted after a read cannot occur before the read, and
 * similarly any read submitted after a write cannot occur before the write.
 * Writes are ordered between engines such that only one write occurs at any
 * time (completing any reads beforehand) - using semaphores where available
 * and CPU serialisation otherwise. Other GEM accesses obey the same rules: any
 * write (either via mmaps using set-domain, or via pwrite) must flush all GPU
 * reads before starting, and any read (either using set-domain or pread) must
 * flush all GPU writes before starting. (Note we only employ a barrier before;
 * we currently rely on userspace not concurrently starting a new execution
 * whilst reading or writing to an object. This may be an advantage or not
 * depending on how much you trust userspace not to shoot themselves in the
 * foot.) Serialisation may just result in the request being inserted into
 * a DAG awaiting its turn, but the simplest option is to wait on the CPU until
 * all dependencies are resolved.
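 *
 * In dma-resv terms the per-object bookkeeping looks roughly like this (a
 * sketch of what eb_move_to_gpu() arranges via i915_vma_move_to_active();
 * ``prev_fence`` and ``write`` are placeholders)::
 *
 *	// order this request after the object's previous users...
 *	err = i915_request_await_dma_fence(rq, prev_fence);
 *
 *	// ...then publish it as the object's new user
 *	dma_resv_add_fence(obj->base.resv, &rq->fence,
 *			   write ? DMA_RESV_USAGE_WRITE : DMA_RESV_USAGE_READ);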
 *
 * After all of that, it is just a matter of closing the request and handing it
 * the hardware (well, leaving it in a queue to be executed). However, we also
 * offer the ability for batchbuffers to be run with elevated privileges so
 * that they can access otherwise hidden registers. (Used to adjust L3 cache etc.)
 * Before any batch is given extra privileges we must first check that it
 * contains no nefarious instructions: each instruction must be from our
 * whitelist and all registers touched must be on an allowed list. We first
 * copy the user's batchbuffer to a shadow (so that the user doesn't have
 * access to it, either by the CPU or GPU as we scan it) and then parse each
 * instruction. If everything is ok, we set a flag telling the hardware to run
 * the batchbuffer in trusted mode, otherwise the ioctl is rejected.
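 *
 * Conceptually (a sketch only; the real work happens in eb_parse() and
 * intel_engine_cmd_parser(), and ``pool``, ``batch_offset``, ``batch_len``
 * and ``trampoline`` are placeholders)::
 *
 *	shadow = shadow_batch_pin(eb, pool->obj, vm, PIN_USER);
 *	err = intel_engine_cmd_parser(eb->context->engine, batch,
 *				      batch_offset, batch_len,
 *				      shadow, trampoline);
 *	if (err)	// unknown command or register: reject the execbuf
 *		return err;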
 */

struct eb_fence {};

struct i915_execbuffer {};

static int eb_parse(struct i915_execbuffer *eb);
static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
static void eb_unpin_engine(struct i915_execbuffer *eb);
static void eb_capture_release(struct i915_execbuffer *eb);

static bool eb_use_cmdparser(const struct i915_execbuffer *eb)
{}

static int eb_create(struct i915_execbuffer *eb)
{}

static bool
eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
		 const struct i915_vma *vma,
		 unsigned int flags)
{}

static u64 eb_pin_flags(const struct drm_i915_gem_exec_object2 *entry,
			unsigned int exec_flags)
{}

static int
eb_pin_vma(struct i915_execbuffer *eb,
	   const struct drm_i915_gem_exec_object2 *entry,
	   struct eb_vma *ev)
{}

static void
eb_unreserve_vma(struct eb_vma *ev)
{}

static int
eb_validate_vma(struct i915_execbuffer *eb,
		struct drm_i915_gem_exec_object2 *entry,
		struct i915_vma *vma)
{}

static bool
is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx)
{}

static int
eb_add_vma(struct i915_execbuffer *eb,
	   unsigned int *current_batch,
	   unsigned int i,
	   struct i915_vma *vma)
{}

static int use_cpu_reloc(const struct reloc_cache *cache,
			 const struct drm_i915_gem_object *obj)
{}

static int eb_reserve_vma(struct i915_execbuffer *eb,
			  struct eb_vma *ev,
			  u64 pin_flags)
{}

static bool eb_unbind(struct i915_execbuffer *eb, bool force)
{}

static int eb_reserve(struct i915_execbuffer *eb)
{}

static int eb_select_context(struct i915_execbuffer *eb)
{}

static int __eb_add_lut(struct i915_execbuffer *eb,
			u32 handle, struct i915_vma *vma)
{}

static struct i915_vma *eb_lookup_vma(struct i915_execbuffer *eb, u32 handle)
{}

static int eb_lookup_vmas(struct i915_execbuffer *eb)
{}

static int eb_lock_vmas(struct i915_execbuffer *eb)
{}

static int eb_validate_vmas(struct i915_execbuffer *eb)
{}

static struct eb_vma *
eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle)
{}

static void eb_release_vmas(struct i915_execbuffer *eb, bool final)
{}

static void eb_destroy(const struct i915_execbuffer *eb)
{}

static u64
relocation_target(const struct drm_i915_gem_relocation_entry *reloc,
		  const struct i915_vma *target)
{}

static void reloc_cache_init(struct reloc_cache *cache,
			     struct drm_i915_private *i915)
{}

static void *unmask_page(unsigned long p)
{}

static unsigned int unmask_flags(unsigned long p)
{}

#define KMAP

static struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
{}

static void reloc_cache_unmap(struct reloc_cache *cache)
{}

static void reloc_cache_remap(struct reloc_cache *cache,
			      struct drm_i915_gem_object *obj)
{}

static void reloc_cache_reset(struct reloc_cache *cache, struct i915_execbuffer *eb)
{}

static void *reloc_kmap(struct drm_i915_gem_object *obj,
			struct reloc_cache *cache,
			unsigned long pageno)
{}

static void *reloc_iomap(struct i915_vma *batch,
			 struct i915_execbuffer *eb,
			 unsigned long page)
{}

static void *reloc_vaddr(struct i915_vma *vma,
			 struct i915_execbuffer *eb,
			 unsigned long page)
{}

static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
{}

static u64
relocate_entry(struct i915_vma *vma,
	       const struct drm_i915_gem_relocation_entry *reloc,
	       struct i915_execbuffer *eb,
	       const struct i915_vma *target)
{}

static u64
eb_relocate_entry(struct i915_execbuffer *eb,
		  struct eb_vma *ev,
		  const struct drm_i915_gem_relocation_entry *reloc)
{}

static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
{}

static int
eb_relocate_vma_slow(struct i915_execbuffer *eb, struct eb_vma *ev)
{}

static int check_relocations(const struct drm_i915_gem_exec_object2 *entry)
{}

static int eb_copy_relocations(const struct i915_execbuffer *eb)
{}

static int eb_prefault_relocations(const struct i915_execbuffer *eb)
{}

static int eb_reinit_userptr(struct i915_execbuffer *eb)
{}

static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb)
{}

static int eb_relocate_parse(struct i915_execbuffer *eb)
{}

/*
 * Using two helper loops for the order of which requests / batches are created
 * and added to the backend. Requests are created in order from the parent to
 * the last child. Requests are added in the reverse order, from the last child
 * to parent. This is done for locking reasons as the timeline lock is acquired
 * during request creation and released when the request is added to the
 * backend. To make lockdep happy (see intel_context_timeline_lock) this must be
 * the ordering.
 */
#define for_each_batch_create_order(_eb, _i)
#define for_each_batch_add_order(_eb, _i)
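
/*
 * A sketch of the intended usage, condensed from eb_requests_create() and
 * eb_requests_add() below (assuming the eb->requests[] array of the real
 * struct i915_execbuffer):
 *
 *	for_each_batch_create_order(eb, i)
 *		eb->requests[i] = i915_request_create(eb_find_context(eb, i));
 *
 *	// create order takes each timeline lock; add order releases them
 *
 *	for_each_batch_add_order(eb, i)
 *		err |= eb_request_add(eb, eb->requests[i], err, i == 0);
 */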

static struct i915_request *
eb_find_first_request_added(struct i915_execbuffer *eb)
{}

#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)

/* Stage with GFP_KERNEL allocations before we enter the signaling critical path */
static int eb_capture_stage(struct i915_execbuffer *eb)
{}

/* Commit once we're in the critical path */
static void eb_capture_commit(struct i915_execbuffer *eb)
{}

/*
 * Release anything that didn't get committed due to errors.
 * The capture_list will otherwise be freed at request retire.
 */
static void eb_capture_release(struct i915_execbuffer *eb)
{}

static void eb_capture_list_clear(struct i915_execbuffer *eb)
{}

#else

static int eb_capture_stage(struct i915_execbuffer *eb)
{
	return 0;
}

static void eb_capture_commit(struct i915_execbuffer *eb)
{
}

static void eb_capture_release(struct i915_execbuffer *eb)
{
}

static void eb_capture_list_clear(struct i915_execbuffer *eb)
{
}

#endif

static int eb_move_to_gpu(struct i915_execbuffer *eb)
{}

static int i915_gem_check_execbuffer(struct drm_i915_private *i915,
				     struct drm_i915_gem_execbuffer2 *exec)
{}

static int i915_reset_gen7_sol_offsets(struct i915_request *rq)
{}

static struct i915_vma *
shadow_batch_pin(struct i915_execbuffer *eb,
		 struct drm_i915_gem_object *obj,
		 struct i915_address_space *vm,
		 unsigned int flags)
{}

static struct i915_vma *eb_dispatch_secure(struct i915_execbuffer *eb, struct i915_vma *vma)
{}

static int eb_parse(struct i915_execbuffer *eb)
{}

static int eb_request_submit(struct i915_execbuffer *eb,
			     struct i915_request *rq,
			     struct i915_vma *batch,
			     u64 batch_len)
{}

static int eb_submit(struct i915_execbuffer *eb)
{}

/*
 * Find one BSD ring to dispatch the corresponding BSD command.
 * The engine index is returned.
 */
static unsigned int
gen8_dispatch_bsd_engine(struct drm_i915_private *i915,
			 struct drm_file *file)
{}

static const enum intel_engine_id user_ring_map[] =;

static struct i915_request *eb_throttle(struct i915_execbuffer *eb, struct intel_context *ce)
{}

static int eb_pin_timeline(struct i915_execbuffer *eb, struct intel_context *ce,
			   bool throttle)
{}

static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle)
{}

static void eb_unpin_engine(struct i915_execbuffer *eb)
{}

static unsigned int
eb_select_legacy_ring(struct i915_execbuffer *eb)
{}

static int
eb_select_engine(struct i915_execbuffer *eb)
{}

static void
eb_put_engine(struct i915_execbuffer *eb)
{}

static void
__free_fence_array(struct eb_fence *fences, unsigned int n)
{}

static int
add_timeline_fence_array(struct i915_execbuffer *eb,
			 const struct drm_i915_gem_execbuffer_ext_timeline_fences *timeline_fences)
{}

static int add_fence_array(struct i915_execbuffer *eb)
{}

static void put_fence_array(struct eb_fence *fences, int num_fences)
{}

static int
await_fence_array(struct i915_execbuffer *eb,
		  struct i915_request *rq)
{}

static void signal_fence_array(const struct i915_execbuffer *eb,
			       struct dma_fence * const fence)
{}

static int
parse_timeline_fences(struct i915_user_extension __user *ext, void *data)
{}

static void retire_requests(struct intel_timeline *tl, struct i915_request *end)
{}

static int eb_request_add(struct i915_execbuffer *eb, struct i915_request *rq,
			  int err, bool last_parallel)
{}

static int eb_requests_add(struct i915_execbuffer *eb, int err)
{}

static const i915_user_extension_fn execbuf_extensions[] =;

static int
parse_execbuf2_extensions(struct drm_i915_gem_execbuffer2 *args,
			  struct i915_execbuffer *eb)
{}

static void eb_requests_get(struct i915_execbuffer *eb)
{}

static void eb_requests_put(struct i915_execbuffer *eb)
{}

static struct sync_file *
eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd)
{}

static struct sync_file *
eb_fences_add(struct i915_execbuffer *eb, struct i915_request *rq,
	      struct dma_fence *in_fence, int out_fence_fd)
{}

static struct intel_context *
eb_find_context(struct i915_execbuffer *eb, unsigned int context_number)
{}

static struct sync_file *
eb_requests_create(struct i915_execbuffer *eb, struct dma_fence *in_fence,
		   int out_fence_fd)
{}

static int
i915_gem_do_execbuffer(struct drm_device *dev,
		       struct drm_file *file,
		       struct drm_i915_gem_execbuffer2 *args,
		       struct drm_i915_gem_exec_object2 *exec)
{}

static size_t eb_element_size(void)
{}

static bool check_buffer_count(size_t count)
{}

int
i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data,
			   struct drm_file *file)
{}