linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c

// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences from the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, such
 * as PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before), we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
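 *
 * As an illustrative sketch only (the structure and field names here are
 * hypothetical, not the driver's actual types), such a queue element boils
 * down to:
 *
 *	struct submit_entry {
 *		struct intel_context *context;	// the LRC to run
 *		u32 ring_tail;			// ring tail after this request
 *	};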
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
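 *
 * A minimal sketch of that merging rule, in pseudo-C (the queue helpers and
 * names below are purely illustrative, not the driver's actual code):
 *
 *	first = queue_head(queue);
 *	second = NULL;
 *	while ((next = queue_next(queue, first)) != NULL) {
 *		if (next->context != first->context) {
 *			second = next;		// unique IDs: submit as a pair
 *			break;
 *		}
 *		first = next;			// same context: merge into one
 *	}
 *	write_elsp(engine, first, second);	// second may be NULL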
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */
#include <linux/interrupt.h>
#include <linux/string_helpers.h>

#include "i915_drv.h"
#include "i915_reg.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "gen8_engine_cs.h"
#include "intel_breadcrumbs.h"
#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine_regs.h"
#include "intel_engine_stats.h"
#include "intel_execlists_submission.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm.h"
#include "intel_gt_regs.h"
#include "intel_gt_requests.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"
#include "shmem_utils.h"

#define RING_EXECLIST_QFULL
#define RING_EXECLIST1_VALID
#define RING_EXECLIST0_VALID
#define RING_EXECLIST_ACTIVE_STATUS
#define RING_EXECLIST1_ACTIVE
#define RING_EXECLIST0_ACTIVE

#define GEN8_CTX_STATUS_IDLE_ACTIVE
#define GEN8_CTX_STATUS_PREEMPTED
#define GEN8_CTX_STATUS_ELEMENT_SWITCH
#define GEN8_CTX_STATUS_ACTIVE_IDLE
#define GEN8_CTX_STATUS_COMPLETE
#define GEN8_CTX_STATUS_LITE_RESTORE

#define GEN8_CTX_STATUS_COMPLETED_MASK

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)
#define GEN12_CSB_SW_CTX_ID_MASK
#define GEN12_IDLE_CTX_ID
#define GEN12_CSB_CTX_VALID(csb_dw)

#define XEHP_CTX_STATUS_SWITCHED_TO_NEW_QUEUE
#define XEHP_CSB_SW_CTX_ID_MASK
#define XEHP_IDLE_CTX_ID
#define XEHP_CSB_CTX_VALID(csb_dw)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE

struct virtual_engine {};

static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
{}

static struct intel_context *
execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count,
			 unsigned long flags);

static struct i915_request *
__active_request(const struct intel_timeline * const tl,
		 struct i915_request *rq,
		 int error)
{}

static struct i915_request *
active_request(const struct intel_timeline * const tl, struct i915_request *rq)
{}

static void ring_set_paused(const struct intel_engine_cs *engine, int state)
{}

static struct i915_priolist *to_priolist(struct rb_node *rb)
{}

static int rq_prio(const struct i915_request *rq)
{}

static int effective_prio(const struct i915_request *rq)
{}

static int queue_prio(const struct i915_sched_engine *sched_engine)
{}

static int virtual_prio(const struct intel_engine_execlists *el)
{}

static bool need_preempt(const struct intel_engine_cs *engine,
			 const struct i915_request *rq)
{}

__maybe_unused static bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{}

static struct i915_request *
__unwind_incomplete_requests(struct intel_engine_cs *engine)
{}

struct i915_request *
execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{}

static void
execlists_context_status_change(struct i915_request *rq, unsigned long status)
{}

static void reset_active(struct i915_request *rq,
			 struct intel_engine_cs *engine)
{}

static bool bad_request(const struct i915_request *rq)
{}

static struct intel_engine_cs *
__execlists_schedule_in(struct i915_request *rq)
{}

static void execlists_schedule_in(struct i915_request *rq, int idx)
{}

static void
resubmit_virtual_request(struct i915_request *rq, struct virtual_engine *ve)
{}

static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
{}

static void __execlists_schedule_out(struct i915_request * const rq,
				     struct intel_context * const ce)
{}

static inline void execlists_schedule_out(struct i915_request *rq)
{}

static u32 map_i915_prio_to_lrc_desc_prio(int prio)
{}

static u64 execlists_update_context(struct i915_request *rq)
{}

static void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
{}

static __maybe_unused char *
dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
{}

static __maybe_unused noinline void
trace_ports(const struct intel_engine_execlists *execlists,
	    const char *msg,
	    struct i915_request * const *ports)
{}

static bool
reset_in_progress(const struct intel_engine_cs *engine)
{}

static __maybe_unused noinline bool
assert_pending_valid(const struct intel_engine_execlists *execlists,
		     const char *msg)
{}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{}

static bool ctx_single_port_submission(const struct intel_context *ce)
{}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{}

static unsigned long i915_request_flags(const struct i915_request *rq)
{}

static bool can_merge_rq(const struct i915_request *prev,
			 const struct i915_request *next)
{}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{}

static struct virtual_engine *
first_virtual_engine(struct intel_engine_cs *engine)
{}

static void virtual_xfer_context(struct virtual_engine *ve,
				 struct intel_engine_cs *engine)
{}

static void defer_request(struct i915_request *rq, struct list_head * const pl)
{}

static void defer_active(struct intel_engine_cs *engine)
{}

static bool
timeslice_yield(const struct intel_engine_execlists *el,
		const struct i915_request *rq)
{}

static bool needs_timeslice(const struct intel_engine_cs *engine,
			    const struct i915_request *rq)
{}

static bool
timeslice_expired(struct intel_engine_cs *engine, const struct i915_request *rq)
{}

static unsigned long timeslice(const struct intel_engine_cs *engine)
{}

static void start_timeslice(struct intel_engine_cs *engine)
{}

static void record_preemption(struct intel_engine_execlists *execlists)
{}

static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
					    const struct i915_request *rq)
{}

static void set_preempt_timeout(struct intel_engine_cs *engine,
				const struct i915_request *rq)
{}

static bool completed(const struct i915_request *rq)
{}

static void execlists_dequeue(struct intel_engine_cs *engine)
{}

static void execlists_dequeue_irq(struct intel_engine_cs *engine)
{}

static void clear_ports(struct i915_request **ports, int count)
{}

static void
copy_ports(struct i915_request **dst, struct i915_request **src, int count)
{}

static struct i915_request **
cancel_port_requests(struct intel_engine_execlists * const execlists,
		     struct i915_request **inactive)
{}

/*
 * Starting with Gen12, the status has a new format:
 *
 *     bit  0:     switched to new queue
 *     bit  1:     reserved
 *     bit  2:     semaphore wait mode (poll or signal), only valid when
 *                 switch detail is set to "wait on semaphore"
 *     bits 3-5:   engine class
 *     bits 6-11:  engine instance
 *     bits 12-14: reserved
 *     bits 15-25: sw context id of the lrc the GT switched to
 *     bits 26-31: sw counter of the lrc the GT switched to
 *     bits 32-35: context switch detail
 *                  - 0: ctx complete
 *                  - 1: wait on sync flip
 *                  - 2: wait on vblank
 *                  - 3: wait on scanline
 *                  - 4: wait on semaphore
 *                  - 5: context preempted (not on SEMAPHORE_WAIT or
 *                       WAIT_FOR_EVENT)
 *     bit  36:    reserved
 *     bits 37-43: wait detail (for switch detail 1 to 4)
 *     bits 44-46: reserved
 *     bits 47-57: sw context id of the lrc the GT switched away from
 *     bits 58-63: sw counter of the lrc the GT switched away from
 *
 * Xe_HP csb shuffles things around compared to TGL:
 *
 *     bits 0-3:   context switch detail (same possible values as TGL)
 *     bits 4-9:   engine instance
 *     bits 10-25: sw context id of the lrc the GT switched to
 *     bits 26-31: sw counter of the lrc the GT switched to
 *     bit  32:    semaphore wait mode (poll or signal), only valid when
 *                 switch detail is set to "wait on semaphore"
 *     bit  33:    switched to new queue
 *     bits 34-41: wait detail (for switch detail 1 to 4)
 *     bits 42-57: sw context id of the lrc the GT switched away from
 *     bits 58-63: sw counter of the lrc the GT switched away from
 */
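
/*
 * Purely illustrative sketch (these helpers are hypothetical and not part of
 * the driver): pulling a few of the fields described above out of a raw
 * 64-bit CSB entry with plain shifts and masks.
 */
__maybe_unused static u32 example_gen12_csb_ctx_to_id(u64 csb)
{
	return (csb >> 15) & 0x7ff;	/* bits 15-25: sw ctx id switched to */
}

__maybe_unused static u8 example_gen12_csb_switch_detail(u64 csb)
{
	return (csb >> 32) & 0xf;	/* bits 32-35: context switch detail */
}

__maybe_unused static u32 example_xehp_csb_ctx_to_id(u64 csb)
{
	return (csb >> 10) & 0xffff;	/* bits 10-25: sw ctx id switched to */
}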
static inline bool
__gen12_csb_parse(bool ctx_to_valid, bool ctx_away_valid, bool new_queue,
		  u8 switch_detail)
{}

static bool xehp_csb_parse(const u64 csb)
{}

static bool gen12_csb_parse(const u64 csb)
{}

static bool gen8_csb_parse(const u64 csb)
{}

static noinline u64
wa_csb_read(const struct intel_engine_cs *engine, u64 * const csb)
{}

static u64 csb_read(const struct intel_engine_cs *engine, u64 * const csb)
{}

static void new_timeslice(struct intel_engine_execlists *el)
{}

static struct i915_request **
process_csb(struct intel_engine_cs *engine, struct i915_request **inactive)
{}

static void post_process_csb(struct i915_request **port,
			     struct i915_request **last)
{}

static void __execlists_hold(struct i915_request *rq)
{}

static bool execlists_hold(struct intel_engine_cs *engine,
			   struct i915_request *rq)
{}

static bool hold_request(const struct i915_request *rq)
{}

static void __execlists_unhold(struct i915_request *rq)
{}

static void execlists_unhold(struct intel_engine_cs *engine,
			     struct i915_request *rq)
{}

struct execlists_capture {};

static void execlists_capture_work(struct work_struct *work)
{}

static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
{}

static struct i915_request *
active_context(struct intel_engine_cs *engine, u32 ccid)
{}

static u32 active_ccid(struct intel_engine_cs *engine)
{}

static void execlists_capture(struct intel_engine_cs *engine)
{}

static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
{}

static bool preempt_timeout(const struct intel_engine_cs *const engine)
{}

/*
 * Check the unread Context Status Buffers and manage the submission of new
 * contexts to the ELSP accordingly.
 */
static void execlists_submission_tasklet(struct tasklet_struct *t)
{}

static void execlists_irq_handler(struct intel_engine_cs *engine, u16 iir)
{}

static void __execlists_kick(struct intel_engine_execlists *execlists)
{}

#define execlists_kick(t, member)

static void execlists_timeslice(struct timer_list *timer)
{}

static void execlists_preempt(struct timer_list *timer)
{}

static void queue_request(struct intel_engine_cs *engine,
			  struct i915_request *rq)
{}

static bool submit_queue(struct intel_engine_cs *engine,
			 const struct i915_request *rq)
{}

static bool ancestor_on_hold(const struct intel_engine_cs *engine,
			     const struct i915_request *rq)
{}

static void execlists_submit_request(struct i915_request *request)
{}

static int
__execlists_context_pre_pin(struct intel_context *ce,
			    struct intel_engine_cs *engine,
			    struct i915_gem_ww_ctx *ww, void **vaddr)
{}

static int execlists_context_pre_pin(struct intel_context *ce,
				     struct i915_gem_ww_ctx *ww,
				     void **vaddr)
{}

static int execlists_context_pin(struct intel_context *ce, void *vaddr)
{}

static int execlists_context_alloc(struct intel_context *ce)
{}

static void execlists_context_cancel_request(struct intel_context *ce,
					     struct i915_request *rq)
{}

static struct intel_context *
execlists_create_parallel(struct intel_engine_cs **engines,
			  unsigned int num_siblings,
			  unsigned int width)
{}

static const struct intel_context_ops execlists_context_ops =;

static int emit_pdps(struct i915_request *rq)
{}

static int execlists_request_alloc(struct i915_request *request)
{}

static void reset_csb_pointers(struct intel_engine_cs *engine)
{}

static void sanitize_hwsp(struct intel_engine_cs *engine)
{}

static void execlists_sanitize(struct intel_engine_cs *engine)
{}

static void enable_error_interrupt(struct intel_engine_cs *engine)
{}

static void enable_execlists(struct intel_engine_cs *engine)
{}

static int execlists_resume(struct intel_engine_cs *engine)
{}

static void execlists_reset_prepare(struct intel_engine_cs *engine)
{}

static struct i915_request **
reset_csb(struct intel_engine_cs *engine, struct i915_request **inactive)
{}

static void
execlists_reset_active(struct intel_engine_cs *engine, bool stalled)
{}

static void execlists_reset_csb(struct intel_engine_cs *engine, bool stalled)
{}

static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
{}

static void nop_submission_tasklet(struct tasklet_struct *t)
{}

static void execlists_reset_cancel(struct intel_engine_cs *engine)
{}

static void execlists_reset_finish(struct intel_engine_cs *engine)
{}

static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
{}

static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
{}

static void execlists_park(struct intel_engine_cs *engine)
{}

static void add_to_engine(struct i915_request *rq)
{}

static void remove_from_engine(struct i915_request *rq)
{}

static bool can_preempt(struct intel_engine_cs *engine)
{}

static void kick_execlists(const struct i915_request *rq, int prio)
{}

static void execlists_set_default_submission(struct intel_engine_cs *engine)
{}

static void execlists_shutdown(struct intel_engine_cs *engine)
{}

static void execlists_release(struct intel_engine_cs *engine)
{}

static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
					   ktime_t *now)
{}

static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
					 ktime_t *now)
{}

static void
logical_ring_default_vfuncs(struct intel_engine_cs *engine)
{}

static void logical_ring_default_irqs(struct intel_engine_cs *engine)
{}

static void rcs_submission_override(struct intel_engine_cs *engine)
{}

int intel_execlists_submission_setup(struct intel_engine_cs *engine)
{}

static struct list_head *virtual_queue(struct virtual_engine *ve)
{}

static void rcu_virtual_context_destroy(struct work_struct *wrk)
{}

static void virtual_context_destroy(struct kref *kref)
{}

static void virtual_engine_initial_hint(struct virtual_engine *ve)
{}

static int virtual_context_alloc(struct intel_context *ce)
{}

static int virtual_context_pre_pin(struct intel_context *ce,
				   struct i915_gem_ww_ctx *ww,
				   void **vaddr)
{}

static int virtual_context_pin(struct intel_context *ce, void *vaddr)
{}

static void virtual_context_enter(struct intel_context *ce)
{}

static void virtual_context_exit(struct intel_context *ce)
{}

static struct intel_engine_cs *
virtual_get_sibling(struct intel_engine_cs *engine, unsigned int sibling)
{}

static const struct intel_context_ops virtual_context_ops =;

static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
{}

static void virtual_submission_tasklet(struct tasklet_struct *t)
{}

static void virtual_submit_request(struct i915_request *rq)
{}

static struct intel_context *
execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count,
			 unsigned long flags)
{}

void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							const struct i915_request *rq,
							const char *prefix,
							int indent),
				   unsigned int max)
{}

void intel_execlists_dump_active_requests(struct intel_engine_cs *engine,
					  struct i915_request *hung_rq,
					  struct drm_printer *m)
{}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_execlists.c"
#endif