/* linux/drivers/gpu/drm/i915/gt/intel_lrc.c */

// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_reg.h"
#include "intel_context.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_regs.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - number of NOPs are set in lower bits
 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
 *      MI_LRI_FORCE_POSTED
 * [5:0]: Number of NOPs or registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
 *      follow, for the lower bits
 * [6:0]: Register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
/*
 * set_offsets() - decode a u8-encoded per-platform table into @regs.
 * @regs:   destination context register state; only commands and
 *          register offsets are written — values are not filled out
 *          (per the encoding description above)
 * @data:   encoded table (see the byte format documented above)
 * @engine: engine whose MMIO base the register offsets are relative to
 * @close:  presumably terminates/closes the state image when true —
 *          NOTE(review): body elided in this listing, confirm against
 *          the full source
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
/*
 * Encoding macros used to build the tables below; they implement the
 * byte format documented above.  The BUILD_BUG_ON_ZERO() terms reject,
 * at compile time, values that do not fit the encoding (count must fit
 * in 6 bits, REG offsets below 0x200, REG16 offsets below 0x10000).
 */
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{}

/*
 * Per-platform/per-engine-class register offset tables, built from the
 * NOP/LRI/REG/REG16/END encoding above and consumed by set_offsets().
 * NOTE(review): the array initializers have been elided from this
 * listing ("=;" is not valid C) — the actual table contents are not
 * visible here.
 */
static const u8 gen8_xcs_offsets[] =;

static const u8 gen9_xcs_offsets[] =;

static const u8 gen12_xcs_offsets[] =;

static const u8 dg2_xcs_offsets[] =;

static const u8 gen8_rcs_offsets[] =;

static const u8 gen9_rcs_offsets[] =;

static const u8 gen11_rcs_offsets[] =;

static const u8 gen12_rcs_offsets[] =;

static const u8 dg2_rcs_offsets[] =;

static const u8 mtl_rcs_offsets[] =;

/* The encoding macros are only needed while building the tables above. */
#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

/*
 * NOTE(review): all function bodies in this section are elided from
 * this listing; the comments below describe intent inferred from the
 * names and signatures — confirm against the full source.
 */

/* Select the encoded offset table matching @engine. */
static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{}

/*
 * The lrc_ring_*() helpers presumably return the location of a given
 * field within the logical ring context image for @engine (with some
 * sentinel when the field does not exist on the platform).
 */
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{}

static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
{}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{}

/* Default INDIRECT_CTX offset value for @engine. */
static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{}

/*
 * NOTE(review): bodies elided in this listing; comments inferred from
 * names/signatures — confirm against the full source.
 */

/* Point the per-context batch buffer at @ctx_bb_ggtt_addr in @regs. */
static void
lrc_setup_bb_per_ctx(u32 *regs,
		     const struct intel_engine_cs *engine,
		     u32 ctx_bb_ggtt_addr)
{}

/*
 * Program the INDIRECT_CTX pointer and offset in @regs for a batch of
 * @size bytes located at @ctx_bb_ggtt_addr.
 */
static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{}

/* Whether @ce requires run-alone mode on the engine. */
static bool ctx_needs_runalone(const struct intel_context *ce)
{}

/*
 * NOTE(review): bodies elided in this listing; comments inferred from
 * names/signatures — confirm against the full source.
 */

/* Initialise the engine-common portion of the context image @regs. */
static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{}

/* Program the workaround batch buffer registers for @engine. */
static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{}

/* Fill the page-table registers in @regs from @ppgtt. */
static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{}

/* Resolve @vm to a ppgtt (aliasing), if applicable. */
static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{}

/* Undo any ring-stop state left in @regs, presumably after a reset. */
static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{}

/*
 * NOTE(review): bodies elided in this listing; comments inferred from
 * names/signatures — confirm against the full source.
 */

/* Write the register state image for @ce/@engine into @regs. */
static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{}

/* Public entry point: initialise @ce's context register state. */
void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{}

/* Restore @ce's register state, presumably after an engine reset. */
void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{}

/*
 * NOTE(review): bodies elided in this listing; comments inferred from
 * names/signatures — confirm against the full source.
 */

/* Fill a guard ("redzone") pattern after the context state in @vaddr. */
static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{}

/* Verify the redzone pattern is intact (detects state overruns). */
static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{}

/* Offset of the workaround batch buffer area within @ce's state. */
static u32 context_wa_bb_offset(const struct intel_context *ce)
{}

/*
 * per_ctx below determines which WABB section is used.
 * When true, the function returns the location of the
 * PER_CTX_BB.  When false, the function returns the
 * location of the INDIRECT_CTX.
 */
static u32 *context_wabb(const struct intel_context *ce, bool per_ctx)
{}

/*
 * NOTE(review): bodies elided in this listing; comments inferred from
 * names/signatures — confirm against the full source.
 */

/* Initialise the backing state image @state for @ce on @engine. */
void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{}

/* GGTT address of @ce's indirect batch buffer. */
u32 lrc_indirect_bb(const struct intel_context *ce)
{}

/* Emit the predicate-disable workaround into @cs; returns advanced @cs. */
static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
{}

/* Allocate the vma backing @ce's context state. */
static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{}

/* Obtain the pinned timeline for @ce, presumably engine-status-page based. */
static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{}

/* Allocate all resources for @ce; returns 0 or a negative errno. */
int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{}

/* Reset @ce's state, presumably to the post-allocation defaults. */
void lrc_reset(struct intel_context *ce)
{}

/*
 * NOTE(review): bodies elided in this listing; comments inferred from
 * names/signatures — confirm against the full source.
 *
 * The six functions below form the context pin/unpin/teardown
 * lifecycle; pre_pin/pin and unpin/post_unpin appear to be the usual
 * two-phase pairs.
 */

/* Phase 1 of pinning: map the state, returning its address in *@vaddr. */
int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{}

/* Phase 2 of pinning, given @vaddr produced by lrc_pre_pin(). */
int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{}

/* Phase 1 of unpinning @ce. */
void lrc_unpin(struct intel_context *ce)
{}

/* Phase 2 of unpinning @ce. */
void lrc_post_unpin(struct intel_context *ce)
{}

/* Release the resources allocated by lrc_alloc(). */
void lrc_fini(struct intel_context *ce)
{}

/* Final release, invoked via kref; @kref is embedded in the context. */
void lrc_destroy(struct kref *kref)
{}

/*
 * NOTE(review): bodies elided in this listing; comments inferred from
 * names/signatures — confirm against the full source.  Each emitter
 * presumably writes commands at @cs and returns the advanced pointer.
 */

/* Emit the gen12 timestamp workaround. */
static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{}

/* Emit commands restoring the scratch register state. */
static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{}

/* Emit the gen12 command buffer control workaround. */
static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{}

/*
 * The bspec's tuning guide asks us to program a vertical watermark value of
 * 0x3FF.  However this register is not saved/restored properly by the
 * hardware, so we're required to apply the desired value via INDIRECT_CTX
 * batch buffer to ensure the value takes effect properly.  All other bits
 * in this register should remain at 0 (the hardware default).
 */
static u32 *
dg2_emit_draw_watermark_setting(u32 *cs)
{}

/*
 * NOTE(review): bodies of the functions below are elided in this
 * listing; comments inferred from names/signatures — confirm against
 * the full source.
 */

/* Emit commands invalidating the state cache. */
static u32 *
gen12_invalidate_state_cache(u32 *cs)
{}

/* Build the INDIRECT_CTX batch contents for render engines. */
static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{}

/* Build the INDIRECT_CTX batch contents for non-render engines. */
static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{}

/* Emit the Xe_HP fast-color blit workaround batch buffer commands. */
static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
{}

/* Build the Xe_HP PER_CTX_BB contents. */
static u32 *
xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs)
{}

/* Install @emit's output as @ce's PER_CTX_BB (see context_wabb()). */
static void
setup_per_ctx_bb(const struct intel_context *ce,
		 const struct intel_engine_cs *engine,
		 u32 *(*emit)(const struct intel_context *, u32 *))
{}

/* Install @emit's output as @ce's INDIRECT_CTX batch. */
static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48:53:    engine instance
 *      bit 54:        mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * On Xe_HP, the upper dword of the descriptor has a new format:
 *
 *      bits 32-37:    virtual function number
 *      bit 38:        mbz, reserved for use by hardware
 *      bits 39-54:    SW context ID
 *      bits 55-57:    reserved
 *      bits 58-63:    SW counter
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{}

/*
 * NOTE(review): bodies of the functions below are elided in this
 * listing; comments inferred from names/signatures — confirm against
 * the full source.
 */

/* Refresh @ce's context registers (e.g. ring @head) before submission. */
u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{}

/* Rewrite the register offsets in @ce's image for @engine. */
void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{}

/* Sanity-check @ce's register state; @when labels the call site. */
void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
 * but there is a slight complication as this is applied in WA batch where the
 * values are only initialized once so we cannot take register value at the
 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then we restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch in non-premptible. We can ofcourse
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest but it makes the WA complicated.
 *
 * This WA is also required for Gen9 so extracting as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts but this field
 * helps us to have multiple batches at different offsets and select them based
 * on a criteria. At the moment this batch always start at the beginning of the page
 * and at this point we don't have multiple wa_ctx batch buffers.
 *
 * The number of WA applied are not known at the beginning; we use this field
 * to return the no of DWORDS written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
 * makes a complete batch buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{}

/*
 * NOTE(review): struct members, macro value, typedef and function
 * bodies below are elided in this listing — comments inferred from
 * names/signatures, confirm against the full source.
 */

/* Register/value pair descriptor (members elided in this view). */
struct lri {};

/* Emit MI_LOAD_REGISTER_IMM writes for @count entries of @lri. */
static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{}

/* Gen9 variant of the indirect-ctx workaround batch construction. */
static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{}

/* Size of the workaround context batch buffer (value elided here). */
#define CTX_WA_BB_SIZE

/* Allocate the shared wa_ctx object for @engine. */
static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{}

/* Free the wa_ctx object created by lrc_create_wa_ctx(). */
void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{}

/* Batch-builder function pointer type (declaration elided in this view). */
wa_bb_func_t;

/* Populate @engine's workaround context batch buffers. */
void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{}

/*
 * NOTE(review): bodies elided in this listing; comments inferred from
 * names/signatures — confirm against the full source.
 */

/* Record/handle an apparent negative runtime delta @dt in @stats. */
static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{}

/* Read the raw runtime counter from @ce's context image. */
static u32 lrc_get_runtime(const struct intel_context *ce)
{}

/* Fold the latest runtime sample into @ce's accumulated statistics. */
void lrc_update_runtime(struct intel_context *ce)
{}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif