//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file AMDKernelCodeT.h //===----------------------------------------------------------------------===// #ifndef AMDKERNELCODET_H #define AMDKERNELCODET_H #include <cstdint> //---------------------------------------------------------------------------// // AMD Kernel Code, and its dependencies // //---------------------------------------------------------------------------// hsa_powertwo8_t; hsa_ext_code_kind_t; hsa_ext_brig_profile8_t; hsa_ext_brig_machine_model8_t; hsa_ext_control_directive_present64_t; hsa_ext_exception_kind16_t; hsa_ext_code_kind32_t; hsa_dim3_t; /// The version of the amd_*_code_t struct. Minor versions must be /// backward compatible. amd_code_version32_t; enum amd_code_version_t { … }; // Sets val bits for specified mask in specified dst packed instance. #define AMD_HSA_BITS_SET(dst, mask, val) … // Gets bits for specified mask from specified src packed instance. #define AMD_HSA_BITS_GET(src, mask) … \ /// The values used to define the number of bytes to use for the /// swizzle element size. enum amd_element_byte_size_t { … }; /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and /// COMPUTE_PGM_RSRC2 registers. amd_compute_pgm_resource_register64_t; /// Every amd_*_code_t has the following properties, which are composed of /// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), /// bit width (AMD_CODE_PROPERTY_*_WIDTH), and bit shift amount /// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0. 
/// /// (Note that bit fields cannot be used as their layout is /// implementation defined in the C standard and so cannot be used to /// specify an ABI) amd_code_property32_t; enum amd_code_property_mask_t { … }; /// The hsa_ext_control_directives_t specifies the values for the HSAIL /// control directives. These control how the finalizer generates code. This /// struct is used both as an argument to hsaFinalizeKernel to specify values for /// the control directives, and is used in HsaKernelCode to record the values of /// the control directives that the finalizer used when generating the code which /// either came from the finalizer argument or explicit HSAIL control /// directives. See the definition of the control directives in HSA Programmer's /// Reference Manual which also defines how the values specified as finalizer /// arguments have to agree with the control directives in the HSAIL code. hsa_ext_control_directives_t; /// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel /// Code Object to set up the hardware to execute the kernel dispatch. /// /// Initial Kernel Register State. /// /// Initial kernel register state will be set up by CP/SPI prior to the start /// of execution of every wavefront. This is limited by the constraints of the /// current hardware. /// /// The order of the SGPR registers is defined, but the Finalizer can specify /// which ones are actually setup in the amd_kernel_code_t object using the /// enable_sgpr_* bit fields. The register numbers used for enabled registers /// are dense starting at SGPR0: the first enabled register is SGPR0, the next /// enabled register is SGPR1 etc.; disabled registers do not have an SGPR /// number. /// /// The initial SGPRs comprise up to 16 User SGPRs that are set up by CP and /// apply to all waves of the grid. It is possible to specify more than 16 User /// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 /// are actually initialized. 
These are then immediately followed by the System /// SGPRs that are set up by ADC/SPI and can have different values for each wave /// of the grid dispatch. /// /// SGPR register initial state is defined as follows: /// /// Private Segment Buffer (enable_sgpr_private_segment_buffer): /// Number of User SGPR registers: 4. V# that can be used, together with /// Scratch Wave Offset as an offset, to access the Private/Spill/Arg /// segments using a segment address. It must be set as follows: /// - Base address: of the scratch memory area used by the dispatch. It /// does not include the scratch wave offset. It will be the per process /// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for /// example there may be a per pipe offset, or per AQL Queue offset). /// - Stride + data_format: Element Size * Index Stride (???) /// - Cache swizzle: ??? /// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for /// scratch) /// - Num records: Flat Scratch Work Item Size / Element Size (???) /// - Dst_sel_*: ??? /// - Num_format: ??? /// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must /// agree with amd_kernel_code_t.privateElementSize) /// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must /// be number of wavefront lanes for scratch, must agree with /// amd_kernel_code_t.wavefrontSize) /// - Add tid enable: 1 /// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, /// - Hash_enable: ??? /// - Heap: ??? /// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE /// - Type: 0 (a buffer) (???) /// /// Dispatch Ptr (enable_sgpr_dispatch_ptr): /// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet /// for kernel actually executing. /// /// Queue Ptr (enable_sgpr_queue_ptr): /// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for /// AQL queue on which the dispatch packet was queued. /// /// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): /// Number of User SGPR registers: 2. 
64 bit address of Kernarg segment. This /// is directly copied from the kernargPtr in the dispatch packet. Having CP /// load it once avoids loading it at the beginning of every wavefront. /// /// Dispatch Id (enable_sgpr_dispatch_id): /// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch /// packet being executed. /// /// Flat Scratch Init (enable_sgpr_flat_scratch_init): /// Number of User SGPR registers: 2. This is 2 SGPRs. /// /// For CI/VI: /// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE /// to base of memory for scratch for this dispatch. This is the same offset /// used in computing the Scratch Segment Buffer base address. The value of /// Scratch Wave Offset must be added by the kernel code and moved to /// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions. /// /// The second SGPR is 32 bit byte size of a single work-item's scratch /// memory usage. This is directly loaded from the dispatch packet Private /// Segment Byte Size and rounded up to a multiple of DWORD. /// /// \todo [Does CP need to round this to >4 byte alignment?] /// /// The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in /// flat memory instructions. Having CP load it once avoids loading it at /// the beginning of every wavefront. /// /// For PI: /// This is the 64 bit base address of the scratch backing memory /// allocated by CP for this dispatch. /// /// Private Segment Size (enable_sgpr_private_segment_size): /// Number of User SGPR registers: 1. The 32 bit byte size of a single /// work-item's scratch memory allocation. This is the value from the dispatch /// packet Private Segment Byte Size rounded up by CP to a multiple of DWORD. /// /// \todo [Does CP need to round this to >4 byte alignment?] /// /// Having CP load it once avoids loading it at the beginning of every /// wavefront. 
/// /// \todo [This will not be used for CI/VI since it is the same value as /// the second SGPR of Flat Scratch Init. However, it is needed for PI which /// changes the meaning of Flat Scratch Init.] /// /// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): /// Number of User SGPR registers: 1. 32 bit count of the number of /// work-groups in the X dimension for the grid being executed. Computed from /// the fields in the HsaDispatchPacket as /// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x). /// /// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): /// Number of User SGPR registers: 1. 32 bit count of the number of /// work-groups in the Y dimension for the grid being executed. Computed from /// the fields in the HsaDispatchPacket as /// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y). /// /// Only initialized if <16 previous SGPRs initialized. /// /// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): /// Number of User SGPR registers: 1. 32 bit count of the number of /// work-groups in the Z dimension for the grid being executed. Computed /// from the fields in the HsaDispatchPacket as /// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z). /// /// Only initialized if <16 previous SGPRs initialized. /// /// Work-Group Id X (enable_sgpr_workgroup_id_x): /// Number of System SGPR registers: 1. 32 bit work group id in X dimension /// of grid for wavefront. Always present. /// /// Work-Group Id Y (enable_sgpr_workgroup_id_y): /// Number of System SGPR registers: 1. 32 bit work group id in Y dimension /// of grid for wavefront. /// /// Work-Group Id Z (enable_sgpr_workgroup_id_z): /// Number of System SGPR registers: 1. 32 bit work group id in Z dimension /// of grid for wavefront. If present then Work-group Id Y will also be /// present. /// /// Work-Group Info (enable_sgpr_workgroup_info): /// Number of System SGPR registers: 1. 
{first_wave, 14'b0000, /// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} /// /// Private Segment Wave Byte Offset /// (enable_sgpr_private_segment_wave_byte_offset): /// Number of System SGPR registers: 1. 32 bit byte offset from base of /// dispatch scratch base. Must be used as an offset with Private/Spill/Arg /// segment address when using Scratch Segment Buffer. It must be added to /// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. /// /// /// The order of the VGPR registers is defined, but the Finalizer can specify /// which ones are actually setup in the amd_kernel_code_t object using the /// enableVgpr* bit fields. The register numbers used for enabled registers /// are dense starting at VGPR0: the first enabled register is VGPR0, the next /// enabled register is VGPR1 etc.; disabled registers do not have a VGPR /// number. /// /// VGPR register initial state is defined as follows: /// /// Work-Item Id X (always initialized): /// Number of registers: 1. 32 bit work item id in X dimension of work-group /// for wavefront lane. /// /// Work-Item Id Y (enable_vgpr_workitem_id > 0): /// Number of registers: 1. 32 bit work item id in Y dimension of work-group /// for wavefront lane. /// /// Work-Item Id Z (enable_vgpr_workitem_id > 1): /// Number of registers: 1. 32 bit work item id in Z dimension of work-group /// for wavefront lane. /// /// /// The setting of registers is being done by existing GPU hardware as follows: /// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data /// registers. /// 2) Work-group Id registers X, Y, Z are set by SPI which supports any /// combination including none. /// 3) Scratch Wave Offset is also set by SPI which is why its value cannot /// be added into the value Flat Scratch Offset which would avoid the /// Finalizer generated prolog having to do the add. /// 4) The VGPRs are set by SPI which only supports specifying either (X), /// (X, Y) or (X, Y, Z). 
/// /// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGPRs so /// they can be moved as a 64 bit value to the hardware required SGPRn-3 and /// SGPRn-4 respectively using the Finalizer "FLAT_SCRATCH" Register. /// /// The global segment can be accessed either using flat operations or buffer /// operations. If buffer operations are used then the Global Buffer used to /// access HSAIL Global/Readonly/Kernarg (which are combined) segments using a /// segment address is not passed into the kernel code by CP since its base /// address is always 0. Instead the Finalizer generates prolog code to /// initialize 4 SGPRs with a V# that has the following properties, and then /// uses that in the buffer instructions: /// - base address of 0 /// - no swizzle /// - ATC=1 /// - MTYPE set to support memory coherence specified in /// amd_kernel_code_t.globalMemoryCoherence /// /// When the Global Buffer is used to access the Kernarg segment, must add the /// dispatch packet kernArgPtr to a kernarg segment address before using this V#. /// Alternatively scalar loads can be used if the kernarg offset is uniform, as /// the kernarg segment is constant for the duration of the kernel execution. /// struct amd_kernel_code_t { … }; #endif // AMDKERNELCODET_H