//===-- AMDGPULowerModuleLDSPass.cpp ----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates local data store, LDS, uses from non-kernel functions.
// LDS is contiguous memory allocated per kernel execution.
//
// Background.
//
// The programming model is global variables, or equivalently function local
// static variables, accessible from kernels or other functions. For uses from
// kernels this is straightforward - assign an integer to the kernel for the
// memory required by all the variables combined, allocate them within that.
// For uses from functions there are performance tradeoffs to choose between.
//
// This model means the GPU runtime can specify the amount of memory allocated.
// If this is more than the kernel assumed, the excess can be made available
// using a language specific feature, which IR represents as a variable with
// no initializer. This feature is referred to here as "Dynamic LDS" and is
// lowered slightly differently to the normal case.
//
// Consequences of this GPU feature:
// - memory is limited and exceeding it halts compilation
// - a global accessed by one kernel exists independent of other kernels
// - a global exists independent of simultaneous execution of the same kernel
// - the address of the global may be different from different kernels as they
//   do not alias, which permits only allocating variables they use
// - if the address is allowed to differ, functions need help to find it
//
// Uses from kernels are implemented here by grouping them in a per-kernel
// struct instance. This duplicates the variables, accurately modelling their
// aliasing properties relative to a single global representation.
// It also
// permits control over alignment via padding.
//
// Uses from functions are more complicated and the primary purpose of this
// IR pass. Several different lowerings are chosen between to meet requirements
// to avoid allocating any LDS where it is not necessary, as that impacts
// occupancy and may fail the compilation, while not imposing overhead on a
// feature whose primary advantage over global memory is performance. The basic
// design goal is to avoid one kernel imposing overhead on another.
//
// Implementation.
//
// LDS variables with constant annotation or non-undef initializer are passed
// through unchanged for simplification or error diagnostics in later passes.
// Non-undef initializers are not yet implemented for LDS.
//
// LDS variables that are always allocated at the same address can be found
// by lookup at that address. Otherwise runtime information/cost is required.
//
// The simplest strategy possible is to group all LDS variables in a single
// struct and allocate that struct in every kernel such that the original
// variables are always at the same address. LDS is however a limited resource
// so this strategy is unusable in practice. It is not implemented here.
//
//   Strategy | Precise allocation | Zero runtime cost | General purpose |
//  ----------+--------------------+-------------------+-----------------+
//   Module   | No                 | Yes               | Yes             |
//   Table    | Yes                | No                | Yes             |
//   Kernel   | Yes                | Yes               | No              |
//   Hybrid   | Yes                | Partial           | Yes             |
//
// "Module" spends LDS memory to save cycles. "Table" spends cycles and global
// memory to save LDS. "Kernel" is as fast as kernel allocation but only works
// for variables that are known reachable from a single kernel. "Hybrid" picks
// between all three. When forced to choose between LDS and cycles we minimise
// LDS use.
// The "module" lowering implemented here finds LDS variables which are used by
// non-kernel functions and creates a new struct with a field for each of those
// LDS variables.
// Variables that are only used from kernels are excluded.
//
// The "table" lowering implemented here has three components.
// First kernels are assigned a unique integer identifier which is available in
// functions it calls through the intrinsic amdgcn_lds_kernel_id. The integer
// is passed through a specific SGPR, thus works with indirect calls.
// Second, each kernel allocates LDS variables independent of other kernels and
// writes the addresses it chose for each variable into an array in consistent
// order. If the kernel does not allocate a given variable, it writes undef to
// the corresponding array location. These arrays are written to a constant
// table in the order matching the kernel unique integer identifier.
// Third, uses from non-kernel functions are replaced with a table lookup using
// the intrinsic function to find the address of the variable.
//
// "Kernel" lowering is only applicable for variables that are unambiguously
// reachable from exactly one kernel. For those cases, accesses to the variable
// can be lowered to ConstantExpr address of a struct instance specific to that
// one kernel. This is zero cost in space and in compute. It will raise a fatal
// error on any variable that might be reachable from multiple kernels and is
// thus most easily used as part of the hybrid lowering strategy.
//
// Hybrid lowering is a mixture of the above. It uses the zero cost kernel
// lowering where it can. It lowers the variable accessed by the greatest
// number of kernels using the module strategy as that is free for the first
// variable. Any further variables that can be lowered with the module strategy
// without incurring LDS memory overhead are. The remaining ones are lowered
// via table.
//
// Consequences
// - No heuristics or user controlled magic numbers, hybrid is the right choice
// - Kernels that don't use functions (or have had them all inlined) are not
//   affected by any lowering for kernels that do.
// - Kernels that don't make indirect function calls are not affected by those
//   that do.
// - Variables which are used by lots of kernels, e.g. those injected by a
//   language runtime in most kernels, are expected to have no overhead
// - Implementations that instantiate templates per-kernel where those templates
//   use LDS are expected to hit the "Kernel" lowering strategy
// - The runtime properties impose a cost in compiler implementation complexity
//
// Dynamic LDS implementation
// Dynamic LDS is lowered similarly to the "table" strategy above and uses the
// same intrinsic to identify which kernel is at the root of the dynamic call
// graph. This relies on the specified behaviour that all dynamic LDS variables
// alias one another, i.e. are at the same address, with respect to a given
// kernel. Therefore this pass creates new dynamic LDS variables for each kernel
// that allocates any dynamic LDS and builds a table of addresses out of those.
// The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS.
// The corresponding optimisation for "kernel" lowering where the table lookup
// is elided is not implemented.
//
//
// Implementation notes / limitations
// A single LDS global variable represents an instance per kernel that can reach
// said variables. This pass essentially specialises said variables per kernel.
// Handling ConstantExpr during the pass complicated this significantly so now
// all ConstantExpr uses of LDS variables are expanded to instructions. This
// may need amending when implementing non-undef initialisers.
//
// Lowering is split between this IR pass and the back end. This pass chooses
// where given variables should be allocated and marks them with metadata,
// MD_absolute_symbol. The backend places the variables in coincidentally the
// same location and raises a fatal error if something has gone awry.
// This works
// in practice because the only pass between this one and the backend that
// changes LDS is PromoteAlloca and the changes it makes do not conflict.
//
// Addresses are written to constant global arrays based on the same metadata.
//
// The backend lowers LDS variables in the order of traversal of the function.
// This is at odds with the deterministic layout required. The workaround is to
// allocate the fixed-address variables immediately upon starting the function
// where they can be placed as intended. This requires a means of mapping from
// the function to the variables that it allocates. For the module scope lds,
// this is via metadata indicating whether the variable is not required. If a
// pass deletes that metadata, a fatal error on disagreement with the absolute
// symbol metadata will occur. For kernel scope and dynamic, this is by _name_
// correspondence between the function and the variable. It requires the
// kernel to have a name (which is only a limitation for tests in practice) and
// for nothing to rename the corresponding symbols. This is a hazard if the pass
// is run multiple times during debugging. Alternative schemes considered all
// involve bespoke metadata.
//
// If the name correspondence can be replaced, multiple distinct kernels that
// have the same memory layout can map to the same kernel id (as the address
// itself is handled by the absolute symbol metadata) and that will allow more
// uses of the "kernel" style faster lowering and reduce the size of the lookup
// tables.
//
// There is a test that checks this does not fire for a graphics shader. This
// lowering is expected to work for graphics if the isKernel test is changed.
//
// The current markUsedByKernel is sufficient for PromoteAlloca but is elided
// before codegen. Replacing this with an equivalent intrinsic which lasts until
// shortly after the machine function lowering of LDS would help break the name
// mapping.
The other part needed is probably to amend PromoteAlloca to embed // the LDS variables it creates in the same struct created here. That avoids the // current hazard where a PromoteAlloca LDS variable might be allocated before // the kernel scope (and thus error on the address check). Given a new invariant // that no LDS variables exist outside of the structs managed here, and an // intrinsic that lasts until after the LDS frame lowering, it should be // possible to drop the name mapping and fold equivalent memory layouts. // //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUMemoryUtils.h" #include "AMDGPUTargetMachine.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/ReplaceConstant.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" #include "llvm/Support/OptimizedStructLayout.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include <vector> #include <cstdio> #define DEBUG_TYPE … usingnamespacellvm; usingnamespaceAMDGPU; namespace { cl::opt<bool> SuperAlignLDSGlobals( "amdgpu-super-align-lds-globals", cl::desc("Increase alignment of LDS if it is not on align boundary"), cl::init(true), cl::Hidden); enum class LoweringKind { … }; cl::opt<LoweringKind> LoweringKindLoc( "amdgpu-lower-module-lds-strategy", cl::desc("Specify lowering 
strategy for function LDS access:"), cl::Hidden, cl::init(LoweringKind::hybrid), cl::values( clEnumValN(LoweringKind::table, "table", "Lower via table lookup"), clEnumValN(LoweringKind::module, "module", "Lower via module struct"), clEnumValN( LoweringKind::kernel, "kernel", "Lower variables reachable from one kernel, otherwise abort"), clEnumValN(LoweringKind::hybrid, "hybrid", "Lower via mixture of above strategies"))); template <typename T> std::vector<T> sortByName(std::vector<T> &&V) { … } class AMDGPULowerModuleLDS { … }; class AMDGPULowerModuleLDSLegacy : public ModulePass { … }; } // namespace char AMDGPULowerModuleLDSLegacy::ID = …; char &llvm::AMDGPULowerModuleLDSLegacyPassID = …; INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE, "Lower uses of LDS variables from non-kernel functions", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE, "Lower uses of LDS variables from non-kernel functions", false, false) ModulePass * llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) { … } PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M, ModuleAnalysisManager &) { … }