//===-- AMDGPULowerBufferFatPointers.cpp -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers operations on buffer fat pointers (addrspace 7) to
// operations on buffer resources (addrspace 8) and is needed for correct
// codegen.
//
// # Background
//
// Address space 7 (the buffer fat pointer) is a 160-bit pointer that consists
// of a 128-bit buffer descriptor and a 32-bit offset into that descriptor.
// The buffer resource part needs to be a "raw" buffer resource (it must have
// a stride of 0 and bounds checks must be in raw buffer mode or disabled).
//
// When these requirements are met, a buffer resource can be treated as a
// typical (though quite wide) pointer that follows typical LLVM pointer
// semantics. This allows the frontend to reason about such buffers (which are
// often encountered in the context of SPIR-V kernels).
//
// However, because of their non-power-of-2 size, these fat pointers cannot be
// present during translation to MIR (though this restriction may be lifted
// during the transition to GlobalISel). Therefore, this pass is needed in
// order to correctly implement these fat pointers.
//
// The resource intrinsics take the resource part (the address space 8 pointer)
// and the offset part (the 32-bit integer) as separate arguments. In addition,
// many users of these buffers manipulate the offset while leaving the resource
// part alone. For these reasons, we typically want to separate the resource
// and offset parts into separate variables, but combine them together when
// encountering cases where this is required, such as by inserting these
// values into aggregates or moving them to memory.
//
// Therefore, at a high level, `ptr addrspace(7) %x` becomes `ptr addrspace(8)
// %x.rsrc` and `i32 %x.off`, which will be combined into `{ptr addrspace(8),
// i32} %x = {%x.rsrc, %x.off}` if needed. Similarly, `vector<Nxp7>` becomes
// `{vector<Nxp8>, vector<Nxi32>}` and its component parts.
//
// # Implementation
//
// This pass proceeds in three main phases:
//
// ## Rewriting loads and stores of p7
//
// The first phase is to rewrite away all loads and stores of
// `ptr addrspace(7)`, including aggregates containing such pointers, to ones
// that use `i160`. This is handled by `StoreFatPtrsAsIntsVisitor`, which
// visits loads, stores, and allocas and, if the loaded or stored type
// contains `ptr addrspace(7)`, rewrites that type to one where the p7s are
// replaced by i160s, copying other parts of aggregates as needed. In the case
// of a store, each pointer is `ptrtoint`d to i160 before storing, and loaded
// integers are `inttoptr`d back. This same transformation is applied to
// vectors of pointers.
//
// Such a transformation allows the later phases of the pass to not need
// to handle buffer fat pointers moving to and from memory, where we would
// have to handle the incompatibility between a `{Nxp8, Nxi32}` representation
// and an `Nxi160` one directly. Instead, that transposing action (where the
// vectors of resources and vectors of offsets are concatenated before being
// stored to memory) is handled by implementing `inttoptr` and `ptrtoint`
// only.
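//
// For example (a sketch of the intended rewrite; value names here are
// illustrative, not literal pass output):
// ```
// %v = load ptr addrspace(7), ptr %mem
// ```
// becomes
// ```
// %v.int = load i160, ptr %mem
// %v = inttoptr i160 %v.int to ptr addrspace(7)
// ```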
//
// Atomic operations on `ptr addrspace(7)` values are not supported, as the
// hardware does not include a 160-bit atomic.
//
// ## Type remapping
//
// We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers
// to the corresponding struct type, which has a resource part and an offset
// part.
//
// This uses a `BufferFatPtrToStructTypeMap` and a `FatPtrConstMaterializer`
// to perform this remapping, usually by way of `setType`ing values. Constants
// are handled here because there isn't a good way to fix them up later.
//
// This has the downside of leaving the IR in an invalid state (for example,
// the instruction `getelementptr {ptr addrspace(8), i32} %p, ...` will
// exist), but all such invalid states will be resolved by the third phase.
//
// Functions that don't take buffer fat pointers are modified in place. Those
// that do take such pointers have their basic blocks moved to a new function
// with `{ptr addrspace(8), i32}` arguments and return values. This phase also
// records intrinsics so that they can be remangled or deleted later.
//
// ## Splitting pointer structs
//
// The meat of this pass consists of defining semantics for operations that
// produce or consume [vectors of] buffer fat pointers in terms of their
// resource and offset parts. This is accomplished through the
// `SplitPtrStructs` visitor.
//
// In the first pass through each function that is being lowered, the splitter
// inserts new instructions to implement the split-structures behavior, which
// is needed for correctness and performance. It records a list of "split
// users", instructions that are being replaced by operations on the resource
// and offset parts.
//
// Split users do not necessarily need to produce parts themselves (a
// `load float, ptr addrspace(7)` does not, for example), but, if they do not
// generate fat buffer pointers, they must RAUW in their replacement
// instructions during the initial visit.
//
// When these new instructions are created, they use the split parts recorded
// for their initial arguments in order to generate their replacements,
// creating a parallel set of instructions that does not refer to the original
// fat pointer values but instead to their resource and offset components.
//
// Instructions, such as `extractvalue`, that produce buffer fat pointers from
// sources that do not have split parts, have such parts generated using
// `extractvalue`. This is also the initial handling of PHI nodes, which
// are then cleaned up.
//
// ### Conditionals
//
// PHI nodes are initially given resource parts via `extractvalue`. However,
// this is not an efficient rewrite of such nodes, as, in most cases, the
// resource part in a conditional or loop remains constant throughout the loop
// and only the offset varies. Failing to optimize away these constant
// resources would cause additional registers to be sent around loops and
// might lead to waterfall loops being generated for buffer operations due to
// the "non-uniform" resource argument.
//
// Therefore, after all instructions have been visited, the pointer splitter
// post-processes all encountered conditionals. Given a PHI node or select,
// getPossibleRsrcRoots() collects all values that the resource parts of that
// conditional's input could come from, as well as collecting all conditional
// instructions encountered during the search. If, after filtering out the
// initial node itself, the set of encountered conditionals is a subset of the
// potential roots and there is a single potential resource that isn't in the
// conditional set, that value is the only possible value the resource
// argument could have throughout the control flow.
//
// If that condition is met, then a PHI node can have its resource part
// changed to the singleton value and then be replaced by a PHI on the
// offsets. Otherwise, each PHI node is split into two, one for the resource
// part and one for the offset part, which replace the temporary
// `extractvalue` instructions that were added during the first pass.
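//
// For example (an illustrative sketch, not literal pass output), a
// loop-carried pointer
// ```
// %p = phi ptr addrspace(7) [ %base, %entry ], [ %p.next, %loop ]
// ```
// whose incoming values all share one resource part becomes
// ```
// %p.off = phi i32 [ %base.off, %entry ], [ %p.next.off, %loop ]
// ```
// with the resource part being `%base.rsrc` throughout.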
//
// Similar logic applies to `select`, where
// `%z = select i1 %cond, ptr addrspace(7) %x, ptr addrspace(7) %y`
// can be split into `%z.rsrc = %x.rsrc` and
// `%z.off = select i1 %cond, i32 %x.off, i32 %y.off`
// if both `%x` and `%y` have the same resource part, but two `select`
// operations will be needed if they do not.
//
// ### Final processing
//
// After conditionals have been cleaned up, the IR for each function is
// rewritten to remove all the old instructions that have been split up.
//
// Any instruction that used to produce a buffer fat pointer (and therefore
// now produces a resource-and-offset struct after type remapping) is
// replaced as follows:
// 1. All debug value annotations are cloned to reflect that the resource part
//    and offset parts are computed separately and constitute different
//    fragments of the underlying source language variable.
// 2. All uses that were themselves split are replaced by a `poison` of the
//    struct type, as they will themselves be erased soon. This rule, combined
//    with debug handling, should leave the use lists of split instructions
//    empty in almost all cases.
// 3. If a user of the original struct-valued result remains, the structure
//    needed for the new types to work is constructed out of the newly-defined
//    parts, and the original instruction is replaced by this structure
//    before being erased. Instructions requiring this construction include
//    `ret` and `insertvalue`.
//
// # Consequences
//
// This pass does not alter the CFG.
//
// Alias analysis information will become coarser, as the LLVM alias analyzer
// cannot handle the buffer intrinsics. Specifically, while we can determine
// that the following two loads do not alias:
// ```
// %y = getelementptr i32, ptr addrspace(7) %x, i32 1
// %a = load i32, ptr addrspace(7) %x
// %b = load i32, ptr addrspace(7) %y
// ```
// we cannot (except through some code that runs during scheduling) determine
// that the rewritten loads below do not alias.
// ```
// %y.off = add i32 %x.off, 1
// %a = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc, i32
//     %x.off, ...)
// %b = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc, i32
//     %y.off, ...)
// ```
// However, existing alias information is preserved.
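//
// For example (illustrative), alias metadata such as `!tbaa` or
// `!alias.scope` attached to an original `load` is copied onto the buffer
// intrinsic call that replaces it.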
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "SIDefines.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

#define DEBUG_TYPE …

using namespace llvm;

static constexpr unsigned BufferOffsetWidth = …;

namespace {
/// Recursively replace instances of ptr addrspace(7) and vector<Nxptr
/// addrspace(7)> with some other type as defined by the relevant subclass.
class BufferFatPtrTypeLoweringBase : public ValueMapTypeRemapper { … };

/// Remap ptr addrspace(7) to i160 and vector<Nxptr addrspace(7)> to
/// vector<Nxi160> in order to correctly handle loading/storing these values
/// from memory.
class BufferFatPtrToIntTypeMap : public BufferFatPtrTypeLoweringBase { … };

/// Remap ptr addrspace(7) to {ptr addrspace(8), i32} (the resource and offset
/// parts of the pointer) so that we can easily rewrite operations on these
/// values that aren't loading them from or storing them to memory.
class BufferFatPtrToStructTypeMap : public BufferFatPtrTypeLoweringBase { … };
} // namespace

// This code is adapted from the type remapper in lib/Linker/IRMover.cpp
Type *BufferFatPtrTypeLoweringBase::remapTypeImpl(
    Type *Ty, SmallPtrSetImpl<StructType *> &Seen) { … }

Type *BufferFatPtrTypeLoweringBase::remapType(Type *SrcTy) { … }

Type *BufferFatPtrToStructTypeMap::remapScalar(PointerType *PT) { … }

Type *BufferFatPtrToStructTypeMap::remapVector(VectorType *VT) { … }

static bool isBufferFatPtrOrVector(Type *Ty) { … }

// True if the type is {ptr addrspace(8), i32} or a struct containing vectors
// of those types. Used to quickly skip instructions we don't need to process.
static bool isSplitFatPtr(Type *Ty) { … }

// True if the result type or any argument types are buffer fat pointers.
static bool isBufferFatPtrConst(Constant *C) { … }

namespace {
/// Convert [vectors of] buffer fat pointers to integers when they are read
/// from or stored to memory. This ensures that these pointers will have the
/// same memory layout as before they are lowered, even though they will no
/// longer have their previous layout in registers/in the program (they'll be
/// broken down into resource and offset parts). This has the downside of
/// imposing marshalling costs when reading or storing these values, but since
/// placing such pointers into memory is an uncommon operation at best, we
/// feel that this cost is acceptable for better performance in the common
/// case.
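///
/// For example (an illustrative sketch; `%ps` and `%mem` are assumed names),
/// storing a `<2 x ptr addrspace(7)>` becomes, roughly:
///   %ps.int = ptrtoint <2 x ptr addrspace(7)> %ps to <2 x i160>
///   store <2 x i160> %ps.int, ptr %mem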
class StoreFatPtrsAsIntsVisitor
    : public InstVisitor<StoreFatPtrsAsIntsVisitor, bool> { … };
} // namespace

Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
                                                const Twine &Name) { … }

Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
                                                const Twine &Name) { … }

bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) { … }

bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) { … }

bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { … }

bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) { … }

bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) { … }

/// Return the ptr addrspace(8) and i32 (resource and offset parts) in a
/// lowered buffer fat pointer constant.
static std::pair<Constant *, Constant *>
splitLoweredFatBufferConst(Constant *C) { … }

namespace {
/// Handle the remapping of ptr addrspace(7) constants.
class FatPtrConstMaterializer final : public ValueMaterializer { … };
} // namespace

Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) { … }

Value *FatPtrConstMaterializer::materialize(Value *V) { … }

using PtrParts = std::pair<Value *, Value *>;

namespace {
// The visitor returns the resource and offset parts for an instruction if
// they can be computed, or (nullptr, nullptr) for cases that don't have a
// meaningful value mapping.
class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> { … };
} // namespace

void SplitPtrStructs::copyMetadata(Value *Dest, Value *Src) { … }

PtrParts SplitPtrStructs::getPtrParts(Value *V) { … }

/// Returns the instruction that defines the resource part of the value V.
/// Note that this is not getUnderlyingObject(), since that looks through
/// operations like ptrmask which might modify the resource part.
///
/// We can limit ourselves to just looking through GEPs followed by looking
/// through addrspacecasts because only those two operations preserve the
/// resource part, and because operations on an `addrspace(8)` (which is the
/// legal input to this addrspacecast) would produce a different resource
/// part.
static Value *rsrcPartRoot(Value *V) { … }

void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I,
                                           SmallPtrSetImpl<Value *> &Roots,
                                           SmallPtrSetImpl<Value *> &Seen) { … }

void SplitPtrStructs::processConditionals() { … }

void SplitPtrStructs::killAndReplaceSplitInstructions(
    SmallVectorImpl<Instruction *> &Origs) { … }

void SplitPtrStructs::setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx) { … }

void SplitPtrStructs::insertPreMemOpFence(AtomicOrdering Order,
                                          SyncScope::ID SSID) { … }

void SplitPtrStructs::insertPostMemOpFence(AtomicOrdering Order,
                                           SyncScope::ID SSID) { … }

Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
                                         Type *Ty, Align Alignment,
                                         AtomicOrdering Order, bool IsVolatile,
                                         SyncScope::ID SSID) { … }

PtrParts SplitPtrStructs::visitInstruction(Instruction &I) { … }

PtrParts SplitPtrStructs::visitLoadInst(LoadInst &LI) { … }

PtrParts SplitPtrStructs::visitStoreInst(StoreInst &SI) { … }

PtrParts SplitPtrStructs::visitAtomicRMWInst(AtomicRMWInst &AI) { … }

// Unlike load, store, and RMW, cmpxchg needs special handling to account
// for the boolean argument.
PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) { … }

PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) { … }

PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) { … }

PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) { … }

PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { … }

PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) { … }

PtrParts SplitPtrStructs::visitFreezeInst(FreezeInst &I) { … }

PtrParts SplitPtrStructs::visitExtractElementInst(ExtractElementInst &I) { … }

PtrParts SplitPtrStructs::visitInsertElementInst(InsertElementInst &I) { … }

PtrParts SplitPtrStructs::visitShuffleVectorInst(ShuffleVectorInst &I) { … }

PtrParts SplitPtrStructs::visitPHINode(PHINode &PHI) { … }

PtrParts SplitPtrStructs::visitSelectInst(SelectInst &SI) { … }

/// Returns true if this intrinsic needs to be removed when it is
/// applied to `ptr addrspace(7)` values. Calls to these intrinsics are
/// rewritten into calls to versions of that intrinsic on the resource
/// descriptor.
static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) { … }

PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { … }

void SplitPtrStructs::processFunction(Function &F) { … }

namespace {
class AMDGPULowerBufferFatPointers : public ModulePass { … };
} // namespace

/// Returns true if there are values that have a buffer fat pointer in them,
/// which means we'll need to perform rewrites on this function. As a side
/// effect, this will populate the type remapping cache.
static bool containsBufferFatPointers(const Function &F,
                                      BufferFatPtrToStructTypeMap *TypeMap) { … }

static bool hasFatPointerInterface(const Function &F,
                                   BufferFatPtrToStructTypeMap *TypeMap) { … }

/// Move the body of `OldF` into a new function, returning it.
static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy,
                                          ValueToValueMapTy &CloneMap) { … }

static void makeCloneInPlaceMap(Function *F, ValueToValueMapTy &CloneMap) { … }

bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { … }

bool AMDGPULowerBufferFatPointers::runOnModule(Module &M) { … }

char AMDGPULowerBufferFatPointers::ID = …;

char &llvm::AMDGPULowerBufferFatPointersID = …;

void AMDGPULowerBufferFatPointers::getAnalysisUsage(AnalysisUsage &AU) const { … }

#define PASS_DESC …
INITIALIZE_PASS_BEGIN(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC,
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC, false,
                    false)
#undef PASS_DESC

ModulePass *llvm::createAMDGPULowerBufferFatPointersPass() { … }

PreservedAnalyses
AMDGPULowerBufferFatPointersPass::run(Module &M, ModuleAnalysisManager &MA) { … }