//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Loop unrolling may create many similar GEPs for array accesses.
// e.g., a 2-level loop
//
// float a[32][32]; // global variable
//
// for (int i = 0; i < 2; ++i) {
//   for (int j = 0; j < 2; ++j) {
//     ...
//     ... = a[x + i][y + j];
//     ...
//   }
// }
//
// will probably be unrolled to:
//
//   gep %a, 0, %x, %y; load
//   gep %a, 0, %x, %y + 1; load
//   gep %a, 0, %x + 1, %y; load
//   gep %a, 0, %x + 1, %y + 1; load
//
// LLVM's GVN does not use partial redundancy elimination yet, and is thus
// unable to reuse (gep %a, 0, %x, %y). As a result, this missed optimization
// incurs a significant slowdown on targets with limited addressing modes. For
// instance, because the PTX target does not support the reg+reg addressing
// mode, the NVPTX backend emits PTX code that literally computes the pointer
// address of each GEP, wasting tons of registers. It emits the following PTX
// for the first load and similar PTX for the other loads.
//
//   mov.u32         %r1, %x;
//   mov.u32         %r2, %y;
//   mul.wide.u32    %rl2, %r1, 128;
//   mov.u64         %rl3, a;
//   add.s64         %rl4, %rl3, %rl2;
//   mul.wide.u32    %rl5, %r2, 4;
//   add.s64         %rl6, %rl4, %rl5;
//   ld.global.f32   %f1, [%rl6];
//
// To reduce register pressure, the optimization implemented in this file
// merges the common part of a group of GEPs, so we can compute each pointer
// address by adding a simple offset to the common part, saving many registers.
//
// It works by splitting each GEP into a variadic base and a constant offset.
// The variadic base can be computed once and reused by multiple GEPs, and the
// constant offsets can be nicely folded into the reg+immediate addressing mode
// (supported by most targets) without using any extra register.
//
// For instance, we transform the four GEPs and four loads in the above example
// into:
//
//   base = gep a, 0, x, y
//   load base
//   load base + 1  * sizeof(float)
//   load base + 32 * sizeof(float)
//   load base + 33 * sizeof(float)
//
// Given the transformed IR, a backend that supports the reg+immediate
// addressing mode can easily fold the pointer arithmetic into the loads. For
// example, the NVPTX backend can fold the pointer arithmetic into the
// ld.global.f32 instructions, and the resultant PTX uses far fewer registers.
//
//   mov.u32         %r1, %x;
//   mov.u32         %r2, %y;
//   mul.wide.u32    %rl2, %r1, 128;
//   mov.u64         %rl3, a;
//   add.s64         %rl4, %rl3, %rl2;
//   mul.wide.u32    %rl5, %r2, 4;
//   add.s64         %rl6, %rl4, %rl5;
//   ld.global.f32   %f1, [%rl6];     // so far the same as unoptimized PTX
//   ld.global.f32   %f2, [%rl6+4];   // much better
//   ld.global.f32   %f3, [%rl6+128]; // much better
//   ld.global.f32   %f4, [%rl6+132]; // much better
//
// Another improvement enabled by the LowerGEP flag is to lower a GEP with
// multiple indices to either multiple GEPs with a single index or arithmetic
// operations (depending on whether the target uses alias analysis in codegen).
// Such a transformation can have the following benefits:
// (1) It can always extract constants in the indices of structure type.
// (2) After such lowering, there are more optimization opportunities such as
//     CSE, LICM and CGP.
//
// E.g. the following GEPs have multiple indices:
// BB1:
//   %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
//   load %p
//   ...
// BB2:
//   %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j2, i32 2
//   load %p2
//   ...
//
// We cannot CSE the common part related to the index "i64 %i". Lowering the
// GEPs makes that possible.
//
// If the target does not use alias analysis in codegen, this pass will
// lower a GEP with multiple indices into arithmetic operations:
// BB1:
//   %1 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
//   %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//   %3 = add i64 %1, %2                          ; CSE opportunity
//   %4 = mul i64 %j1, length_of_struct
//   %5 = add i64 %3, %4
//   %6 = add i64 %5, struct_field_3              ; Constant offset
//   %p = inttoptr i64 %6 to i32*
//   load %p
//   ...
// BB2:
//   %7 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
//   %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//   %9 = add i64 %7, %8                          ; CSE opportunity
//   %10 = mul i64 %j2, length_of_struct
//   %11 = add i64 %9, %10
//   %12 = add i64 %11, struct_field_2            ; Constant offset
//   %p2 = inttoptr i64 %12 to i32*
//   load %p2
//   ...
//
// If the target uses alias analysis in codegen, this pass will lower a GEP
// with multiple indices into multiple GEPs with a single index:
// BB1:
//   %1 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
//   %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//   %3 = getelementptr i8* %1, i64 %2            ; CSE opportunity
//   %4 = mul i64 %j1, length_of_struct
//   %5 = getelementptr i8* %3, i64 %4
//   %6 = getelementptr i8* %5, struct_field_3    ; Constant offset
//   %p = bitcast i8* %6 to i32*
//   load %p
//   ...
// BB2:
//   %7 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
//   %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//   %9 = getelementptr i8* %7, i64 %8            ; CSE opportunity
//   %10 = mul i64 %j2, length_of_struct
//   %11 = getelementptr i8* %9, i64 %10
//   %12 = getelementptr i8* %11, struct_field_2  ; Constant offset
//   %p2 = bitcast i8* %12 to i32*
//   load %p2
//   ...
//
// Lowering GEPs can also benefit other passes such as LICM and CGP.
// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
// indices if one of the indices is variant. If we lower such a GEP into
// invariant parts and variant parts, LICM can hoist/sink those invariant
// parts.
// CGP (CodeGen Prepare) tries to sink address calculations that match the
// target's addressing modes. A GEP with multiple indices may not match and
// will not be sunk. If we lower such a GEP into smaller parts, CGP may sink
// some of them, so we end up with a better addressing mode.
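//
// As an illustrative sketch of the basic split (not literal pass output, and
// assuming opaque pointers): a GEP whose index is "%i + 5" is conceptually
// rewritten from
//
//   %idx = add i64 %i, 5
//   %p   = getelementptr inbounds float, ptr %a, i64 %idx
//
// into a variadic base plus a trailing constant byte offset:
//
//   %base = getelementptr float, ptr %a, i64 %i
//   %p    = getelementptr i8, ptr %base, i64 20  ; 5 * sizeof(float)
//
// The exact opcodes and flags in the output depend on the target and on the
// pass options described above.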
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <string>

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
    "disable-separate-const-offset-from-gep", cl::init(false),
    cl::desc("Do not separate the constant offset from a GEP instruction"),
    cl::Hidden);

// Setting this flag may emit false positives when the input module already
// contains dead instructions. Therefore, we set it only in unit tests that
// are free of dead code.
static cl::opt<bool>
    VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
                     cl::desc("Verify this pass produces no dead code"),
                     cl::Hidden);

namespace {

/// A helper class for separating a constant offset from a GEP index.
///
/// In real programs, a GEP index may be more complicated than a simple
/// addition of something and a constant integer which can be trivially split.
/// For example, to split ((a << 3) | 5) + b, we need to search deeper for the
/// constant offset, so that we can separate the index into (a << 3) + b and 5.
///
/// Therefore, this class looks into the expression that computes a given GEP
/// index, and tries to find a constant integer that can be hoisted to the
/// outermost level of the expression as an addition. Not every constant in an
/// expression can jump out. E.g., we cannot transform (b * (a + 5)) to
/// (b * a + 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), although in
/// this case -instcombine has probably already optimized (3 * (a + 5)) to
/// (3 * a + 15).
class ConstantOffsetExtractor { … };

/// A legacy pass that tries to split every GEP in the function into a
/// variadic base and a constant offset. It is a FunctionPass because
/// searching for the constant offset may inspect other basic blocks.
class SeparateConstOffsetFromGEPLegacyPass : public FunctionPass { … };

/// The implementation shared by the legacy and new-pass-manager passes. It
/// tries to split every GEP in the function into a variadic base and a
/// constant offset; searching for the constant offset may inspect other
/// basic blocks.
class SeparateConstOffsetFromGEP { … };

} // end anonymous namespace

char SeparateConstOffsetFromGEPLegacyPass::ID = …;

INITIALIZE_PASS_BEGIN(
    SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
    "Split GEPs to a variadic base and a constant offset for better CSE",
    false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(
    SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
    "Split GEPs to a variadic base and a constant offset for better CSE",
    false, false)

FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) { … }

bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, bool ZeroExtended,
                                           BinaryOperator *BO,
                                           bool NonNegative) { … }

APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
                                                   bool SignExtended,
                                                   bool ZeroExtended) { … }

APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
                                    bool ZeroExtended, bool NonNegative) { … }

Value *ConstantOffsetExtractor::applyExts(Value *V) { … }

Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { … }

Value *
ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) { … }

Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { … }

Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
                                        User *&UserChainTail) { … }

int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) { … }

bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToIndexSize(
    GetElementPtrInst *GEP) { … }

int64_t
SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
                                                 bool &NeedsExtraction) { … }

void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
    GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) { … }

void SeparateConstOffsetFromGEP::lowerToArithmetics(
    GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) { … }

bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                            TargetTransformInfo &TTI) { … }

bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { … }

bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) { … }

bool SeparateConstOffsetFromGEP::run(Function &F) { … }

Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
    ExprKey Key, Instruction *Dominatee,
    DenseMap<ExprKey, SmallVector<Instruction *, 2>> &DominatingExprs) { … }

bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) { … }

bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) { … }

void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) { … }

bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
    GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP,
    Loop *CurLoop) { … }

bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V,
                                                         Loop *L) { … }

void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
                                                GetElementPtrInst *Second) { … }

void SeparateConstOffsetFromGEPPass::printPipeline(
    raw_ostream &OS,
    function_ref<StringRef(StringRef)> MapClassName2PassName) { … }

PreservedAnalyses
SeparateConstOffsetFromGEPPass::run(Function &F,
                                    FunctionAnalysisManager &AM) { … }
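// Usage sketch (illustrative, not part of this file): with the new pass
// manager, the pass can be scheduled directly; the constructor argument below
// selects the LowerGEP mode described in the file header.
//
//   FunctionPassManager FPM;
//   FPM.addPass(SeparateConstOffsetFromGEPPass(/*LowerGEP=*/true));
//
// With the legacy pass manager, use createSeparateConstOffsetFromGEPPass()
// declared in llvm/Transforms/Scalar.h.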