//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass implements an idiom recognizer that transforms simple loops into a // non-loop form. In cases that this kicks in, it can be a significant // performance win. // // If compiling for code size we avoid idiom recognition if the resulting // code could be larger than the code for the original loop. One way this could // happen is if the loop is not removable after idiom recognition due to the // presence of non-idiom instructions. The initial implementation of the // heuristics applies to idioms in multi-block loops. // //===----------------------------------------------------------------------===// // // TODO List: // // Future loop memory idioms to recognize: // memcmp, strlen, etc. // // This could recognize common matrix multiplies and dot product idioms and // replace them with calls to BLAS (if linked in??). // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/InstructionCost.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include <algorithm> #include <cassert> #include <cstdint> #include <utility> #include <vector> usingnamespacellvm; #define DEBUG_TYPE … STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores"); STATISTIC( NumShiftUntilBitTest, "Number of uncountable loops recognized as 'shift until bitttest' idiom"); STATISTIC(NumShiftUntilZero, "Number of uncountable loops recognized as 'shift until zero' idiom"); bool DisableLIRP::All; static cl::opt<bool, true> DisableLIRPAll("disable-" DEBUG_TYPE "-all", cl::desc("Options to disable Loop Idiom Recognize Pass."), cl::location(DisableLIRP::All), cl::init(false), cl::ReallyHidden); bool DisableLIRP::Memset; static cl::opt<bool, true> DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden); bool DisableLIRP::Memcpy; static cl::opt<bool, true> DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden); static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden); namespace { class LoopIdiomRecognize { … }; } // end anonymous namespace PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { … } static void deleteDeadInstruction(Instruction *I) { … } //===----------------------------------------------------------------------===// // // Implementation of LoopIdiomRecognize // //===----------------------------------------------------------------------===// bool LoopIdiomRecognize::runOnLoop(Loop *L) { … } bool LoopIdiomRecognize::runOnCountableLoop() { … } static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) { … } /// getMemSetPatternValue - If a strided store of the specified value is safe to /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should /// be passed in. Otherwise, return null. /// /// Note that we don't ever attempt to use memset_pattern8 or 4, because these /// just replicate their input array and then pass on to memset_pattern16. static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { … } LoopIdiomRecognize::LegalStoreKind LoopIdiomRecognize::isLegalStore(StoreInst *SI) { … } void LoopIdiomRecognize::collectStores(BasicBlock *BB) { … } /// runOnLoopBlock - Process the specified block, which lives in a counted loop /// with the specified backedge count. This block is known to be in the current /// loop and not in any subloops. bool LoopIdiomRecognize::runOnLoopBlock( BasicBlock *BB, const SCEV *BECount, SmallVectorImpl<BasicBlock *> &ExitBlocks) { … } /// See if this store(s) can be promoted to a memset. bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount, ForMemset For) { … } /// processLoopMemIntrinsic - Template function for calling different processor /// functions based on mem intrinsic type. template <typename MemInst> bool LoopIdiomRecognize::processLoopMemIntrinsic( BasicBlock *BB, bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *), const SCEV *BECount) { … } /// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount) { … } /// processLoopMemSet - See if this memset can be promoted to a large memset. bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { … } /// mayLoopAccessLocation - Return true if the specified loop might access the /// specified pointer location, which is a loop-strided access. The 'Access' /// argument specifies what the verboten forms of access are (read or write). static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, const SCEV *StoreSizeSCEV, AliasAnalysis &AA, SmallPtrSetImpl<Instruction *> &IgnoredInsts) { … } // If we have a negative stride, Start refers to the end of the memory location // we're trying to memset. Therefore, we need to recompute the base pointer, // which is just Start - BECount*Size. static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE) { … } /// Compute the number of bytes as a SCEV from the backedge taken count. /// /// This also maps the SCEV into the provided type and tries to handle the /// computation in a way that will fold cleanly. static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE) { … } /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. bool LoopIdiomRecognize::processLoopStridedStore( Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) { … } /// If the stored value is a strided load in the same loop with the same stride /// this may be transformable into a memcpy. This kicks in for stuff like /// for (i) A[i] = B[i]; bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount) { … } namespace { class MemmoveVerifier { … }; } // namespace bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad, const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv, const SCEV *BECount) { … } // When compiling for codesize we avoid idiom recognition for a multi-block loop // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop. // bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, bool IsLoopMemset) { … } bool LoopIdiomRecognize::runOnNoncountableLoop() { … } /// Check if the given conditional branch is based on the comparison between /// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is /// true), the control yields to the loop entry. If the branch matches the /// behavior, the variable involved in the comparison is returned. This function /// will be called to see if the precondition and postcondition of the loop are /// in desirable form. static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero = false) { … } /// Check if the given conditional branch is based on an unsigned less-than /// comparison between a variable and a constant, and if the comparison is false /// the control yields to the loop entry. If the branch matches the behaviour, /// the variable involved in the comparison is returned. static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry, APInt &Threshold) { … } // Check if the recurrence variable `VarX` is in the right form to create // the idiom. Returns the value coerced to a PHINode if so. static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry) { … } /// Return true if the idiom is detected in the loop. /// /// Additionally: /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ) /// or nullptr if there is no such. /// 2) \p CntPhi is set to the corresponding phi node /// or nullptr if there is no such. /// 3) \p InitX is set to the value whose CTLZ could be used. /// 4) \p DefX is set to the instruction calculating Loop exit condition. /// 5) \p Threshold is set to the constant involved in the unsigned less-than /// comparison. /// /// The core idiom we are trying to detect is: /// \code /// if (x0 < 2) /// goto loop-exit // the precondition of the loop /// cnt0 = init-val /// do { /// x = phi (x0, x.next); //PhiX /// cnt = phi (cnt0, cnt.next) /// /// cnt.next = cnt + 1; /// ... /// x.next = x >> 1; // DefX /// } while (x >= 4) /// loop-exit: /// \endcode static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX, APInt &Threshold) { … } /// Return true iff the idiom is detected in the loop. /// /// Additionally: /// 1) \p CntInst is set to the instruction counting the population bit. /// 2) \p CntPhi is set to the corresponding phi node. /// 3) \p Var is set to the value whose population bits are being counted. /// /// The core idiom we are trying to detect is: /// \code /// if (x0 != 0) /// goto loop-exit // the precondition of the loop /// cnt0 = init-val; /// do { /// x1 = phi (x0, x2); /// cnt1 = phi(cnt0, cnt2); /// /// cnt2 = cnt1 + 1; /// ... /// x2 = x1 & (x1 - 1); /// ... /// } while(x != 0); /// /// loop-exit: /// \endcode static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) { … } /// Return true if the idiom is detected in the loop. /// /// Additionally: /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ) /// or nullptr if there is no such. /// 2) \p CntPhi is set to the corresponding phi node /// or nullptr if there is no such. /// 3) \p Var is set to the value whose CTLZ could be used. /// 4) \p DefX is set to the instruction calculating Loop exit condition. /// /// The core idiom we are trying to detect is: /// \code /// if (x0 == 0) /// goto loop-exit // the precondition of the loop /// cnt0 = init-val; /// do { /// x = phi (x0, x.next); //PhiX /// cnt = phi(cnt0, cnt.next); /// /// cnt.next = cnt + 1; /// ... /// x.next = x >> 1; // DefX /// ... /// } while(x.next != 0); /// /// loop-exit: /// \endcode static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX) { … } // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always // profitable if we delete the loop. bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX, bool ZeroCheck, size_t CanonicalSize) { … } /// Convert CTLZ / CTTZ idiom loop into countable loop. /// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise, /// returns false. bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX, Instruction *DefX, PHINode *CntPhi, Instruction *CntInst) { … } /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop /// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new /// trip count returns true; otherwise, returns false. bool LoopIdiomRecognize::recognizeAndInsertFFS() { … } bool LoopIdiomRecognize::recognizeShiftUntilLessThan() { … } /// Recognizes a population count idiom in a non-countable loop. /// /// If detected, transforms the relevant code to issue the popcount intrinsic /// function call, and returns true; otherwise, returns false. bool LoopIdiomRecognize::recognizePopcount() { … } static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL) { … } static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID) { … } /// Transform the following loop (Using CTLZ, CTTZ is similar): /// loop: /// CntPhi = PHI [Cnt0, CntInst] /// PhiX = PHI [InitX, DefX] /// CntInst = CntPhi + 1 /// DefX = PhiX >> 1 /// LOOP_BODY /// Br: loop if (DefX != 0) /// Use(CntPhi) or Use(CntInst) /// /// Into: /// If CntPhi used outside the loop: /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) /// Count = CountPrev + 1 /// else /// Count = BitWidth(InitX) - CTLZ(InitX) /// loop: /// CntPhi = PHI [Cnt0, CntInst] /// PhiX = PHI [InitX, DefX] /// PhiCount = PHI [Count, Dec] /// CntInst = CntPhi + 1 /// DefX = PhiX >> 1 /// Dec = PhiCount - 1 /// LOOP_BODY /// Br: loop if (Dec != 0) /// Use(CountPrev + Cnt0) // Use(CntPhi) /// or /// Use(Count + Cnt0) // Use(CntInst) /// /// If LOOP_BODY is empty the loop will be deleted. /// If CntInst and DefX are not used in LOOP_BODY they will be removed. void LoopIdiomRecognize::transformLoopToCountable( Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) { … } void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var) { … } /// Match loop-invariant value. template <typename SubPattern_t> struct match_LoopInvariant { … }; /// Matches if the value is loop-invariant. template <typename Ty> inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) { … } /// Return true if the idiom is detected in the loop. /// /// The core idiom we are trying to detect is: /// \code /// entry: /// <...> /// %bitmask = shl i32 1, %bitpos /// br label %loop /// /// loop: /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] /// %x.curr.bitmasked = and i32 %x.curr, %bitmask /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 /// %x.next = shl i32 %x.curr, 1 /// <...> /// br i1 %x.curr.isbitunset, label %loop, label %end /// /// end: /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> /// %x.next.res = phi i32 [ %x.next, %loop ] <...> /// <...> /// \endcode static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX) { … } /// Look for the following loop: /// \code /// entry: /// <...> /// %bitmask = shl i32 1, %bitpos /// br label %loop /// /// loop: /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] /// %x.curr.bitmasked = and i32 %x.curr, %bitmask /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 /// %x.next = shl i32 %x.curr, 1 /// <...> /// br i1 %x.curr.isbitunset, label %loop, label %end /// /// end: /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> /// %x.next.res = phi i32 [ %x.next, %loop ] <...> /// <...> /// \endcode /// /// And transform it into: /// \code /// entry: /// %bitmask = shl i32 1, %bitpos /// %lowbitmask = add i32 %bitmask, -1 /// %mask = or i32 %lowbitmask, %bitmask /// %x.masked = and i32 %x, %mask /// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked, /// i1 true) /// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros /// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1 /// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos /// %tripcount = add i32 %backedgetakencount, 1 /// %x.curr = shl i32 %x, %backedgetakencount /// %x.next = shl i32 %x, %tripcount /// br label %loop /// /// loop: /// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ] /// %loop.iv.next = add nuw i32 %loop.iv, 1 /// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount /// <...> /// br i1 %loop.ivcheck, label %end, label %loop /// /// end: /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> /// %x.next.res = phi i32 [ %x.next, %loop ] <...> /// <...> /// \endcode bool LoopIdiomRecognize::recognizeShiftUntilBitTest() { … } /// Return true if the idiom is detected in the loop. /// /// The core idiom we are trying to detect is: /// \code /// entry: /// <...> /// %start = <...> /// %extraoffset = <...> /// <...> /// br label %for.cond /// /// loop: /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ] /// %nbits = add nsw i8 %iv, %extraoffset /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0 /// %iv.next = add i8 %iv, 1 /// <...> /// br i1 %val.shifted.iszero, label %end, label %loop /// /// end: /// %iv.res = phi i8 [ %iv, %loop ] <...> /// %nbits.res = phi i8 [ %nbits, %loop ] <...> /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...> /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...> /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...> /// <...> /// \endcode static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE, Instruction *&ValShiftedIsZero, Intrinsic::ID &IntrinID, Instruction *&IV, Value *&Start, Value *&Val, const SCEV *&ExtraOffsetExpr, bool &InvertedCond) { … } /// Look for the following loop: /// \code /// entry: /// <...> /// %start = <...> /// %extraoffset = <...> /// <...> /// br label %for.cond /// /// loop: /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ] /// %nbits = add nsw i8 %iv, %extraoffset /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0 /// %iv.next = add i8 %iv, 1 /// <...> /// br i1 %val.shifted.iszero, label %end, label %loop /// /// end: /// %iv.res = phi i8 [ %iv, %loop ] <...> /// %nbits.res = phi i8 [ %nbits, %loop ] <...> /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...> /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...> /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...> /// <...> /// \endcode /// /// And transform it into: /// \code /// entry: /// <...> /// %start = <...> /// %extraoffset = <...> /// <...> /// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0) /// %val.numactivebits = sub i8 8, %val.numleadingzeros /// %extraoffset.neg = sub i8 0, %extraoffset /// %tmp = add i8 %val.numactivebits, %extraoffset.neg /// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start) /// %loop.tripcount = sub i8 %iv.final, %start /// br label %loop /// /// loop: /// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ] /// %loop.iv.next = add i8 %loop.iv, 1 /// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount /// %iv = add i8 %loop.iv, %start /// <...> /// br i1 %loop.ivcheck, label %end, label %loop /// /// end: /// %iv.res = phi i8 [ %iv.final, %loop ] <...> /// <...> /// \endcode bool LoopIdiomRecognize::recognizeShiftUntilZero() { … }