//===- AggressiveInstCombine.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the aggressive expression pattern combiner classes.
// Currently, it handles expression patterns for:
//  * Truncate instruction
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
#include "AggressiveInstCombineInternal.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE …

STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
STATISTIC(NumGuardedRotates,
          "Number of guarded rotates transformed into funnel shifts");
STATISTIC(NumGuardedFunnelShifts,
          "Number of guarded funnel shifts transformed into funnel shifts");
STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");

static cl::opt<unsigned> MaxInstrsToScan(
    "aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden,
    cl::desc("Max number of instructions to scan for aggressive instcombine."));

static cl::opt<unsigned> StrNCmpInlineThreshold(
    "strncmp-inline-threshold", cl::init(3), cl::Hidden,
    cl::desc("The maximum length of a constant string for a builtin string cmp "
             "call eligible for inlining. The default value is 3."));

static cl::opt<unsigned>
    MemChrInlineThreshold("memchr-inline-threshold", cl::init(3), cl::Hidden,
                          cl::desc("The maximum length of a constant string to "
                                   "inline a memchr call."));

/// Match a pattern for a bitwise funnel/rotate operation that partially guards
/// against undefined behavior by branching around the funnel-shift/rotation
/// when the shift amount is 0.
static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { … }

/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
/// the bit indexes (Mask) needed by a masked compare. If we're matching a
/// chain of 'and' ops, then we also need to capture the fact that we saw an
/// "and X, 1", so that's an extra return value for that case.
struct MaskOps { … };

/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
/// chain of 'and' or 'or' instructions looking for shift ops of a common
/// source value. Examples:
///   or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
/// returns { X, 0x129 }
///   and (and (X >> 1), 1), (X >> 4)
/// returns { X, 0x12 }
static bool matchAndOrChain(Value *V, MaskOps &MOps) { … }
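// An illustrative (hypothetical) source-level shape that produces the chains
// matchAndOrChain walks; the mask values follow the examples above:
//
//   bool any_set(unsigned x) { return ((x | (x >> 3) | (x >> 5)) & 1) != 0; }
//
// After earlier simplifications this is a chain of 'or'-ed shifts of a single
// value masked with 1, which foldAnyOrAllBitsSet (below) turns into one
// mask-and-compare: (x & 0x29) != 0.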
/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
/// These will include a chain of 'or' or 'and'-shifted bits from a
/// common source value:
/// and (or  (lshr X, C), ...), 1 --> (X & CMask) != 0
/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
/// that differ only with a final 'not' of the result. We expect that final
/// 'not' to be folded with the compare that we create here (invert predicate).
static bool foldAnyOrAllBitsSet(Instruction &I) { … }

// Try to recognize the function below as a popcount intrinsic.
// This is the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
// Also used in TargetLowering::expandCTPOP().
//
// int popcount(unsigned int i) {
//   i = i - ((i >> 1) & 0x55555555);
//   i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
//   i = ((i + (i >> 4)) & 0x0F0F0F0F);
//   return (i * 0x01010101) >> 24;
// }
static bool tryToRecognizePopCount(Instruction &I) { … }

/// Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), provided C1 and
/// C2 saturate the value of the fp conversion. The transform is not reversible
/// as the fptosi.sat is more defined than the input - all values produce a
/// valid result for the fptosi.sat, whereas inputs that were out of range of
/// the integer conversion produce poison for the original. The reversed
/// pattern may use fmax and fmin instead. As we cannot directly reverse the
/// transform, and it is not always profitable, we make it conditional on the
/// cost being reported as lower by TTI.
static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) { … }

/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
/// pessimistic codegen that has to account for setting errno and can enable
/// vectorization.
static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
                     TargetLibraryInfo &TLI, AssumptionCache &AC,
                     DominatorTree &DT) { … }

// Check if this array of constants represents a cttz table.
// Iterate over the elements from \p Table by trying to find/match all
// the numbers from 0 to \p InputBits that should represent cttz results.
static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
                        uint64_t Shift, uint64_t InputBits) { … }
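// A rough sketch (for illustration only; see the implementation for the exact
// bounds and the handling of the x == 0 element) of the property isCTTZTable
// verifies: every bit position must map, via the multiply-and-shift indexing
// used by the de Bruijn-style implementations described below, to a table
// entry holding that position.
//
//   for (uint64_t Bit = 0; Bit < InputBits; ++Bit)
//     if (Table[((1ULL << Bit) * Mul) >> Shift] != Bit) // product truncated
//       return false;                                   // to the input width
//   return true;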
// Try to recognize a table-based ctz implementation.
// E.g., an example in C (for more cases please see the llvm/tests):
//   int f(unsigned x) {
//     static const char table[32] =
//       {0, 1, 28, 2, 29, 14, 24, 3, 30,
//        22, 20, 15, 25, 17, 4, 8, 31, 27,
//        13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
//     return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
//   }
// this can be lowered to the `cttz` instruction.
// There is also a special case when the element is 0.
//
// Here are some examples of LLVM IR for a 64-bit target:
//
// CASE 1:
// %sub = sub i32 0, %x
// %and = and i32 %sub, %x
// %mul = mul i32 %and, 125613361
// %shr = lshr i32 %mul, 27
// %idxprom = zext i32 %shr to i64
// %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @ctz1.table, i64 0,
//     i64 %idxprom
// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
//
// CASE 2:
// %sub = sub i32 0, %x
// %and = and i32 %sub, %x
// %mul = mul i32 %and, 72416175
// %shr = lshr i32 %mul, 26
// %idxprom = zext i32 %shr to i64
// %arrayidx = getelementptr inbounds [64 x i16], [64 x i16]* @ctz2.table,
//     i64 0, i64 %idxprom
// %0 = load i16, i16* %arrayidx, align 2, !tbaa !8
//
// CASE 3:
// %sub = sub i32 0, %x
// %and = and i32 %sub, %x
// %mul = mul i32 %and, 81224991
// %shr = lshr i32 %mul, 27
// %idxprom = zext i32 %shr to i64
// %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @ctz3.table,
//     i64 0, i64 %idxprom
// %0 = load i32, i32* %arrayidx, align 4, !tbaa !8
//
// CASE 4:
// %sub = sub i64 0, %x
// %and = and i64 %sub, %x
// %mul = mul i64 %and, 283881067100198605
// %shr = lshr i64 %mul, 58
// %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @table, i64 0,
//     i64 %shr
// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
//
// All this can be lowered to @llvm.cttz.i32/64 intrinsic.
static bool tryToRecognizeTableBasedCttz(Instruction &I) { … }

/// This is used by foldLoadsRecursive() to capture a Root Load node which is
/// of type or(load, load) and recursively build the wide load. Also capture
/// the shift amount, zero extend type and loadSize.
struct LoadOps { … };

// Identify and merge consecutive loads recursively, matching patterns of the
// form
// (ZExt(L1) << shift1) | (ZExt(L2) << shift2) -> ZExt(L3) << shift1
// (ZExt(L1) << shift1) | ZExt(L2) -> ZExt(L3)
static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
                               AliasAnalysis &AA) { … }

// For a given BB instruction, evaluate all loads in the chain that form a
// pattern which suggests that the loads can be combined. The one and only use
// of the loads is to form a wider load.
static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
                                 TargetTransformInfo &TTI, AliasAnalysis &AA,
                                 const DominatorTree &DT) { … }

// Calculate the GEP Stride and the accumulated constant ModOffset. Return
// Stride and ModOffset.
static std::pair<APInt, APInt>
getStrideAndModOffsetOfGEP(Value *PtrOp, const DataLayout &DL) { … }

/// If C is a constant patterned array and all valid loaded results for the
/// given alignment are equal to the same constant, return that constant.
static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) { … }
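// An illustrative (hypothetical) case for foldPatternedLoads: with
//
//   @g = internal constant [4 x i32] [i32 7, i32 7, i32 7, i32 7]
//   %p = getelementptr inbounds [4 x i32], ptr @g, i64 0, i64 %i
//   %v = load i32, ptr %p, align 4
//
// every in-bounds, 4-byte-aligned load reads the value 7, so %v can be
// replaced by the constant 7 even though the index %i is unknown.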
namespace {
class StrNCmpInliner { … };
} // namespace

/// First we normalize calls to strncmp/strcmp to the form of
/// compare(s1, s2, N), which means comparing first N bytes of s1 and s2
/// (without considering '\0').
///
/// Examples:
///
/// \code
/// strncmp(s, "a", 3) -> compare(s, "a", 2)
/// strncmp(s, "abc", 3) -> compare(s, "abc", 3)
/// strncmp(s, "a\0b", 3) -> compare(s, "a\0b", 2)
/// strcmp(s, "a") -> compare(s, "a", 2)
///
/// char s2[] = {'a'}
/// strncmp(s, s2, 3) -> compare(s, s2, 3)
///
/// char s2[] = {'a', 'b', 'c', 'd'}
/// strncmp(s, s2, 3) -> compare(s, s2, 3)
/// \endcode
///
/// We only handle cases where N and exactly one of s1 and s2 are constant.
/// Cases where both s1 and s2 are constant are already handled by the
/// instcombine pass.
///
/// We do not handle cases where N > StrNCmpInlineThreshold.
///
/// We also do not handle cases where N < 2, which are already
/// handled by the instcombine pass.
///
bool StrNCmpInliner::optimizeStrNCmp() { … }

/// Convert
///
/// \code
/// ret = compare(s1, s2, N)
/// \endcode
///
/// into
///
/// \code
/// ret = (int)s1[0] - (int)s2[0]
/// if (ret != 0)
///   goto NE
/// ...
/// ret = (int)s1[N-2] - (int)s2[N-2]
/// if (ret != 0)
///   goto NE
/// ret = (int)s1[N-1] - (int)s2[N-1]
/// NE:
/// \endcode
///
/// CFG before and after the transformation:
///
/// (before)
/// BBCI
///
/// (after)
/// BBCI -> BBSubs[0] (sub,icmp) --NE-> BBNE -> BBTail
///                 |                    ^
///                 E                    |
///                 |                    |
///        BBSubs[1] (sub,icmp) --NE-----+
///                ...                   |
///        BBSubs[N-1] (sub) ---------+
///
void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N,
                                   bool Swapped) { … }

/// Convert memchr with a small constant string into a switch.
static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU,
                       const DataLayout &DL) { … }

static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
                         TargetLibraryInfo &TLI, AssumptionCache &AC,
                         DominatorTree &DT, const DataLayout &DL,
                         bool &MadeCFGChange) { … }

/// This is the entry point for folds that could be implemented in regular
/// InstCombine, but they are separated because they are not expected to
/// occur frequently and/or have more than a constant-length pattern match.
static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
                                TargetTransformInfo &TTI,
                                TargetLibraryInfo &TLI, AliasAnalysis &AA,
                                AssumptionCache &AC, bool &MadeCFGChange) { … }

/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
static bool runImpl(Function &F, AssumptionCache &AC, TargetTransformInfo &TTI,
                    TargetLibraryInfo &TLI, DominatorTree &DT,
                    AliasAnalysis &AA, bool &MadeCFGChange) { … }

PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
                                                 FunctionAnalysisManager &AM) { … }
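// For reference, a minimal (hypothetical) way to schedule this pass when
// assembling a custom new-pass-manager pipeline; the standard optimization
// pipelines already include it:
//
//   FunctionPassManager FPM;
//   FPM.addPass(AggressiveInstCombinePass());
//
// The MadeCFGChange flag threaded through runImpl lets run() report a more
// precise PreservedAnalyses set when only non-CFG transforms fired.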