//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
#include <queue>

#define DEBUG_TYPE …
#include "llvm/Transforms/Utils/InstructionWorklist.h"

using namespace llvm;
using namespace llvm::PatternMatch;

STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarBO, "Number of scalar binops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");

static cl::opt<bool> DisableVectorCombine(
    "disable-vector-combine", cl::init(false), cl::Hidden,
    cl::desc("Disable all vector combine transforms"));

static cl::opt<bool> DisableBinopExtractShuffle(
    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
    cl::desc("Disable binop extract to shuffle transforms"));

static cl::opt<unsigned> MaxInstrsToScan(
    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
    cl::desc("Max number of instructions to scan for vector combining."));

static const unsigned InvalidIndex = …;

namespace {
class VectorCombine { … };
} // namespace

/// Return the source operand of a potentially bitcasted value. If there is no
/// bitcast, return the input value itself.
static Value *peekThroughBitcasts(Value *V) { … }

static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) { … }

bool VectorCombine::vectorizeLoadInsert(Instruction &I) { … }

/// If we are loading a vector and then inserting it into a larger vector with
/// undefined elements, try to load the larger vector and eliminate the insert.
/// This removes a shuffle in IR and may allow combining of other loaded values.
bool VectorCombine::widenSubvectorLoad(Instruction &I) { … }

/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
    ExtractElementInst *Ext0, ExtractElementInst *Ext1,
    unsigned PreferredExtractIndex = InvalidIndex) const { … }
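// Illustrative sketch (hypothetical IR, not taken from this file): the kind of
// rewrite the extract/extract combining below (isExtractExtractCheap through
// foldExtExtBinop) aims for, assuming the cost model decides that one vector
// binop plus a single extract is cheaper than two extracts plus a scalar
// binop:
//
//   %e0 = extractelement <4 x i32> %v0, i32 1
//   %e1 = extractelement <4 x i32> %v1, i32 1
//   %r  = add i32 %e0, %e1
// ==>
//   %vadd = add <4 x i32> %v0, %v1
//   %r    = extractelement <4 x i32> %vadd, i32 1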
/// Compare the relative costs of 2 extracts followed by scalar operation vs.
/// vector operation(s) followed by extract. Return true if the existing
/// instructions are cheaper than a vector alternative. Otherwise, return false
/// and if one of the extracts should be transformed to a shufflevector, set
/// \p ConvertToShuffle to that extract instruction.
bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                          ExtractElementInst *Ext1,
                                          const Instruction &I,
                                          ExtractElementInst *&ConvertToShuffle,
                                          unsigned PreferredExtractIndex) { … }

/// Create a shuffle that translates (shifts) 1 element from the input vector
/// to a new element location.
static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
                                 unsigned NewIndex, IRBuilder<> &Builder) { … }

/// Given an extract element instruction with constant index operand, shuffle
/// the source vector (shift the scalar element) to a NewIndex for extraction.
/// Return null if the input can be constant folded, so that we are not
/// creating unnecessary instructions.
static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
                                            unsigned NewIndex,
                                            IRBuilder<> &Builder) { … }

/// Try to reduce extract element costs by converting scalar compares to vector
/// compares followed by extract.
/// cmp (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
                                  ExtractElementInst *Ext1, Instruction &I) { … }

/// Try to reduce extract element costs by converting scalar binops to vector
/// binops followed by extract.
/// bo (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
                                    ExtractElementInst *Ext1, Instruction &I) { … }

/// Match an instruction with extracted vector operands.
bool VectorCombine::foldExtractExtract(Instruction &I) { … }

/// Try to replace an extract + scalar fneg + insert with a vector fneg +
/// shuffle.
bool VectorCombine::foldInsExtFNeg(Instruction &I) { … }

/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
bool VectorCombine::foldBitcastShuffle(Instruction &I) { … }

/// VP Intrinsics whose vector operands are both splat values may be simplified
/// into the scalar version of the operation and the result splatted. This
/// can lead to scalarization down the line.
bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { … }

/// Match a vector binop or compare instruction with at least one inserted
/// scalar operand and convert to scalar binop/cmp followed by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { … }

/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
/// a vector into vector operations followed by extract. Note: The SLP pass
/// may miss this pattern because of implementation problems.
bool VectorCombine::foldExtractedCmps(Instruction &I) { … }

// Check if memory loc modified between two instrs in the same BB
static bool isMemModifiedBetween(BasicBlock::iterator Begin,
                                 BasicBlock::iterator End,
                                 const MemoryLocation &Loc, AAResults &AA) { … }

namespace {
/// Helper class to indicate whether a vector index can be safely scalarized
/// and if a freeze needs to be inserted.
class ScalarizationResult { … };
} // namespace
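// Illustrative sketch (hypothetical IR, not taken from this pass's tests): the
// kind of scalarization scalarizeLoadExtract below performs, assuming the
// index is known to be in bounds, no intervening instruction may write to %p,
// and the cost model agrees:
//
//   %v = load <4 x i32>, ptr %p
//   %e = extractelement <4 x i32> %v, i64 2
// ==>
//   %pe = getelementptr inbounds i32, ptr %p, i64 2
//   %e  = load i32, ptr %pe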
/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
                                              Instruction *CtxI,
                                              AssumptionCache &AC,
                                              const DominatorTree &DT) { … }

/// The memory operation on a vector of \p ScalarType had alignment of
/// \p VectorAlignment. Compute the maximal, but conservatively correct,
/// alignment that will be valid for the memory operation on a single scalar
/// element of the same type with index \p Idx.
static Align computeAlignmentAfterScalarization(Align VectorAlignment,
                                                Type *ScalarType, Value *Idx,
                                                const DataLayout &DL) { … }

// Combine patterns like:
//   %0 = load <4 x i32>, <4 x i32>* %a
//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
//   store <4 x i32> %1, <4 x i32>* %a
// to:
//   %0 = bitcast <4 x i32>* %a to i32*
//   %1 = getelementptr inbounds i32, i32* %0, i64 1
//   store i32 %b, i32* %1
bool VectorCombine::foldSingleElementStore(Instruction &I) { … }

/// Try to scalarize vector loads feeding extractelement instructions.
bool VectorCombine::scalarizeLoadExtract(Instruction &I) { … }

/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfBinops(Instruction &I) { … }

/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
/// into "castop (shuffle)".
bool VectorCombine::foldShuffleOfCastops(Instruction &I) { … }

/// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)"
/// into "shuffle x, y".
bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { … }

/// Try to convert
/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) { … }

using InstLane = std::pair<Use *, int>;

static InstLane lookThroughShuffles(Use *U, int Lane) { … }

static SmallVector<InstLane>
generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) { … }

/// Detect concat of multiple values into a vector
static bool isFreeConcat(ArrayRef<InstLane> Item,
                         const TargetTransformInfo &TTI) { … }

static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
                                  const SmallPtrSet<Use *, 4> &IdentityLeafs,
                                  const SmallPtrSet<Use *, 4> &SplatLeafs,
                                  const SmallPtrSet<Use *, 4> &ConcatLeafs,
                                  IRBuilder<> &Builder) { … }

// Starting from a shuffle, look up through operands tracking the shuffled index
// of each lane. If we can simplify away the shuffles to identities then
// do so.
bool VectorCombine::foldShuffleToIdentity(Instruction &I) { … }

/// Given a commutative reduction, the order of the input lanes does not alter
/// the results. We can use this to remove certain shuffles feeding the
/// reduction, removing the need to shuffle at all.
bool VectorCombine::foldShuffleFromReductions(Instruction &I) { … }

/// Determine if it's more efficient to fold:
/// reduce(trunc(x)) -> trunc(reduce(x)).
/// reduce(sext(x)) -> sext(reduce(x)).
/// reduce(zext(x)) -> zext(reduce(x)).
bool VectorCombine::foldCastFromReductions(Instruction &I) { … }
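// Illustrative sketch (hypothetical IR): one instance of the
// reduce(trunc(x)) -> trunc(reduce(x)) fold above. For an integer add
// reduction the truncation and the reduction commute, so choosing between the
// two forms is purely a cost-model question:
//
//   %t = trunc <8 x i32> %x to <8 x i16>
//   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t)
// ==>
//   %w = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
//   %r = trunc i32 %w to i16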
/// This method looks for groups of shuffles acting on binops, of the form:
///  %x = shuffle ...
///  %y = shuffle ...
///  %a = binop %x, %y
///  %b = binop %x, %y
/// shuffle %a, %b, selectmask
/// We may, especially if the shuffle is wider than legal, be able to convert
/// the shuffle to a form where only parts of a and b need to be computed. On
/// architectures with no obvious "select" shuffle, this can reduce the total
/// number of operations if the target reports them as cheaper.
bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { … }

/// Check if instruction depends on ZExt and this ZExt can be moved after the
/// instruction. Move ZExt if it is profitable. For example:
///     logic(zext(x),y) -> zext(logic(x,trunc(y)))
///     lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
/// Cost model calculations take into account if zext(x) has other users and
/// whether it can be propagated through them too.
bool VectorCombine::shrinkType(llvm::Instruction &I) { … }

/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() { … }

PreservedAnalyses VectorCombinePass::run(Function &F,
                                         FunctionAnalysisManager &FAM) { … }
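// The transforms above can be exercised in isolation with the new pass
// manager, e.g. "opt -passes=vector-combine -S", and switched off wholesale
// with the -disable-vector-combine flag declared near the top of this file.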