#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
usingnamespacellvm;
#define LV_NAME …
#define DEBUG_TYPE …
#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif
const char LLVMLoopVectorizeFollowupAll[] = …;
const char LLVMLoopVectorizeFollowupVectorized[] = …;
const char LLVMLoopVectorizeFollowupEpilogue[] = …;
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
cl::desc("Enable vectorization of epilogue loops."));
static cl::opt<unsigned> EpilogueVectorizationForceVF(
"epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
cl::desc("When epilogue vectorization is enabled, and a value greater than "
"1 is specified, forces the given VF for all applicable epilogue "
"loops."));
static cl::opt<unsigned> EpilogueVectorizationMinVF(
"epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
cl::desc("Only loops with vectorization factor equal to or larger than "
"the specified value are considered for epilogue vectorization."));
static cl::opt<unsigned> TinyTripCountVectorThreshold(
"vectorizer-min-trip-count", cl::init(16), cl::Hidden,
cl::desc("Loops with a constant trip count that is smaller than this "
"value are vectorized only if no scalar iteration overheads "
"are incurred."));
static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));
namespace PreferPredicateTy {
enum Option { … };
}
static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
"prefer-predicate-over-epilogue",
cl::init(PreferPredicateTy::ScalarEpilogue),
cl::Hidden,
cl::desc("Tail-folding and predication preferences over creating a scalar "
"epilogue loop."),
cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
"scalar-epilogue",
"Don't tail-predicate loops, create scalar epilogue"),
clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
"predicate-else-scalar-epilogue",
"prefer tail-folding, create scalar epilogue if tail "
"folding fails."),
clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
"predicate-dont-vectorize",
"prefers tail-folding, don't attempt vectorization if "
"tail-folding fails.")));
static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
"force-tail-folding-style", cl::desc("Force the tail folding style"),
cl::init(TailFoldingStyle::None),
cl::values(
clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
clEnumValN(
TailFoldingStyle::Data, "data",
"Create lane mask for data only, using active.lane.mask intrinsic"),
clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
"data-without-lane-mask",
"Create lane mask with compare/stepvector"),
clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
"Create lane mask using active.lane.mask intrinsic, and use "
"it for both data and control flow"),
clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
"data-and-control-without-rt-check",
"Similar to data-and-control, but remove the runtime check"),
clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
"Use predicated EVL instructions for tail folding. If EVL "
"is unsupported, fallback to data-without-lane-mask.")));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
"will be determined by the smallest type in loop."));
static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
static cl::opt<unsigned> ForceTargetNumScalarRegs(
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of scalar registers."));
static cl::opt<unsigned> ForceTargetNumVectorRegs(
"force-target-num-vector-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of vector registers."));
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
"force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's max interleave factor for "
"scalar loops."));
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
"force-target-max-vector-interleave", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's max interleave factor for "
"vectorized loops."));
cl::opt<unsigned> ForceTargetInstructionCost(
"force-target-instruction-cost", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's expected cost for "
"an instruction to a single constant value. Mostly "
"useful for getting consistent testing."));
static cl::opt<bool> ForceTargetSupportsScalableVectors(
"force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
cl::desc(
"Pretend that scalable vectors are supported, even if the target does "
"not support them. This flag should only be used for testing."));
static cl::opt<unsigned> SmallLoopCost(
"small-loop-cost", cl::init(20), cl::Hidden,
cl::desc(
"The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
"loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
cl::desc("Enable the use of the block frequency analysis to access PGO "
"heuristics minimizing code growth in cold regions and being more "
"aggressive in hot regions."));
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
"enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
cl::desc(
"Enable runtime interleaving until load/store ports are saturated"));
static cl::opt<unsigned> NumberOfStoresToPredicate(
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
cl::desc("Max number of stores to be predicated behind an if."));
static cl::opt<bool> EnableIndVarRegisterHeur(
"enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
cl::desc("Count the induction variable only once when interleaving"));
static cl::opt<bool> EnableCondStoresVectorization(
"enable-cond-stores-vec", cl::init(true), cl::Hidden,
cl::desc("Enable if predication of stores during vectorization."));
static cl::opt<unsigned> MaxNestedScalarReductionIC(
"max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
static cl::opt<bool>
PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
cl::Hidden,
cl::desc("Prefer in-loop vector reductions, "
"overriding the targets preference."));
static cl::opt<bool> ForceOrderedReductions(
"force-ordered-reductions", cl::init(false), cl::Hidden,
cl::desc("Enable the vectorisation of loops with in-order (strict) "
"FP reductions"));
static cl::opt<bool> PreferPredicatedReductionSelect(
"prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));
namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
"enable-vplan-native-path", cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
"support for outer loop vectorization."));
}
static cl::opt<bool> VPlanBuildStressTest(
"vplan-build-stress-test", cl::init(false), cl::Hidden,
cl::desc(
"Build VPlan for every supported loop nest in the function and bail "
"out right after the build (stress test the VPlan H-CFG construction "
"in the VPlan-native vectorization path)."));
cl::opt<bool> llvm::EnableLoopInterleaving(
"interleave-loops", cl::init(true), cl::Hidden,
cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
"vectorize-loops", cl::init(true), cl::Hidden,
cl::desc("Run the Loop vectorization passes"));
static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
"force-widen-divrem-via-safe-divisor", cl::Hidden,
cl::desc(
"Override cost based safe divisor widening for div/rem instructions"));
static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
"vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
cl::Hidden,
cl::desc("Try wider VFs if they enable the use of vector variants"));
static constexpr uint32_t SCEVCheckBypassWeights[] = …;
static constexpr uint32_t MemCheckBypassWeights[] = …;
static constexpr uint32_t MinItersBypassWeights[] = …;
static bool hasIrregularType(Type *Ty, const DataLayout &DL) { … }
static std::optional<unsigned>
getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
bool CanUseConstantMax = true) { … }
namespace {
class GeneratedRTChecks;
SCEV2ValueTy;
}
namespace llvm {
AnalysisKey ShouldRunExtraVectorPasses::Key;
class InnerLoopVectorizer { … };
struct EpilogueLoopVectorizationInfo { … };
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { … };
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { … };
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { … };
}
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { … }
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
const StringRef DebugMsg,
Instruction *I) {
dbgs() << "LV: " << Prefix << DebugMsg;
if (I != nullptr)
dbgs() << " " << *I;
else
dbgs() << '.';
dbgs() << '\n';
}
#endif
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
Instruction *I, DebugLoc DL = { … }
namespace llvm {
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
int64_t Step) { … }
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { … }
void reportVectorizationFailure(const StringRef DebugMsg,
const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
Instruction *I) { … }
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE,
Loop *TheLoop, Instruction *I = nullptr,
DebugLoc DL = { … }
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
VectorizationFactor VF, unsigned IC) { … }
}
namespace llvm {
enum ScalarEpilogueLowering { … };
InstructionVFPair;
class LoopVectorizationCostModel { … };
}
namespace {
class GeneratedRTChecks { … };
}
static bool useActiveLaneMask(TailFoldingStyle Style) { … }
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { … }
static bool isExplicitVecOuterLoop(Loop *OuterLp,
OptimizationRemarkEmitter *ORE) { … }
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
OptimizationRemarkEmitter *ORE,
SmallVectorImpl<Loop *> &V) { … }
static Value *
emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
Value *Step,
InductionDescriptor::InductionKind InductionKind,
const BinaryOperator *InductionBinOp) { … }
std::optional<unsigned> getMaxVScale(const Function &F,
const TargetTransformInfo &TTI) { … }
static bool isIndvarOverflowCheckKnownFalse(
const LoopVectorizationCostModel *Cost,
ElementCount VF, std::optional<unsigned> UF = std::nullopt) { … }
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { … }
void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
const VPLane &Lane,
VPTransformState &State) { … }
Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { … }
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { … }
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { … }
BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { … }
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { … }
PHINode *InnerLoopVectorizer::createInductionResumeValue(
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass) { … }
static Value *getExpandedStep(const InductionDescriptor &ID,
const SCEV2ValueTy &ExpandedSCEVs) { … }
void InnerLoopVectorizer::createInductionResumeValues(
const SCEV2ValueTy &ExpandedSCEVs,
std::pair<BasicBlock *, Value *> AdditionalBypass) { … }
std::pair<BasicBlock *, Value *>
InnerLoopVectorizer::createVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) { … }
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
const InductionDescriptor &II,
Value *VectorTripCount, Value *EndValue,
BasicBlock *MiddleBlock, VPlan &Plan,
VPTransformState &State) { … }
namespace {
struct CSEDenseMapInfo { … };
}
static void cse(BasicBlock *BB) { … }
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
ElementCount VF) const { … }
static Type *maybeVectorizeType(Type *Elt, ElementCount VF) { … }
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
ElementCount VF) const { … }
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VPlan &Plan) { … }
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { … }
void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
VPTransformState &State) { … }
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { … }
bool LoopVectorizationCostModel::isScalarWithPredication(
Instruction *I, ElementCount VF) const { … }
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { … }
std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
ElementCount VF) const { … }
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
Instruction *I, ElementCount VF) const { … }
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
Instruction *I, ElementCount VF) { … }
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { … }
bool LoopVectorizationCostModel::runtimeChecksRequired() { … }
bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { … }
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { … }
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { … }
FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { … }
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
ElementCount MaxSafeVF, bool FoldTailByMasking) { … }
static std::optional<unsigned>
getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { … }
bool LoopVectorizationPlanner::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const { … }
void LoopVectorizationPlanner::emitInvalidCostRemarks(
OptimizationRemarkEmitter *ORE) { … }
static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
const TargetTransformInfo &TTI) { … }
#ifndef NDEBUG
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
assert(any_of(VPlans,
[](std::unique_ptr<VPlan> &P) {
return P->hasVF(ElementCount::getFixed(1));
}) &&
"Expected Scalar VF to be a candidate");
const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
ExpectedCost);
VectorizationFactor ChosenFactor = ScalarCost;
bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (ForceVectorization &&
(VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
ChosenFactor.Cost = InstructionCost::getMax();
}
for (auto &P : VPlans) {
for (ElementCount VF : P->vectorFactors()) {
if (VF.isScalar())
continue;
InstructionCost C = CM.expectedCost(VF);
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
unsigned AssumedMinimumVscale =
getVScaleForTuning(OrigLoop, TTI).value_or(1);
unsigned Width =
Candidate.Width.isScalable()
? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
: Candidate.Width.getFixedValue();
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
<< " costs: " << (Candidate.Cost / Width));
if (VF.isScalable())
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
<< AssumedMinimumVscale << ")");
LLVM_DEBUG(dbgs() << ".\n");
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF
<< " because it will not generate any vector instructions.\n");
continue;
}
if (isMoreProfitable(Candidate, ChosenFactor))
ChosenFactor = Candidate;
}
}
if (!EnableCondStoresVectorization && CM.hasPredStores()) {
reportVectorizationFailure(
"There are conditional stores.",
"store that is conditionally executed prevents vectorization",
"ConditionalStore", ORE, OrigLoop);
ChosenFactor = ScalarCost;
}
LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
!isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
return ChosenFactor;
}
#endif
bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
ElementCount VF) const { … }
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
const ElementCount VF) const { … }
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
const ElementCount MainLoopVF, unsigned IC) { … }
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() { … }
void LoopVectorizationCostModel::collectElementTypesForWidening() { … }
unsigned
LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
InstructionCost LoopCost) { … }
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { … }
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
ElementCount VF) { … }
void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { … }
InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { … }
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { … }
static const SCEV *getAddressAccessSCEV(
Value *Ptr,
LoopVectorizationLegality *Legal,
PredicatedScalarEvolution &PSE,
const Loop *TheLoop) { … }
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
ElementCount VF) { … }
InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
ElementCount VF) { … }
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) { … }
InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
ElementCount VF) { … }
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
ElementCount VF) { … }
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(
Instruction *I, ElementCount VF, Type *Ty,
TTI::TargetCostKind CostKind) const { … }
InstructionCost
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
ElementCount VF) { … }
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { … }
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { … }
void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { … }
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
ElementCount VF) { … }
void LoopVectorizationCostModel::collectValuesToIgnore() { … }
void LoopVectorizationCostModel::collectInLoopReductions() { … }
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
LoopVectorizationCostModel &CM) { … }
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { … }
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { … }
InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
ElementCount VF) const { … }
bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { … }
InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
VPCostContext &CostCtx) const { … }
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const { … }
#ifndef NDEBUG
static bool planContainsAdditionalSimplifications(VPlan &Plan,
VPCostContext &CostCtx,
Loop *TheLoop) {
auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
return &WidenMem->getIngredient();
return nullptr;
};
DenseSet<Instruction *> SeenInstrs;
auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (VPRecipeBase &R : *VPBB) {
if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
auto *IG = IR->getInterleaveGroup();
unsigned NumMembers = IG->getNumMembers();
for (unsigned I = 0; I != NumMembers; ++I) {
if (Instruction *M = IG->getMember(I))
SeenInstrs.insert(M);
}
continue;
}
if (Instruction *UI = GetInstructionForCost(&R))
SeenInstrs.insert(UI);
}
}
return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
TheLoop](BasicBlock *BB) {
return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
return false;
return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
});
});
}
#endif
VectorizationFactor LoopVectorizationPlanner::computeBestVF() { … }
static void addRuntimeUnrollDisableMetaData(Loop *L) { … }
static void createAndCollectMergePhiForReduction(
VPInstruction *RedResult,
VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
bool VectorizingEpilogue) { … }
DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { … }
std::pair<BasicBlock *, Value *>
EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) { … }
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { … }
void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { … }
BasicBlock *
EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
bool ForEpilogue) { … }
std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) { … }
BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
BasicBlock *Bypass, BasicBlock *Insert) { … }
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { … }
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { … }
iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
VPRecipeBuilder::mapToVPValues(User::op_range Operands) { … }
void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) { … }
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { … }
VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { … }
void VPRecipeBuilder::createHeaderMask() { … }
VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { … }
void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { … }
VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VFRange &Range) { … }
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
VPValue *Start, const InductionDescriptor &IndDesc,
VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) { … }
VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) { … }
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { … }
VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
ArrayRef<VPValue *> Operands) { … }
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ArrayRef<VPValue *> Operands,
VFRange &Range) { … }
bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { … }
VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
ArrayRef<VPValue *> Operands,
VPBasicBlock *VPBB) { … }
VPHistogramRecipe *
VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
ArrayRef<VPValue *> Operands) { … }
void VPRecipeBuilder::fixHeaderPhis() { … }
VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
VFRange &Range) { … }
VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
ArrayRef<VPValue *> Operands,
VFRange &Range, VPBasicBlock *VPBB) { … }
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) { … }
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
DebugLoc DL) { … }
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) { … }
static void
addUsersInExitBlock(VPlan &Plan,
const SetVector<VPIRInstruction *> &ExitUsersToFix) { … }
static void addLiveOutsForFirstOrderRecurrences(
VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) { … }
VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { … }
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { … }
void LoopVectorizationPlanner::adjustRecipesForReductions(
VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { … }
void VPDerivedIVRecipe::execute(VPTransformState &State) { … }
void VPReplicateRecipe::execute(VPTransformState &State) { … }
static ScalarEpilogueLowering getScalarEpilogueLowering(
Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { … }
static bool processLoopInVPlanNativePath(
Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
LoopVectorizationRequirements &Requirements) { … }
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { … }
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
VectorizationFactor &VF,
std::optional<unsigned> VScale, Loop *L,
PredicatedScalarEvolution &PSE,
ScalarEpilogueLowering SEL) { … }
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
: … { … }
bool LoopVectorizePass::processLoop(Loop *L) { … }
LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) { … }
PreservedAnalyses LoopVectorizePass::run(Function &F,
FunctionAnalysisManager &AM) { … }
void LoopVectorizePass::printPipeline(
raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { … }