llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
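//
// As an illustrative sketch (pseudocode, not part of the pass itself): with
// VF=4, a scalar loop such as
//   for (i = 0; i < n; ++i)
//     C[i] = A[i] + B[i];
// is conceptually rewritten so that each wide iteration processes four
// elements at once:
//   for (i = 0; i + 4 <= n; i += 4)
//     C[i:i+3] = A[i:i+3] + B[i:i+3];
// with the remaining n % 4 iterations handled by a scalar epilogue loop.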
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and the enum below lists the
// options. I.e., the vectorizer will try to fold the tail loop (epilogue)
// into the vector body and predicate the instructions accordingly. If
// tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                         "scalar-epilogue",
                         "Don't tail-predicate loops, create scalar epilogue"),
              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                         "predicate-else-scalar-epilogue",
                         "prefer tail-folding, create scalar epilogue if tail "
                         "folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefers tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));

static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
                   "data-and-control-without-rt-check",
                   "Similar to data-and-control, but remove the runtime check"),
        clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
                   "Use predicated EVL instructions for tail folding. If EVL "
                   "is unsupported, fallback to data-without-lane-mask.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
} // namespace llvm

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {}
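
// For example, a <N x i24> element is irregular on typical targets: i24 has a
// store size of 3 bytes but an allocation size of 4 bytes, so the padding
// makes simple widening of consecutive accesses unsound.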

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned>
getSmallBestKnownTC(ScalarEvolution &SE, Loop *L,
                    bool CanUseConstantMax = true) {}

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found.
class InnerLoopVectorizer {};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the VPlan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {};
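
// For illustration of the two-step scheme above: with a main-loop VF of 8, an
// epilogue VF of 4, and a trip count of 13, the main vector loop executes one
// iteration (8 elements), the vectorized epilogue executes one iteration
// (4 elements), and the remaining element runs in the final scalar loop.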

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
/// RemarkName is the identifier for the remark.  If \p I is passed it is an
/// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
/// the location of the remark. If \p DL is passed, use it as debug location for
/// the remark. \return the remark object that can be streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
                 Instruction *I, DebugLoc DL = {}) {}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {}

/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop, Instruction *I = nullptr,
                                    DebugLoc DL = {}) {}

/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {}

} // end namespace llvm

namespace llvm {

// Loop vectorization cost-model hints about how the scalar epilogue loop
// should be lowered.
enum ScalarEpilogueLowering {};

using InstructionVFPair = std::pair<Instruction *, ElementCount>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {};
} // end namespace llvm

namespace {
/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {};
} // namespace

static bool useActiveLaneMask(TailFoldingStyle Style) {}

static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {}

// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {}
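
// For illustration, an outer loop annotated as follows (hypothetical user
// code) carries explicit vectorization hints, including the required vector
// length, and is therefore a candidate for the check above:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < n; ++i)
//     for (int j = 0; j < m; ++j)
//       A[i][j] = B[i][j] + C[i][j];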

static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {}

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
static Value *
emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                     Value *Step,
                     InductionDescriptor::InductionKind InductionKind,
                     const BinaryOperator *InductionBinOp) {}
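
// For example, for an integer induction with StartValue 3 and StepValue 2,
// the transformed value at Index 4 is 3 + 4 * 2 = 11; for a pointer
// induction, it would instead be &StartValue[4 * 2].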

std::optional<unsigned> getMaxVScale(const Function &F,
                                     const TargetTransformInfo &TTI) {}

/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If not,
/// then we know a runtime overflow check always evaluates to false and can be
/// removed.
static bool isIndvarOverflowCheckKnownFalse(
    const LoopVectorizationCostModel *Cost,
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {}

// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {}
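
// For illustration, in (pseudocode)
//   for (i = 0; i < n; i += 2)
//     Sum += A[i]; // A[i+1] is never read
// the access to A[i] belongs to an interleave group with factor 2 that has a
// gap at member 1; widening the group loads both lanes, so the unused lane
// must be masked away to avoid touching memory the scalar loop never reads.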

void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
                                               VPReplicateRecipe *RepRecipe,
                                               const VPLane &Lane,
                                               VPTransformState &State) {}

Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {}

void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {}

BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {}

BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {}

void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {}

PHINode *InnerLoopVectorizer::createInductionResumeValue(
    PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
    ArrayRef<BasicBlock *> BypassBlocks,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {}

/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
/// expansion results.
static Value *getExpandedStep(const InductionDescriptor &ID,
                              const SCEV2ValueTy &ExpandedSCEVs) {}

void InnerLoopVectorizer::createInductionResumeValues(
    const SCEV2ValueTy &ExpandedSCEVs,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {}

std::pair<BasicBlock *, Value *>
InnerLoopVectorizer::createVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *VectorTripCount, Value *EndValue,
                                       BasicBlock *MiddleBlock, VPlan &Plan,
                                       VPTransformState &State) {}
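
// For illustration, given a post-loop use of the induction variable such as
//   for (i = 0; i < n; ++i) { ... }
//   last = i;
// the LCSSA phi feeding 'last' initially only receives the value coming from
// the remainder loop; when control instead reaches the exit via the middle
// block, the phi must also receive the end value computed from the vector
// trip count.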

namespace {

struct CSEDenseMapInfo {};

} // end anonymous namespace

/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {}

InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                              ElementCount VF) const {}

static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {}

InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {}

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                            VPlan &Plan) {}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {}

void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
                                              VPTransformState &State) {}

void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {}

bool LoopVectorizationCostModel::isScalarWithPredication(
    Instruction *I, ElementCount VF) const {}

// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {}

std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                    ElementCount VF) const {}

bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) const {}

bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
    Instruction *I, ElementCount VF) {}

void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {}

bool LoopVectorizationCostModel::runtimeChecksRequired() {}

bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {}

ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {}

FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {}

FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {}

ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
    ElementCount MaxSafeVF, bool FoldTailByMasking) {}

/// Convenience function that returns the value of vscale_range if
/// vscale_range.min == vscale_range.max, and otherwise returns the value
/// returned by the corresponding TTI method.
static std::optional<unsigned>
getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {}

bool LoopVectorizationPlanner::isMoreProfitable(
    const VectorizationFactor &A, const VectorizationFactor &B) const {}

void LoopVectorizationPlanner::emitInvalidCostRemarks(
    OptimizationRemarkEmitter *ORE) {}

/// Check if any recipe of \p Plan will generate a vector value, which will be
/// assigned a vector register.
static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
                                const TargetTransformInfo &TTI) {}

#ifndef NDEBUG
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
  InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(any_of(VPlans,
                [](std::unique_ptr<VPlan> &P) {
                  return P->hasVF(ElementCount::getFixed(1));
                }) &&
         "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
                                       ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization &&
      (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    for (ElementCount VF : P->vectorFactors()) {
      // The cost for scalar VF=1 is already calculated, so ignore it.
      if (VF.isScalar())
        continue;

      InstructionCost C = CM.expectedCost(VF);
      VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);

      unsigned AssumedMinimumVscale =
          getVScaleForTuning(OrigLoop, TTI).value_or(1);
      unsigned Width =
          Candidate.Width.isScalable()
              ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
              : Candidate.Width.getFixedValue();
      LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
                        << " costs: " << (Candidate.Cost / Width));
      if (VF.isScalable())
        LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                          << AssumedMinimumVscale << ")");
      LLVM_DEBUG(dbgs() << ".\n");

      if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }

      if (isMoreProfitable(Candidate, ChosenFactor))
        ChosenFactor = Candidate;
    }
  }

  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
    reportVectorizationFailure(
        "There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, OrigLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}
#endif

bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
    ElementCount VF) const {}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {}

VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, unsigned IC) {}

std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {}

void LoopVectorizationCostModel::collectElementTypesForWidening() {}

unsigned
LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                  InstructionCost LoopCost) {}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
                                                           ElementCount VF) {}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {}

InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {}

InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {}

/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
static const SCEV *getAddressAccessSCEV(
              Value *Ptr,
              LoopVectorizationLegality *Legal,
              PredicatedScalarEvolution &PSE,
              const Loop *TheLoop) {}

InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {}

InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                    ElementCount VF) {}

InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                ElementCount VF) {}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {}

std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty,
    TTI::TargetCostKind CostKind) const {}

InstructionCost
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                     ElementCount VF) {}

InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
    Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {}

void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {}

void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {}

void LoopVectorizationCostModel::collectValuesToIgnore() {}

void LoopVectorizationCostModel::collectInLoopReductions() {}

// This function will select a scalable VF if the target supports scalable
// vectors and a fixed one otherwise.
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
                                     LoopVectorizationCostModel &CM) {}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {}

void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {}

InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
                                             ElementCount VF) const {}

bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {}

InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                          VPCostContext &CostCtx) const {}

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                               ElementCount VF) const {}

#ifndef NDEBUG
/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplifications that
/// the legacy cost-model did not account for.
static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                                  VPCostContext &CostCtx,
                                                  Loop *TheLoop) {
  // First collect all instructions for the recipes in Plan.
  auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
    if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
      return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
    if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
      return &WidenMem->getIngredient();
    return nullptr;
  };

  DenseSet<Instruction *> SeenInstrs;
  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &R : *VPBB) {
      if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
        auto *IG = IR->getInterleaveGroup();
        unsigned NumMembers = IG->getNumMembers();
        for (unsigned I = 0; I != NumMembers; ++I) {
          if (Instruction *M = IG->getMember(I))
            SeenInstrs.insert(M);
        }
        continue;
      }
      if (Instruction *UI = GetInstructionForCost(&R))
        SeenInstrs.insert(UI);
    }
  }

  // Return true if the loop contains any instructions that are not also part of
  // the VPlan or are skipped for VPlan-based cost computations. This indicates
  // that the VPlan contains extra simplifications.
  return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                    TheLoop](BasicBlock *BB) {
    return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
      if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
        return false;
      return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
    });
  });
}
#endif

VectorizationFactor LoopVectorizationPlanner::computeBestVF() {}

static void addRuntimeUnrollDisableMetaData(Loop *L) {}

// Check if \p RedResult is a ComputeReductionResult instruction, and if it
// is, create a merge phi node for it.
static void createAndCollectMergePhiForReduction(
    VPInstruction *RedResult,
    VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
    bool VectorizingEpilogue) {}

DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
    ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
    InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
    const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {}

//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
std::pair<BasicBlock *, Value *>
EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {}

void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {}

void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {}

BasicBlock *
EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
                                                    bool ForEpilogue) {}

//===--------------------------------------------------------------------===//
// EpilogueVectorizerEpilogueLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {}

void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {}

void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {}

iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
VPRecipeBuilder::mapToVPValues(User::op_range Operands) {}

void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {}

VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {}

VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {}

void VPRecipeBuilder::createHeaderMask() {}

VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {}

void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {}

VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
                                  VFRange &Range) {}

/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe.
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
                            VPValue *Start, const InductionDescriptor &IndDesc,
                            VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {}

VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
    PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {}

VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {}

VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
                                           ArrayRef<VPValue *> Operands) {}

VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) {}

bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {}

VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                           ArrayRef<VPValue *> Operands,
                                           VPBasicBlock *VPBB) {}

VPHistogramRecipe *
VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
                                     ArrayRef<VPValue *> Operands) {}

void VPRecipeBuilder::fixHeaderPhis() {}

VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
                                                      VFRange &Range) {}

VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPBasicBlock *VPBB) {}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {}

// Add the necessary canonical IV and branch recipes required to control the
// loop.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
                                  DebugLoc DL) {}

// Collect VPIRInstructions for phis in the original exit block that are modeled
// in VPlan and add the exiting VPValue as operand. Some exiting values are not
// modeled explicitly yet and won't be included. Those are un-truncated
// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
// increments.
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
    Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
    const MapVector<PHINode *, InductionDescriptor> &Inductions) {}

// Add exit values to \p Plan. Extracts are added for each entry in \p
// ExitUsersToFix if needed and their operands are updated.
static void
addUsersInExitBlock(VPlan &Plan,
                    const SetVector<VPIRInstruction *> &ExitUsersToFix) {}

/// Handle live-outs for first-order recurrences, both in the scalar preheader
/// and the original exit block:
/// 1. Feed a resume value for every FOR from the vector loop to the scalar
///    loop, if middle block branches to scalar preheader, by introducing
///    ExtractFromEnd and ResumePhi recipes in each, respectively, and a
///    VPLiveOut which uses the latter and corresponds to the scalar header.
/// 2. Feed the penultimate value of recurrences to their LCSSA phi users in
///    the original exit block using a VPLiveOut.
static void addLiveOutsForFirstOrderRecurrences(
    VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {}
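
// For illustration, a first-order recurrence uses a value from the previous
// iteration, e.g. (pseudocode):
//   for (i = 1; i < n; ++i)
//     B[i] = A[i] + A[i - 1];
// Here the value of A[i - 1] produced by the previous iteration must be fed
// both to the scalar loop as a resume value and to any LCSSA user in the
// exit block as the penultimate value, as described above.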

VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi need to be converted
// to reductions, with one operand being vector and the other being the scalar
// reduction chain. For other reductions, a select is introduced between the phi
// and live-out recipes when folding the tail.
//
// A ComputeReductionResult recipe is added to the middle block, also for
// in-loop reductions which compute their result in-loop, because generating
// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
//
// Adjust AnyOf reductions; replace the reduction phi for the selected value
// with a boolean reduction phi node to check if the condition is true in any
// iteration. The final value is selected by the final ComputeReductionResult.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {}
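
// For illustration, for an in-loop reduction such as (pseudocode)
//   for (i = 0; i < n; ++i)
//     Sum += A[i];
// the add feeding the reduction phi is converted into a reduction recipe that
// folds the vector of loaded values into the scalar chain on each iteration,
// and a ComputeReductionResult in the middle block produces the final value
// consumed by the bc.merge.rdx phi.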

void VPDerivedIVRecipe::execute(VPTransformState &State) {}

void VPReplicateRecipe::execute(VPTransformState &State) {}

// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point, there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {}

static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                       VectorizationFactor &VF,
                                       std::optional<unsigned> VScale, Loop *L,
                                       ScalarEvolution &SE,
                                       ScalarEpilogueLowering SEL) {}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    :{}

bool LoopVectorizePass::processLoop(Loop *L) {}

LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {}